In [32]:
import pandas as pd
# Load the exported record collection that the rest of the notebook filters.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which suggests
# the CSV carries a previously saved index column -- confirm whether it should be
# read with index_col=0 instead of being kept as data.
collection = pd.read_csv('google_scholar_683.csv')

def add_criteria_columns(prefix, frame=None, count=5):
    """Add boolean selection-criteria columns, all initialised to False.

    Creates (or resets) the columns ``<prefix>1`` ... ``<prefix><count>`` in place.

    Parameters
    ----------
    prefix : str
        Column-name prefix; the notebook uses 'i' for inclusion criteria and
        'e' for exclusion criteria.
    frame : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``collection`` frame
        is used, which is the original behaviour.
    count : int, default 5
        Number of numbered columns to add.

    Returns
    -------
    None
        The target frame is modified in place.
    """
    # Only fall back to the global when no frame was passed, so the function
    # can also be applied to any other DataFrame without touching `collection`.
    target = collection if frame is None else frame
    for number in range(1, count + 1):
        target[prefix + str(number)] = False

# Register the selection-criteria flag columns on the collection:
# prefix 'i' marks inclusion criteria, prefix 'e' marks exclusion criteria.
for criteria_prefix in ('i', 'e'):
    add_criteria_columns(criteria_prefix)

# Display the collection with the new criteria columns.
collection

Unnamed: 0,Key,Item Type,Publication Year,Author,Title,Publication Title,ISBN,ISSN,DOI,Url,...,i1,i2,i3,i4,i5,e1,e2,e3,e4,e5
0,6BH5WM7V,journalArticle,2017.0,"Du, Zhihui; He, Ligang; Chen, Yinong; Xiao, Yu...",Robot Cloud: Bridging the power of robotics an...,Future Generation Computer Systems,,,,,...,False,False,False,False,False,False,False,False,False,False
1,CAV3PKJB,journalArticle,2007.0,"Han, Jun; Asada, Akira; Ura, Tamaki; Yamauchi,...",Noncontact power supply for seafloor geodetic ...,Journal of marine science and technology,,,,,...,False,False,False,False,False,False,False,False,False,False
2,LCA5ZRZQ,conferencePaper,2010.0,"Wang, Binhai; Chen, Xiguang; Wang, Qian; Liu, ...",Power line inspection with a flying robot,2010 1st International Conference on Applied R...,,,,,...,False,False,False,False,False,False,False,False,False,False
3,8YQZGJKS,journalArticle,2005.0,"CAI, Gai-pin; HUANG, Zhi-qing",The Realization of the Industrial Revolving Tu...,Machine Tool & Hydraulics,,,,,...,False,False,False,False,False,False,False,False,False,False
4,GK66MU6T,conferencePaper,2006.0,"Mei, Yongguo; Lu, Yung-Hsiang; Lee, CS George;...",Energy-efficient mobile robot exploration,Proceedings 2006 IEEE International Conference...,,,,,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
678,XF5HJDYH,journalArticle,2018.0,"Abadi, Vida Shams Esfand; Rostami, Mostafa; Ra...",Walking Path Prevision of Biped Robot along wi...,Modares Mechanical Engineering,,,,,...,False,False,False,False,False,False,False,False,False,False
679,8CEZTE8L,journalArticle,2019.0,"Almécija Murciano, Francisco Javier",Power management strategies for a mobile robot,,,,,,...,False,False,False,False,False,False,False,False,False,False
680,GXWNQ2F3,journalArticle,2001.0,"Krstulović, Ante",Robot Energy Efficiency Through Redundancy,Strojarstvo,,,,,...,False,False,False,False,False,False,False,False,False,False
681,YUR8N5QG,journalArticle,2011.0,王文俊,高效率電源管理之智慧型僕役機器人-總計畫: 高效率電源管理之智慧型僕役機器人; Intell...,財團法人國家實驗研究院科技政策研究與資訊中心,,,,,...,False,False,False,False,False,False,False,False,False,False


In [83]:
# Filter records based on types that are to be considered (peer reviewed by nature).
def filter_types(col):
    """Keep only records whose 'Item Type' is an accepted publication type.

    A record is kept when its 'Item Type' string contains one of
    "journalArticle", "conferencePaper", "bookSection" or "book"
    (substring match). Every kept record gets its 'i4' column set to True.

    Parameters
    ----------
    col : pandas.DataFrame
        Records with at least an 'Item Type' column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the kept rows in their original order and a fresh
        0..n-1 index. ``col`` itself is not modified.
    """
    accepted_types = ("journalArticle", "conferencePaper", "bookSection", "book")

    def is_accepted(item_type):
        # Missing values arrive as float NaN, which does not support `in`;
        # treat anything that is not a string as "not accepted".
        return isinstance(item_type, str) and any(t in item_type for t in accepted_types)

    # Build one boolean mask and select once, instead of appending row by row
    # (which is quadratic and turns every column into object dtype).
    mask = col['Item Type'].map(is_accepted).astype(bool)
    df = col.loc[mask].copy()
    df['i4'] = True
    return df.reset_index(drop=True)

# Filter duplicates but keep highest priority variant.
def filter_duplicates(col):
    """Remove records with duplicate titles, keeping the best variant of each.

    Two records are duplicates when their 'Title' values are exactly equal.
    Among duplicates, the record with the highest-priority 'Item Type' wins:
    journalArticle > conferencePaper > book > bookSection > anything else.
    On equal priority the first occurrence is kept.

    Parameters
    ----------
    col : pandas.DataFrame
        Records with at least 'Title' and 'Item Type' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the surviving rows in their original relative order
        and a fresh 0..n-1 index. ``col`` itself is not modified.
    """
    # Priority per item type (lower = better).
    type_priority = {
        "journalArticle": 0,
        "conferencePaper": 1,
        "book": 2,
        "bookSection": 3,
    }
    # Unrecognised types rank below every known type, so the comparison below
    # never mixes str and int.
    unknown_priority = len(type_priority)

    best_by_title = {}       # title -> (row position, priority) of the current winner
    untitled_positions = []  # rows without a title cannot be matched; keep them all

    for position, (title, item_type) in enumerate(zip(col['Title'], col['Item Type'])):
        if pd.isna(title):
            untitled_positions.append(position)
            continue

        priority = type_priority.get(item_type, unknown_priority)
        stored = best_by_title.get(title)
        # Strictly better only: on a tie the earlier record stays.
        if stored is None or priority < stored[1]:
            best_by_title[title] = (position, priority)

    # Select the winners once at the end. This actually removes the replaced
    # duplicates and avoids row-label bookkeeping while the frame is changing.
    keep = sorted(untitled_positions + [position for position, _ in best_by_title.values()])
    return col.iloc[keep].reset_index(drop=True)

# Purely for testing, not used at all and should not be used on the collection.
# Only to get an indication of the amount of papers that are more explicit in the title.
def filter_titles(col):
    """Keep only records whose title contains one of a fixed set of keywords.

    The keywords are "efficiency", "consumption", "optimisation",
    "optimization", "modeling" and "analysis". Matching is a case-sensitive
    substring test, so a capitalised "Efficiency" does not match.

    Parameters
    ----------
    col : pandas.DataFrame
        Records with at least a 'Title' column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the matching rows in their original order and a
        fresh 0..n-1 index. ``col`` itself is not modified.
    """
    keywords = ("efficiency", "consumption", "optimisation",
                "optimization", "modeling", "analysis")

    def mentions_keyword(title):
        # Missing titles arrive as float NaN, which does not support `in`;
        # they simply do not match.
        return isinstance(title, str) and any(keyword in title for keyword in keywords)

    # One boolean mask and a single selection instead of appending row by row.
    mask = col['Title'].map(mentions_keyword).astype(bool)
    return col.loc[mask].reset_index(drop=True)

In [51]:
# Filter the collection down to the 'peer reviewed by nature' item types, as discussed
# in email contact. filter_types also sets i4 = True on every record it keeps.
col_types = filter_types(collection)

In [85]:
# Filter duplicates, keeping the highest-priority variant of each title, and export
# the result to CSV (final record set).
col_dups = filter_duplicates(col_types)
# NOTE(review): to_csv writes the DataFrame index as an extra unnamed first column by
# default; pass index=False if that column is not wanted in the exported file.
col_dups.to_csv("filtered_types_duplicates.csv")