In [1]:
import pandas as pd
# Load the forward-snowballing record set; the cells below read and modify this frame.
# NOTE(review): the displayed output contains an 'Unnamed: 0' column, which suggests the
# CSV was saved together with its index — confirm whether index_col=0 is wanted here.
collection = pd.read_csv('forward_snowballing_1.csv')

def add_criteria_columns(prefix, frame=None, count=5):
    """Add boolean selection-criteria columns, all initialised to False.

    Creates (or resets) the columns ``<prefix>1`` .. ``<prefix><count>`` in place
    on the target frame.

    Args:
        prefix: Column-name prefix, e.g. 'i' or 'e'.
        frame: DataFrame to modify. When omitted, the notebook-level
            ``collection`` is used, so existing one-argument calls keep working.
        count: Number of criteria columns to create (default 5).
    """
    # Only fall back to the notebook global when no frame was passed explicitly.
    target = collection if frame is None else frame
    for number in range(1, count + 1):
        target[prefix + str(number)] = False

# Add the selection-criteria flag columns (i1..i5 and e1..e5), all initialised to False.
add_criteria_columns('i') # Inclusion
add_criteria_columns('e') # Exclusion

# Display the collection to check that the new columns are present.
collection

Unnamed: 0,Key,Item Type,Publication Year,Author,Title,Publication Title,ISBN,ISSN,DOI,Url,...,i1,i2,i3,i4,i5,e1,e2,e3,e4,e5
0,XENBJP9B,conferencePaper,2008,"Borgstrom, Per Henrik; Singh, Amarjeet; Jordan...",Energy based path planning for a novel cabled ...,2008 IEEE/RSJ International Conference on Inte...,,,,,...,False,False,False,False,False,False,False,False,False,False
1,5JTXRBSV,conferencePaper,2011,"Rassõlkin, Anton; Hõimoja, Hardi; Teemets, Raivo",Energy saving possibilities in the industrial ...,2011 7th International Conference-Workshop Com...,,,,,...,False,False,False,False,False,False,False,False,False,False
2,7Q37UI9S,journalArticle,2014,"Salan, Serge; Drumwright, Evan; Lin, King-Ip",Minimum-energy robotic exploration: A formulat...,"IEEE Transactions on Systems, Man, and Cyberne...",,,,,...,False,False,False,False,False,False,False,False,False,False
3,CWUMUWPQ,thesis,2007,"Mei, Yongguo",Energy-efficient mobile robots,,,,,,...,False,False,False,False,False,False,False,False,False,False
4,BW8CFLUP,journalArticle,2016,"Henkel, Christian; Bubeck, Alexander; Xu, Weil...",Energy efficient dynamic window approach for l...,IFAC-PapersOnLine,,,,,...,False,False,False,False,False,False,False,False,False,False
5,3QM8DTAQ,conferencePaper,2016,"Bartlett, Oliver; Gurau, Corina; Marchegiani, ...",Enabling intelligent energy management for rob...,2016 IEEE/RSJ International Conference on Inte...,,,,,...,False,False,False,False,False,False,False,False,False,False
6,S5K5ZCUY,conferencePaper,2016,"Rappaport, Micha",Energy-aware mobile robot exploration with ada...,Proceedings of ISR 2016: 47st International Sy...,,,,,...,False,False,False,False,False,False,False,False,False,False
7,NRT2IS7G,conferencePaper,2013,"Ogunniyi, Samuel; Tsoeu, Mohohlo S.",Q-learning based energy efficient path plannin...,24th Symposium of the Pattern Recognition Asso...,,,,,...,False,False,False,False,False,False,False,False,False,False
8,Z8IP73RF,journalArticle,2012,"Kottas, Apostolos Dimitrios",Energy-efficient designs and principles for mi...,,,,,,...,False,False,False,False,False,False,False,False,False,False
9,CXTI5UZ7,journalArticle,2019,"Wu, Lingying; Sugiyama, Ayumi; Sugawara, Toshi...",Energy-efficient strategies for multi-agent co...,Procedia Computer Science,,,,,...,False,False,False,False,False,False,False,False,False,False


In [2]:
# Filter records based on types that are to be considered (peer reviewed by nature).
def filter_types(col):
    """Keep only the records whose 'Item Type' is an accepted publication type.

    A record is kept when its 'Item Type' string contains one of
    "journalArticle", "conferencePaper", "bookSection" or "book" (substring
    match, case-sensitive). Every kept record gets its 'i4' column set to True.

    Args:
        col: DataFrame with at least an 'Item Type' column. It is not modified.

    Returns:
        A new DataFrame with the kept records in their original order and a
        fresh 0..n-1 index.
    """
    # "book" also matches "bookSection"; both are listed to keep the intent explicit.
    accepted = ("journalArticle", "conferencePaper", "bookSection", "book")

    def is_accepted(item_type):
        # A missing type arrives as NaN (a float); `in` would raise on it, so reject it.
        return isinstance(item_type, str) and any(name in item_type for name in accepted)

    # Build one boolean mask instead of appending matching rows one at a time.
    mask = col['Item Type'].map(is_accepted).astype(bool)
    kept = col.loc[mask.to_numpy()].copy()
    kept['i4'] = True
    return kept.reset_index(drop=True)

# Filter duplicates but keep highest priority variant.
def filter_duplicates(col):
    seen = list()
    df = pd.DataFrame(columns=col.columns)
    for index, row in col.iterrows():
        title = row['Title']
        itemType = row['Item Type']
        
        # Convert itemType from str to priority integer (lower = better)
        if itemType == "journalArticle":
            itemType = 0
        elif itemType == "conferencePaper":
            itemType = 1
        elif itemType == "book":
            itemType = 2
        elif itemType == "bookSection":
            itemType = 3
        
        # Create 3-tuple of important, necessary items.
        item = (index, title, itemType)
        
        if not item[1] in [t[1] for t in seen]:       # If title not yet seen, add it to DF and to seen
            df_index = len(df)
            df.loc[df_index] = row
            item = (df_index, item[1], item[2])       # Update index of item to new index in new DF
            seen.append(item)
        else:
            stored_item = [t for t in seen if t[1] == item[1]][0] # Retrieve stored_item which has title seen (dup)
            if stored_item[2] > item[2]:              # If current item has higher priority than stored item, update
                df.drop(stored_item[0])               # Drop original from DF
                df_index = len(df)                    # Index at which item will be placed
                df.loc[df_index] = row                # Place row at index
                item = (df_index, item[1], item[2])   # Update item index to new index in new DF
                seen[seen.index(stored_item)] = item  # Update seen by overwriting stored_item to current item
                
    return df

# Purely for testing, not used at all and should not be used on the collection.
# Only to get an indication of the amount of papers that are more explicit in the title.
def filter_titles(col):
    """Keep only the records whose 'Title' contains one of a fixed set of keywords.

    The match is a case-sensitive substring test against "efficiency",
    "consumption", "optimisation", "optimization", "modeling" and "analysis".

    Args:
        col: DataFrame with at least a 'Title' column. It is not modified.

    Returns:
        A new DataFrame with the matching records in their original order and
        a fresh 0..n-1 index.
    """
    keywords = (
        "efficiency",
        "consumption",
        "optimisation",
        "optimization",
        "modeling",
        "analysis",
    )

    def is_explicit(title):
        # A missing title arrives as NaN (a float); `in` would raise on it, so reject it.
        return isinstance(title, str) and any(word in title for word in keywords)

    # Build one boolean mask instead of appending matching rows one at a time.
    mask = col['Title'].map(is_explicit).astype(bool)
    return col.loc[mask.to_numpy()].reset_index(drop=True)

In [3]:
# Keep only the record types considered 'peer reviewed by nature', as discussed in email contact.
# filter_types also sets i4 to True on every record it keeps.
col_types = filter_types(collection)

In [4]:
# Filter duplicate titles, keeping the highest-priority item type, then export to CSV (final record set).
# NOTE(review): to_csv writes the index as an extra unnamed column by default, and the loaded
# data already shows an 'Unnamed: 0' column — confirm whether index=False is wanted here.
col_dups = filter_duplicates(col_types)
col_dups.to_csv("forward_snowballing_1_filtered.csv")