In [1]:
import pandas as pd
# Load the exported record collection; every later cell filters this frame.
# NOTE(review): the displayed frame contains an "Unnamed: 0" column, which suggests the
# CSV was written together with its index — confirm whether `index_col=0` is wanted here.
collection = pd.read_csv('forward_snowballing_1.csv')

def add_criteria_columns(prefix):
    """Add five criterion columns, ``<prefix>1`` .. ``<prefix>5``, to ``collection``.

    Each column is filled with ``False`` for every record. The module-level
    ``collection`` frame is modified in place; nothing is returned.
    """
    column_names = [prefix + str(number) for number in range(1, 6)]
    for name in column_names:
        collection[name] = False

# Add the selection criteria as columns: 'i' = inclusion, 'e' = exclusion.
for criteria_prefix in ('i', 'e'):
    add_criteria_columns(criteria_prefix)

collection

Unnamed: 0,Key,Item Type,Publication Year,Author,Title,Publication Title,ISBN,ISSN,DOI,Url,...,i1,i2,i3,i4,i5,e1,e2,e3,e4,e5
0,4JMC2M4R,report,1993,"Stentz, Anthony",Optimal and efficient path planning for unknow...,,,,,,...,False,False,False,False,False,False,False,False,False,False
1,IRE58EQR,bookSection,2015,"Nakajima, Shin",Model checking of energy consumption behavior,Complex Systems Design & Management Asia,,,,,...,False,False,False,False,False,False,False,False,False,False
2,GRSUW8Q5,conferencePaper,2008,"Senn, Eric; Laurent, Johann; Juin, Emmanuel; D...",Refining power consumption estimations in the ...,"2008 Forum on Specification, Verification and ...",,,,,...,False,False,False,False,False,False,False,False,False,False
3,VZMHZS6R,conferencePaper,2015,"Gang, Hou; Yinfeng, Ban; Kuanjiu, Zhou; Jie, W...",Energy consumption analysis method of CPS soft...,2015 Ninth International Conference on Frontie...,,,,,...,False,False,False,False,False,False,False,False,False,False
4,84KD7GKI,conferencePaper,2015,"Marinescu, Raluca; Enoiu, Eduard Paul; Secelea...",Statistical analysis of resource usage of embe...,2015 IEEE Computer Society Annual Symposium on...,,,,,...,False,False,False,False,False,False,False,False,False,False
5,4WV7ULB7,conferencePaper,2013,"Kang, Eun-Young; Perrouin, Gilles; Schobbens, ...",Model-based verification of energy-aware real-...,2013 18th International Conference on Engineer...,,,,,...,False,False,False,False,False,False,False,False,False,False
6,Q9P5F36T,conferencePaper,2015,"Gang, Hou; Yinfeng, Ban; Kuanjiu, Zhou; Jie, W...",Energy consumption analysis method of CPS soft...,2015 Ninth International Conference on Frontie...,,,,,...,False,False,False,False,False,False,False,False,False,False
7,I3KCTRA9,journalArticle,2000,"Hwang, Chi-Hong; Wu, Allen C.-H.",A predictive system shutdown method for energy...,ACM Transactions on Design Automation of Elect...,,,,,...,False,False,False,False,False,False,False,False,False,False
8,FLFTW8XS,conferencePaper,2001,"Liu, Jinfeng; Chou, Pai H.; Bagherzadeh, Nader...",Power-aware scheduling under timing constraint...,Proceedings of the 38th annual Design Automati...,,,,,...,False,False,False,False,False,False,False,False,False,False
9,RCZKB66J,conferencePaper,1999,"Simunic, Tajana; Benini, Luca; De Micheli, Gio...",Cycle-accurate simulation of energy consumptio...,Proceedings 1999 Design Automation Conference ...,,,,,...,False,False,False,False,False,False,False,False,False,False


In [2]:
# Filter records based on types that are to be considered (peer reviewed by nature).
def filter_types(col):
    """Keep only the records whose item type is peer reviewed by nature.

    A record is kept when its 'Item Type' value contains "journalArticle",
    "conferencePaper", "bookSection" or "book" as a substring (note that
    "book" on its own already matches "bookSection"). Every kept record gets
    inclusion criterion ``i4`` set to True.

    Records whose item type is missing or not a string are dropped instead of
    raising ``TypeError``.

    Parameters
    ----------
    col : pandas.DataFrame
        Record collection with an 'Item Type' column.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the kept records in their original order, indexed
        0..n-1. ``col`` itself is not modified.
    """
    accepted_types = ("journalArticle", "conferencePaper", "bookSection", "book")

    def is_accepted(item_type):
        # Missing values arrive as NaN/None; only real strings can match.
        return isinstance(item_type, str) and any(name in item_type for name in accepted_types)

    keep = col['Item Type'].map(is_accepted).astype(bool)

    # One positional boolean selection instead of appending row by row.
    return col.loc[keep.to_numpy()].assign(i4=True).reset_index(drop=True)

# Filter duplicates but keep highest priority variant.
def filter_duplicates(col):
    """Remove records with a duplicate title, keeping the highest-priority variant.

    Priority is decided by 'Item Type' (lower rank wins): journalArticle (0),
    conferencePaper (1), book (2), bookSection (3). Any other item type ranks
    below all of these. When two records with the same title have the same
    rank, the one that appears first is kept.

    Records without a title cannot be compared and are therefore all kept.

    Parameters
    ----------
    col : pandas.DataFrame
        Record collection with 'Title' and 'Item Type' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly one record per title. The kept records stay
        in their original relative order and are indexed 0..n-1. ``col`` is
        not modified.
    """
    priorities = {"journalArticle": 0, "conferencePaper": 1, "book": 2, "bookSection": 3}
    lowest_priority = len(priorities)  # rank for item types not listed above

    best_by_title = {}   # title -> (rank, row position) of the best record so far
    untitled = []        # row positions of records that have no title

    titles_and_types = zip(col['Title'], col['Item Type'])
    for position, (title, item_type) in enumerate(titles_and_types):
        if pd.isna(title):
            untitled.append(position)
            continue

        rank = priorities.get(item_type, lowest_priority)
        stored = best_by_title.get(title)
        # Strictly better only, so the first of equally ranked duplicates wins.
        if stored is None or rank < stored[0]:
            best_by_title[title] = (rank, position)

    # Selecting by position in one go avoids the earlier pitfall where
    # DataFrame.drop returned a new frame that was thrown away, leaving the
    # lower-priority duplicate in the result.
    keep = sorted(untitled + [position for _, position in best_by_title.values()])
    return col.iloc[keep].reset_index(drop=True)

# Purely for testing, not used at all and should not be used on the collection.
# Only to get an indication of the amount of papers that are more explicit in the title.
def filter_titles(col):
    """Keep only the records whose title mentions one of a few explicit keywords.

    The keywords are "efficiency", "consumption", "optimisation",
    "optimization", "modeling" and "analysis". Matching is a case-sensitive
    substring test on the 'Title' column. Records with a missing or
    non-string title are skipped instead of raising ``TypeError``.

    Parameters
    ----------
    col : pandas.DataFrame
        Record collection with a 'Title' column.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the matching records in their original order,
        indexed 0..n-1. ``col`` is not modified.
    """
    keywords = ("efficiency", "consumption", "optimisation",
                "optimization", "modeling", "analysis")

    def mentions_keyword(title):
        # Missing titles arrive as NaN/None; only real strings can match.
        return isinstance(title, str) and any(word in title for word in keywords)

    keep = col['Title'].map(mentions_keyword).astype(bool)

    # One positional boolean selection instead of appending row by row.
    return col.loc[keep.to_numpy()].reset_index(drop=True)

In [3]:
# Filter the collection down to the item types that are peer reviewed by nature, as discussed
# in email contact. filter_types also sets inclusion criterion i4 to True on every record it keeps.
col_types = filter_types(collection)

In [4]:
# Filter duplicates and keep highest priority ones, export to CSV (final record set).
# NOTE(review): the input file is 'forward_snowballing_1.csv' while this output is named
# 'backward_snowballing_2_filtered.csv' — confirm the forward/backward naming is intended.
# NOTE(review): to_csv writes the index by default, which adds an unnamed leading column to
# the file; pass index=False if that column is not wanted downstream.
col_dups = filter_duplicates(col_types)
col_dups.to_csv("backward_snowballing_2_filtered.csv")