In [1]:
import numpy as np
import pandas as pd
import time

In [2]:
inds = pd.read_csv('individuals-only.csv', 
                   dtype={'Agent_Nam F': str, 
                          'Sup_Opp_Cd': str,
                          'G_From_E_F': str,
                          'XRef_Sch Nm': str},
                  parse_dates=[3, 4, 5, 6])

In [3]:
inds.head()

Unnamed: 0,Filer_ID,Filer_Nam L,Committee_Type,Rpt_Date,From_Date,Thru_Date,Rpt_Elect_Date,Rec_Type,Form_Type,Tran_ID,...,Bal_Juris,Sup_Opp_Cd,Memo_Code,Memo_Ref No,Bak Ref_TID,G_From_E_F,XRef_Sch Nm,XRef_Match,Int_Rate,Int_Cmte Id
0,1398238.0,Shay Franco-Clausen for San Jose City Council ...,,2018-07-31 21:35:20,2018-06-04,2018-06-30,NaT,RCPT,A,RC4410,...,,,NO,,,,,,,
1,1398238.0,Shay Franco-Clausen for San Jose City Council ...,,2018-07-31 21:35:20,2018-06-04,2018-06-30,NaT,RCPT,A,RC4411,...,,,NO,,,,,,,
2,1398238.0,Shay Franco-Clausen for San Jose City Council ...,,2018-07-31 21:35:20,2018-06-04,2018-06-30,NaT,RCPT,A,RC4628,...,,,NO,,,,,,,
3,1398238.0,Shay Franco-Clausen for San Jose City Council ...,,2018-07-31 21:35:20,2018-06-04,2018-06-30,NaT,RCPT,A,RC4629,...,,,NO,,,,,,,
4,1398238.0,Shay Franco-Clausen for San Jose City Council ...,,2018-07-31 21:35:20,2018-06-04,2018-06-30,NaT,RCPT,A,RC4630,...,,,NO,,,,,,,


In [4]:
## Pull out schedules A and E only

def extract_schedules(list_of_schedules, df):
    new_df = df.loc[df['Form_Type'].isin(list_of_schedules), ]
    return new_df    

In [5]:
inds_ae = extract_schedules(['A', 'E'], inds)

In [6]:
def mark_duplicates(df):
    
    ## Check there are no NaNs in dates
    if len(df.loc[df['From_Date'].isnull(), ]) != 0:
        return 'Some "From_Dates" are missing. Remove these to de-duplicate.'
    
    ## Add a remove indicator column
    df['to_remove'] = np.nan
    
    ## Create new df
    df_new = pd.DataFrame(columns=df.columns)
    
    ## Record all candidates
    candidates = np.asarray(inds_ae['Filer_Nam L'].value_counts().index)
    
    ## Do candidate by candidate
    for candidate in candidates:
        print('Processing candidate', candidate)
        cand_df = df.loc[df['Filer_Nam L'] == candidate, ]
        
        ## Find unique From_Dates
        from_dates = pd.to_datetime(cand_df['From_Date'].unique())
        
        ## Create most_recent_date dictionary
        most_recent_date = {}
        
        ## Fill dictionary with rpt_date to use for each from_date
        for from_date in from_dates:
            rpt_dates = pd.to_datetime(cand_df.loc[cand_df['From_Date'] == from_date, 'Rpt_Date'].unique())
            max_date = rpt_dates.max()
            most_recent_date[from_date] = max_date
    
        ## Mark each row
        for row in cand_df.index:
            
            ## Mark row as keep (0) or remove (1)
            if most_recent_date[cand_df.loc[row, 'From_Date']] == cand_df.loc[row, 'Rpt_Date']:
                cand_df.loc[row, 'to_remove'] = 0
            else:
                cand_df.loc[row, 'to_remove'] = 1
                
        ## Add candidate df to new df
        df_new = pd.concat([df_new, cand_df], axis=0)
        
    return df_new
        

In [7]:
start_time = time.process_time()
inds_marked = mark_duplicates(inds_ae)
print('Time in seconds:', (time.process_time() - start_time))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Processing candidate Neighbors for Sam Liccardo for San Jose Mayor 2018
Processing candidate Kalen Gallagher for San Jose City Council District 9 2018
Processing candidate Pam Foley for San Jose City Council D9 2018
Processing candidate Shay Franco-Clausen for San Jose City Council District 9 2018
Processing candidate Tam Nguyen for San Jose Council 2018
Processing candidate Maya Esparza for San Jose City Council D7 2018
Processing candidate Van Le for City Council 2018 District 7
Processing candidate Thomas Duong for City Council D7 2018
Processing candidate Raul Peralez for SJ City Council 2018 D3
Processing candidate Re-Elect Magdalena Carrasco for City Council D5 2018
Processing candidate Chappie Jones for City Council 2018 D1
Processing candidate Omar Vasquez for San Jose City Council 2018 D7
Processing candidate Sabuhi Siddique for Council District 9 2018
Processing candidate Rosie Zepeda for San Jose City Council District 9 2018
Processing candidate Neighbors for Jonathan Flemin

In [8]:
inds_marked['to_remove'].sum()

4586.0

In [9]:
def remove_duplicates(df):
    df_new = df.loc[df['to_remove'] == 0, ]
    return df_new

In [10]:
inds_final = remove_duplicates(inds_marked)

In [12]:
inds_final.to_csv('individuals_unduplicated.csv')