In [24]:
import re
import numpy as np
import pandas as pd
from thefuzz import fuzz
import matplotlib.pyplot as plt

In [134]:
import fresh_data.get_datasets
import importlib
importlib.reload(fresh_data.get_datasets) # reload get_datasets every time this cell is run
from fresh_data.get_datasets import *


## Representative Information:

In [135]:
polarization = load_polarization_data()
polarization[polarization["congress"]==116].head(3)

Unnamed: 0,representative,congress,chamber,icpsr,state_icpsr,district_code,state_abbrev,party_code,last_means,bioguide_id,...,died,nominate_dim1,nominate_dim2,nominate_log_likelihood,nominate_geo_mean_probability,nominate_number_of_votes,nominate_number_of_errors,nokken_poole_dim1,nokken_poole_dim2,state_name
39245,"ROGERS, Mike Dennis",116,House,20301,41,3,AL,200,,R000575,...,,0.374,0.394,-162.02017,0.80868,763.0,68.0,0.52,0.388,Alabama
39246,"SEWELL, Terri",116,House,21102,41,7,AL,100,,S001185,...,,-0.396,0.399,-28.30035,0.96477,789.0,11.0,-0.43,0.384,Alabama
39247,"ROBY, Martha",116,House,21192,41,2,AL,200,,R000591,...,,0.362,0.658,-90.42097,0.88244,723.0,31.0,0.346,0.672,Alabama


In [136]:
fec = load_FEC_data("FEC/")
fec[fec["congress"]==116].head(3)

Unnamed: 0,year_range,state_name,district_code,representative,party,running_as,receipts,contributions_from_individuals,contributions_from_pacs,contributions_and_loans_from_candidate,disbursements,cash_on_hand,debts,congress
22705,2019-2021,Alabama,1,"AVERHART, JAMES",Democratic Party,Open,80094.95,50849.95,0.0,29245.0,78973.24,1121.71,29245.0,116
22706,2019-2021,Alabama,1,"CARL, JERRY LEE, JR",Republican Party,Open,2344516.53,1044195.95,387000.0,760555.5,2232543.94,111972.59,469225.79,116
22707,2019-2021,Alabama,1,"CASTORANI, JOHN",Republican Party,Open,26809.68,11650.0,0.0,15159.68,26809.68,0.0,6559.68,116


In [163]:
def fuzzy_entity_res(rep_1, rep_2):
    """
        Returns an integer prediction of how close two strings are in similarity.
        100 is the highest level of similarity. 0 is the lowest.
    """
    if pd.isna(rep_2):
        return 0
    prediction = fuzz.partial_ratio(rep_1.strip().lower(),rep_2.strip().lower())
    return prediction

def check_subset(row_1, df_2, suffix_1, suffix_2):
    """
        Perform entity resolution on a record in the polarize and census df 
        Only parses a subset of the FEC df which has matches in state, and district
        Then uses The Fuzz(TM) to find the best match within the subset.
    """

    # get subset of df_2 with matches in state and district:
    match_subset = (df_2["district_code"] == row_1["district_code"]) & (df_2["state_name"] == row_1["state_name"])
    df_2_subset = df_2[match_subset]
    
    if len(df_2_subset) == 0:
        print(f"ERROR: no matches in df_2 for district: '{row_1['district_code']}', state: '{row_1['state_name']}'")
        return row_1
    
    # Instantiate the columns in row_1 with the columns from df_2:
    for column in df_2.columns:
        row_1[column] = np.nan

    df_2_subset["distance"] = df_2_subset.apply(lambda row_2: fuzzy_entity_res(row_1[f"representative_{suffix_1}"], row_2[f"representative_{suffix_2}"]), axis=1)

    # display(df_2_subset)

    closest_match_row = df_2_subset[df_2_subset["distance"]==df_2_subset["distance"].max()].iloc[0] # Get closest match
    row_1[f"{suffix_1}-{suffix_2}_closeness"] = df_2_subset['distance'].max()
    if df_2_subset["distance"].max() < 70: # YOUNG, Donald Edwin - YOUNG, DONALD E is scored as 73, this should be a match.
        print(f"no match, closest: {df_2_subset['distance'].max()}, for {row_1[f'representative_{suffix_1}']} - {closest_match_row[f'representative_{suffix_2}']}")
        return row_1
    else: # if we have a match above 70, replace instantiated values with values from row_2
        # print(f"match, on closest: {df_2_subset['distance'].max()}, for {row_1[f'representative_{suffix_1}']} - {closest_match_row[f'representative_{suffix_2}']}")
        for column in df_2_subset.columns:
            row_1[column] = closest_match_row[column]
        return row_1
        
def fuzzy_merge(df_1, df_2, suffix_1, suffix_2):
    # Apply merge algorithm on each record of df_1
    df_1.loc[:, f"representative_{suffix_1}"] = df_1["representative"]
    df_2.loc[:, f"representative_{suffix_2}"] = df_2["representative"]

    # Only include matches, remove all failed matches (NaNs):
    match_df = df_1.apply(lambda x: check_subset(x, df_2, suffix_1, suffix_2), axis=1)
    # return match_df[~pd.isna(match_df["representative"])]
    return match_df

# fec_116 = fec[fec["congress"]==116]
polarization_116 = polarization[polarization["congress"]==116]
test = fuzzy_merge(polarization_116, fec, "polarization", "fec")

### Weird data facts/edge cases:

- **Sander Levin** is a representative from Michigan who at two points in his career represented two different districts in Michigan. From 1983-1993, he represented the **17th** district, and did not recieve any contributions (at least on file with the FEC). In 1993 he retired from the 17th district to campaign and win the **12th** district House seat for Michigan. This seat would later get redistricted to the **9th** district in 2012. In 2017, he announced he would not run for reelection in 2018. Instead, his son, Andy Levin, became his successor as the representative for the 9th district. 

- Alaska, Wyoming, Montana, North Dakota, South Dakota, Vermont, and Delaware all have only **one seat** in the US House of Representatives. Different databases encode this single district number differently, as either 01, or 00. These needed to be manually recoded.

- **Year encodings** make matching on these datasets wildly difficult. The most confusing part of researching this part of my project was understanding electoral cycles (despite being a political science major in undergrad). I quickly realized that trying to match based on year would not work out for several reasons. Not only are electoral cycles measured slightly differently across different sources, but financial records for campaigns often encompass spending in the years leading up to an election and after, making matching much more difficult. This resulted in the current implementation, matching based on name, district, and state, a time-consuming and inefficient method of matching which takes a very long time. 

- The FEC reports finances for *candidates*, while OpenSecrets and our polarization DB both include information for *representatives*. The FEC includes individuals who ran for office, even if they didn't hold office.

- The FEC includes finances for representatives of **non-voting entites** like Guam or the Virgin Islands, which are part of the United States but have no representation in congress. These have to be excluded from our study.



### Data ranges:

- **Polarization**: 1789-2023, representatives
- **FEC**: 1989-2021, candidates
- **OpenSecrets**: 1999-2020, representatives

In [143]:
polarization_116

Unnamed: 0,representative,congress,chamber,icpsr,state_icpsr,district_code,state_abbrev,party_code,last_means,bioguide_id,...,nominate_dim1,nominate_dim2,nominate_log_likelihood,nominate_geo_mean_probability,nominate_number_of_votes,nominate_number_of_errors,nokken_poole_dim1,nokken_poole_dim2,state_name,representative_polarization
39245,"ROGERS, Mike Dennis",116,House,20301,41,3,AL,200,,R000575,...,0.374,0.394,-162.02017,0.80868,763.0,68.0,0.520,0.388,Alabama,"ROGERS, Mike Dennis"
39246,"SEWELL, Terri",116,House,21102,41,7,AL,100,,S001185,...,-0.396,0.399,-28.30035,0.96477,789.0,11.0,-0.430,0.384,Alabama,"SEWELL, Terri"
39247,"ROBY, Martha",116,House,21192,41,2,AL,200,,R000591,...,0.362,0.658,-90.42097,0.88244,723.0,31.0,0.346,0.672,Alabama,"ROBY, Martha"
39248,"BROOKS, Mo",116,House,21193,41,5,AL,200,,B001274,...,0.652,-0.417,-140.71682,0.83962,805.0,57.0,0.772,-0.337,Alabama,"BROOKS, Mo"
39249,"BYRNE, Bradley",116,House,21376,41,1,AL,200,,B001289,...,0.610,0.250,-107.81607,0.85611,694.0,42.0,0.702,0.194,Alabama,"BYRNE, Bradley"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39691,"GALLAGHER, Michael",116,House,21720,25,8,WI,200,,G000579,...,0.425,0.037,-164.99126,0.80913,779.0,71.0,0.411,-0.078,Wisconsin,"GALLAGHER, Michael"
39692,"STEIL, Bryan",116,House,21970,25,1,WI,200,,S001213,...,0.410,0.085,-139.88392,0.83976,801.0,65.0,0.421,-0.061,Wisconsin,"STEIL, Bryan"
39693,"TIFFANY, Thomas P.",116,House,21989,25,7,WI,200,,T000165,...,0.641,-0.222,-17.91626,0.85338,113.0,8.0,0.590,-0.238,Wisconsin,"TIFFANY, Thomas P."
39694,"KIND, Ron",116,House,29769,25,3,WI,100,,K000188,...,-0.260,-0.080,-67.15408,0.91781,783.0,24.0,-0.261,0.178,Wisconsin,"KIND, Ron"


In [157]:
barbara_polarization_mask = polarization["representative"].apply(lambda x: True if "lee, barbara" in x.lower().strip() else False)
polarization[(barbara_polarization_mask) & (polarization["congress"]==116)]

Unnamed: 0,representative,congress,chamber,icpsr,state_icpsr,district_code,state_abbrev,party_code,last_means,bioguide_id,...,nominate_dim1,nominate_dim2,nominate_log_likelihood,nominate_geo_mean_probability,nominate_number_of_votes,nominate_number_of_errors,nokken_poole_dim1,nokken_poole_dim2,state_name,representative_polarization
39318,"LEE, Barbara",116,House,29778,71,13,CA,100,,L000551,...,-0.677,-0.571,-25.80241,0.96834,802.0,11.0,-0.545,-0.828,California,"LEE, Barbara"


In [158]:
barbara_mask = fec["representative"].apply(lambda x: True if "lee, barbara" in x.lower().strip() else False)
fec[(barbara_mask) & (fec["congress"] == 116)]

Unnamed: 0,year_range,state_name,district_code,representative,party,running_as,receipts,contributions_from_individuals,contributions_from_pacs,contributions_and_loans_from_candidate,disbursements,cash_on_hand,debts,congress,representative_fec
22837,2019-2021,California,13,"LEE, BARBARA",Democratic Party,Incumbent,1888202.33,1347704.25,479849.0,0.0,1810132.05,210373.72,4401.75,116,"LEE, BARBARA"


In [167]:
test[test["polarization-fec_closeness"] == test["polarization-fec_closeness"].min()][["polarization-fec_closeness", "representative", "representative_fec", "representative_polarization", "state_name", "district_code"]]

Unnamed: 0,polarization-fec_closeness,representative,representative_fec,representative_polarization,state_name,district_code
39652,73,"LEE, SHEILA JACKSON","LEE, SHEILA JACKSON","JACKSON LEE, Sheila",Texas,18


In [165]:
test[pd.isna(test["representative"])][["polarization-fec_closeness", "representative", "representative_fec", "representative_polarization", "state_name", "district_code"]]

Unnamed: 0,polarization-fec_closeness,representative,representative_fec,representative_polarization,state_name,district_code


In [169]:
test[~pd.isna(test["representative"])]

Unnamed: 0,representative,congress,chamber,icpsr,state_icpsr,district_code,state_abbrev,party_code,last_means,bioguide_id,...,receipts,contributions_from_individuals,contributions_from_pacs,contributions_and_loans_from_candidate,disbursements,cash_on_hand,debts,representative_fec,polarization-fec_closeness,distance
39245,"ROGERS, MICHAEL",108,House,20301,41,3,AL,200,,R000575,...,2121838.37,1194102.11,818567.58,0.00,1893594.35,246299.64,0.00,"ROGERS, MICHAEL",81,81
39246,"SEWELL, TERRI A.",111,House,21102,41,7,AL,100,,S001185,...,1802832.56,1381888.86,357855.00,63000.00,1766658.30,36018.28,1533.73,"SEWELL, TERRI A.",100,100
39247,"ROBY, MARTHA",111,House,21192,41,2,AL,200,,R000591,...,1253557.11,825430.34,361615.70,0.00,1240275.64,13281.47,234.34,"ROBY, MARTHA",100,100
39248,"BROOKS, MO",111,House,21193,41,5,AL,200,,B001274,...,961210.63,565669.74,292995.13,100000.00,910790.40,50420.23,0.00,"BROOKS, MO",100,100
39249,"BYRNE, BRADLEY ROBERTS",113,House,21376,41,1,AL,200,,B001289,...,1907597.44,914507.30,986350.99,0.00,1655998.60,251598.84,0.00,"BYRNE, BRADLEY ROBERTS",100,100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39691,"GALLAGHER, MICHAEL JOHN",114,House,21720,25,8,WI,200,,G000579,...,2765648.26,2138575.53,606126.06,0.00,2703822.81,61825.45,255000.00,"GALLAGHER, MICHAEL JOHN",100,100
39692,"STEIL, BRYAN GEORGE",115,House,21970,25,1,WI,200,,S001213,...,2314018.67,1671550.93,456243.00,795.58,2287662.97,26355.70,138455.34,"STEIL, BRYAN GEORGE",100,100
39693,"TIFFANY, TOM",116,House,21989,25,7,WI,200,,T000165,...,2652454.74,2235625.83,363117.39,0.00,2529735.40,122719.34,0.00,"TIFFANY, TOM",92,92
39694,"KIND, RONALD JAMES",104,House,29769,25,3,WI,100,,K000188,...,501680.00,161925.00,99822.00,68389.00,497919.00,724.00,24449.00,"KIND, RONALD JAMES",100,100


In [168]:
test

Unnamed: 0,representative,congress,chamber,icpsr,state_icpsr,district_code,state_abbrev,party_code,last_means,bioguide_id,...,receipts,contributions_from_individuals,contributions_from_pacs,contributions_and_loans_from_candidate,disbursements,cash_on_hand,debts,representative_fec,polarization-fec_closeness,distance
39245,"ROGERS, MICHAEL",108,House,20301,41,3,AL,200,,R000575,...,2121838.37,1194102.11,818567.58,0.00,1893594.35,246299.64,0.00,"ROGERS, MICHAEL",81,81
39246,"SEWELL, TERRI A.",111,House,21102,41,7,AL,100,,S001185,...,1802832.56,1381888.86,357855.00,63000.00,1766658.30,36018.28,1533.73,"SEWELL, TERRI A.",100,100
39247,"ROBY, MARTHA",111,House,21192,41,2,AL,200,,R000591,...,1253557.11,825430.34,361615.70,0.00,1240275.64,13281.47,234.34,"ROBY, MARTHA",100,100
39248,"BROOKS, MO",111,House,21193,41,5,AL,200,,B001274,...,961210.63,565669.74,292995.13,100000.00,910790.40,50420.23,0.00,"BROOKS, MO",100,100
39249,"BYRNE, BRADLEY ROBERTS",113,House,21376,41,1,AL,200,,B001289,...,1907597.44,914507.30,986350.99,0.00,1655998.60,251598.84,0.00,"BYRNE, BRADLEY ROBERTS",100,100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39691,"GALLAGHER, MICHAEL JOHN",114,House,21720,25,8,WI,200,,G000579,...,2765648.26,2138575.53,606126.06,0.00,2703822.81,61825.45,255000.00,"GALLAGHER, MICHAEL JOHN",100,100
39692,"STEIL, BRYAN GEORGE",115,House,21970,25,1,WI,200,,S001213,...,2314018.67,1671550.93,456243.00,795.58,2287662.97,26355.70,138455.34,"STEIL, BRYAN GEORGE",100,100
39693,"TIFFANY, TOM",116,House,21989,25,7,WI,200,,T000165,...,2652454.74,2235625.83,363117.39,0.00,2529735.40,122719.34,0.00,"TIFFANY, TOM",92,92
39694,"KIND, RONALD JAMES",104,House,29769,25,3,WI,100,,K000188,...,501680.00,161925.00,99822.00,68389.00,497919.00,724.00,24449.00,"KIND, RONALD JAMES",100,100
