### Scrape House candidate face urls from ballotpedia

Note: 
- The output file will be different using the updated wmpcand_XXXXXX_wmpid.csv file
- Our RA Jasmine used the output of this script to clean up face urls
- DO NOT 'run all' this script, as it involves manual check for each state. 

In [1]:
import pandas as pd
import bs4 as bs # pulling data out of HTML and XML files.
import urllib.request # opening and reading URLs
import re # pattern matching

In [41]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [2]:
#Download the website's html
source = urllib.request.urlopen("https://ballotpedia.org/United_States_House_of_Representatives_elections,_2022").read()

In [3]:
#Parse the html
#This makes it look nice and structured for us, and much easier to read for the program
soup = bs.BeautifulSoup(source)

In [4]:
urls_css = "center small a"

In [5]:
urls = soup.select(urls_css)

In [6]:
extracted_urls = [url["href"] for url in urls]
len(extracted_urls)

50

In [7]:
full_urls = ["https://ballotpedia.org" + url_suffix for url_suffix in extracted_urls]

In [8]:
full_urls

['https://ballotpedia.org/United_States_House_of_Representatives_elections_in_Alabama,_2022',
 'https://ballotpedia.org/United_States_House_of_Representatives_election_in_Alaska,_2022',
 'https://ballotpedia.org/United_States_House_of_Representatives_elections_in_Arizona,_2022',
 'https://ballotpedia.org/United_States_House_of_Representatives_elections_in_Arkansas,_2022',
 'https://ballotpedia.org/United_States_House_of_Representatives_elections_in_California,_2022',
 'https://ballotpedia.org/United_States_House_of_Representatives_elections_in_Colorado,_2022',
 'https://ballotpedia.org/United_States_House_of_Representatives_elections_in_Connecticut,_2022',
 'https://ballotpedia.org/United_States_House_of_Representatives_election_in_Delaware,_2022',
 'https://ballotpedia.org/United_States_House_of_Representatives_elections_in_Florida,_2022',
 'https://ballotpedia.org/United_States_House_of_Representatives_elections_in_Georgia,_2022',
 'https://ballotpedia.org/United_States_House_of_Repr

#### Distinguish two types of html structure for different states

In [9]:
urls_1 = []
urls_2 = []

for i in full_urls:
  source = urllib.request.urlopen(i).read()
  soup = bs.BeautifulSoup(source)
  res_css = "#District_1"
  res = soup.select(res_css)
  if len(res) == 0:
    urls_1.append(i)
  else:
    urls_2.append(i)

In [10]:
urls_1

['https://ballotpedia.org/United_States_House_of_Representatives_election_in_Alaska,_2022',
 'https://ballotpedia.org/United_States_House_of_Representatives_election_in_Delaware,_2022',
 'https://ballotpedia.org/United_States_House_of_Representatives_election_in_North_Dakota,_2022',
 'https://ballotpedia.org/United_States_House_of_Representatives_election_in_South_Dakota,_2022',
 'https://ballotpedia.org/United_States_House_of_Representatives_election_in_Vermont,_2022',
 'https://ballotpedia.org/United_States_House_of_Representatives_election_in_Wyoming,_2022']

In [11]:
len(urls_2)

44

### Load the master file

In [14]:
h = pd.read_csv("../datasets/candidates/wmpcand_101422_wmpid.csv")

In [15]:
h.columns

Index(['wmpid', 'genelect_cd', 'CurrCand', 'cand_name', 'cand_id',
       'cand_office', 'cand_office_st', 'cand_office_dist',
       'cand_party_affiliation', 'cand_incumbent_challenger_open_s',
       'dateadded_cd', 'office_wapo', 'result_wapo', 'primarydate_wapo',
       'trumpe_wapo', 'gender_wmp', 'gender_crp', 'latino_wmp', 'latino_crp',
       'race_wmp', 'race_crp1', 'race_crp2', 'race_crpmena', 'hse_cmpt_gen',
       'full_name', 'first_name', 'last_name'],
      dtype='object')

In [16]:
h.genelect_cd.value_counts(dropna=False)

0    2929
1    1366
Name: genelect_cd, dtype: int64

In [17]:
h = h.loc[(h['genelect_cd']==1)&(h['cand_office']=='H')]

In [18]:
h.shape

(1193, 27)

In [19]:
h = h[['wmpid', 'cand_name', 'full_name', 'cand_office_st', 'cand_office_dist', 'cand_party_affiliation']]

In [20]:
h.shape

(1193, 6)

In [21]:
len(h.full_name.unique())

1193

In [22]:
h.head()

Unnamed: 0,wmpid,cand_name,full_name,cand_office_st,cand_office_dist,cand_party_affiliation
0,WMPID21,"CARL, JERRY LEE, JR",Jerry Carl,AL,1.0,REP
1,WMPID24,"HARVEY-HALL, PHYLLIS",Phyllis Harvey-Hall,AL,2.0,DEM
2,WMPID27,"SEWELL, TERRI A.",Terri A Sewell,AL,7.0,DEM
3,WMPID28,"CRAWFORD, ERIC ALAN RICK",Rick Crawford,AR,1.0,REP
4,WMPID30,"WOMACK, STEVE",Steve Womack,AR,3.0,REP


### Scape house candidates' face urls and fuzzy match with candidates in the masterfile at the state level

#### 1. States with one layer

- Here is exmaple code for one state Wyoming
- For each state, manually check the matching results to see if they're ok
- Once finishing all six states, combine them 

In [25]:
d1 = pd.DataFrame(columns=['bp_name_raw', 'bp_url'])

In [26]:
for i in urls_1[5:6]:
  print(i)
  images = []
  bp_names = []
  source = urllib.request.urlopen(i).read()
  soup = bs.BeautifulSoup(source)
  img_css = ".image-candidate-thumbnail"
  img = soup.select(img_css)
  img = img[0:5]
  images.extend(list([i['src'] for i in img]))
  n_css = "td.votebox-results-cell--text"
  nm = soup.select(n_css)
  nm = nm[0:5]
  bp_names.extend(list([url.text for url in nm]))
  dic = {'bp_name_raw':bp_names,'bp_url':images}
  df = pd.DataFrame(dic)
  d1=d1.append(df)

https://ballotpedia.org/United_States_House_of_Representatives_election_in_Wyoming,_2022


In [27]:
d1

Unnamed: 0,bp_name_raw,bp_url
0,Harriet Hageman (R) \n\t\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...
1,Lynnette Grey Bull (D) \n\t\t\t\t\t\t\t\t\t\t...,https://s3.amazonaws.com/ballotpedia-api4/file...
2,Richard Brubaker (L) \n\t\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...
3,Marissa Selvig (Constitution Party),https://s3.amazonaws.com/ballotpedia-api4/file...
4,Other/Write-in votes,https://s3.amazonaws.com/ballotpedia-api4/file...


In [28]:
df2 = d1['bp_name_raw'].str.rsplit("(", n=1, expand=True)

In [29]:
df2 = df2.rename(columns={0:'name'})

In [30]:
df3 =df2['name'].str.rsplit("(", n=1, expand=True)
df3 = df3.rename(columns={0:'name'})

In [31]:
df3['bp_name'] = df3['name'].str.replace("\n", "").replace("\t", "")

In [32]:
df3['bp_name']

0        Harriet Hageman 
1     Lynnette Grey Bull 
2       Richard Brubaker 
3         Marissa Selvig 
4    Other/Write-in votes
Name: bp_name, dtype: object

In [33]:
df['bp_name']=df3['bp_name']

In [34]:
df['bp_url_filename'] = df['bp_url'].str.replace("https://s3.amazonaws.com/ballotpedia-api4/files/thumbs/100/100/", '')

In [35]:
df['bp_name'] = df['bp_name'].str.strip()

In [36]:
df = df.drop_duplicates(subset=['bp_name'], keep='first')

In [37]:
df

Unnamed: 0,bp_name_raw,bp_url,bp_name,bp_url_filename
0,Harriet Hageman (R) \n\t\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Harriet Hageman,Harriet-Hageman.PNG
1,Lynnette Grey Bull (D) \n\t\t\t\t\t\t\t\t\t\t...,https://s3.amazonaws.com/ballotpedia-api4/file...,Lynnette Grey Bull,LynnetteGreyBullWY.png
2,Richard Brubaker (L) \n\t\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Richard Brubaker,Richard_Brubaker.jpg
3,Marissa Selvig (Constitution Party),https://s3.amazonaws.com/ballotpedia-api4/file...,Marissa Selvig,Marissa_Selvig.PNG
4,Other/Write-in votes,https://s3.amazonaws.com/ballotpedia-api4/file...,Other/Write-in votes,LynnetteGreyBullWY.png


In [38]:
WY = h.loc[h['cand_office_st']=='WY']

In [39]:
WY

Unnamed: 0,wmpid,cand_name,full_name,cand_office_st,cand_office_dist,cand_party_affiliation
369,WMPID767,"GREYBULL, LYNNETTE",Lynnette Grey Bull,WY,0.0,DEM
3155,WMPID5149,"BRUBAKER, RICHARD P.",Richard Brubaker,WY,0.0,LIB
3157,WMPID4061,"HAGEMAN, HARRIET",Harriet Hageman,WY,0.0,REP
3159,WMPID4514,"SELVIG, MARISSA JOY",Marissa Joy Selvig,WY,0.0,CON


In [43]:
def fuzzy_merge(df_1, df_2, key1, key2, threshold=90, limit=2):
    """
    :param df_1: the left table to join
    :param df_2: the right table to join
    :param key1: key column of the left table
    :param key2: key column of the right table
    :param threshold: how close the matches should be to return a match, based on Levenshtein distance
    :param limit: the amount of matches that will get returned, these are sorted high to low
    :return: dataframe with boths keys and matches
    """
    s = df_2[key2].tolist()
    
    m = df_1[key1].apply(lambda x: process.extract(x, s, limit=limit))    
    df_1['matches'] = m
    
    m2 = df_1['matches'].apply(lambda x: ', '.join([i[0] for i in x if i[1] >= threshold]))
    df_1['matches'] = m2
    
    return df_1

In [44]:
wy = fuzzy_merge(WY, df, 'full_name', 'bp_name', threshold=90)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [45]:
wy.head()

Unnamed: 0,wmpid,cand_name,full_name,cand_office_st,cand_office_dist,cand_party_affiliation,matches
369,WMPID767,"GREYBULL, LYNNETTE",Lynnette Grey Bull,WY,0.0,DEM,Lynnette Grey Bull
3155,WMPID5149,"BRUBAKER, RICHARD P.",Richard Brubaker,WY,0.0,LIB,Richard Brubaker
3157,WMPID4061,"HAGEMAN, HARRIET",Harriet Hageman,WY,0.0,REP,Harriet Hageman
3159,WMPID4514,"SELVIG, MARISSA JOY",Marissa Joy Selvig,WY,0.0,CON,Marissa Selvig


In [46]:
wy= wy.merge(df, left_on='matches', right_on='bp_name', indicator=True, how='left')

In [47]:
wy.head()

Unnamed: 0,wmpid,cand_name,full_name,cand_office_st,cand_office_dist,cand_party_affiliation,matches,bp_name_raw,bp_url,bp_name,bp_url_filename,_merge
0,WMPID767,"GREYBULL, LYNNETTE",Lynnette Grey Bull,WY,0.0,DEM,Lynnette Grey Bull,Lynnette Grey Bull (D) \n\t\t\t\t\t\t\t\t\t\t...,https://s3.amazonaws.com/ballotpedia-api4/file...,Lynnette Grey Bull,LynnetteGreyBullWY.png,both
1,WMPID5149,"BRUBAKER, RICHARD P.",Richard Brubaker,WY,0.0,LIB,Richard Brubaker,Richard Brubaker (L) \n\t\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Richard Brubaker,Richard_Brubaker.jpg,both
2,WMPID4061,"HAGEMAN, HARRIET",Harriet Hageman,WY,0.0,REP,Harriet Hageman,Harriet Hageman (R) \n\t\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Harriet Hageman,Harriet-Hageman.PNG,both
3,WMPID4514,"SELVIG, MARISSA JOY",Marissa Joy Selvig,WY,0.0,CON,Marissa Selvig,Marissa Selvig (Constitution Party),https://s3.amazonaws.com/ballotpedia-api4/file...,Marissa Selvig,Marissa_Selvig.PNG,both


Quick check if matches are ok

In [48]:
wy.loc[wy._merge=='both']

Unnamed: 0,wmpid,cand_name,full_name,cand_office_st,cand_office_dist,cand_party_affiliation,matches,bp_name_raw,bp_url,bp_name,bp_url_filename,_merge
0,WMPID767,"GREYBULL, LYNNETTE",Lynnette Grey Bull,WY,0.0,DEM,Lynnette Grey Bull,Lynnette Grey Bull (D) \n\t\t\t\t\t\t\t\t\t\t...,https://s3.amazonaws.com/ballotpedia-api4/file...,Lynnette Grey Bull,LynnetteGreyBullWY.png,both
1,WMPID5149,"BRUBAKER, RICHARD P.",Richard Brubaker,WY,0.0,LIB,Richard Brubaker,Richard Brubaker (L) \n\t\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Richard Brubaker,Richard_Brubaker.jpg,both
2,WMPID4061,"HAGEMAN, HARRIET",Harriet Hageman,WY,0.0,REP,Harriet Hageman,Harriet Hageman (R) \n\t\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Harriet Hageman,Harriet-Hageman.PNG,both
3,WMPID4514,"SELVIG, MARISSA JOY",Marissa Joy Selvig,WY,0.0,CON,Marissa Selvig,Marissa Selvig (Constitution Party),https://s3.amazonaws.com/ballotpedia-api4/file...,Marissa Selvig,Marissa_Selvig.PNG,both


#### Combine six states' results


In [208]:
#res1 = pd.concat([ak, de, nd, sd, vt, wy])

In [209]:
#res1.shape

(21, 12)

In [211]:
#res1

Unnamed: 0,wmpid,cand_name,full_name,cand_office_st,cand_office_dist,cand_party_affiliation,matches,bp_name_raw,bp_url,bp_name,bp_url_filename,_merge
0,WMPID2014,"PALIN, SARAH",Sarah Palin,AK,0.0,REP,Sarah Palin,Sarah Palin,https://s3.amazonaws.com/ballotpedia-api4/file...,Sarah Palin,Sarah_Palin.PNG,both
1,WMPID4003,"BEGICH, NICHOLAS III",Nicholas III Begich,AK,0.0,REP,Nicholas Begich,Nicholas Begich,https://s3.amazonaws.com/ballotpedia-api4/file...,Nicholas Begich,Nick-Begich.PNG,both
2,WMPID4308,"PELTOLA, MARY",Mary Peltola,AK,0.0,DEM,Mary Peltola,Mary Peltola,https://s3.amazonaws.com/ballotpedia-api4/file...,Mary Peltola,Mary-Peltola.PNG,both
3,WMPID4360,"SWEENEY, TARA M",Tara Sweeney,AK,0.0,REP,,,,,,left_only
0,WMPID5148,"ROGERS, DAVID L.",David Rogers,DE,0.0,OTH,David Rogers,David Rogers (Nonpartisan Party) \n\t\t\t\t\t...,https://s3.amazonaws.com/ballotpedia-api4/file...,David Rogers,DavidRogers.jpg,both
1,WMPID5144,"MCNUTT, CODY",Cody Mcnutt,DE,0.0,LIB,Cody McNutt,Cody McNutt (L) \n\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Cody McNutt,Cody-McNutt.PNG,both
2,WMPID959,"BLUNT ROCHESTER, LISA",Lisa Blunt Rochester,DE,0.0,DEM,Lisa Blunt Rochester,Lisa Blunt Rochester (D) \n\t\t\t\t\t\t\t\t\t...,https://s3.amazonaws.com/ballotpedia-api4/file...,Lisa Blunt Rochester,Lisa_Blunt_Rochester.jpg,both
3,WMPID5139,"WALKER, SCOTT",Scott Walker,DE,0.0,W,Scott Walker,Scott Walker (Independent) (Write-in) \n\t\t\...,https://s3.amazonaws.com/ballotpedia-api4/file...,Scott Walker,Scott_WalkerDE.jpeg,both
4,WMPID1084,"MURPHY, LEE",Lee Murphy,DE,0.0,REP,Lee Murphy,Lee Murphy (R) \n\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Lee Murphy,Lee_Murphy_for_U.S._Congress_Delaware_photo.JPG,both
0,WMPID2318,"HAUGEN, MARK",Mark Haugen,ND,0.0,DEM,Mark Haugen,Mark Haugen \n,https://s3.amazonaws.com/ballotpedia-api4/file...,Mark Haugen,KELLY_ARMSTRONG.jpg,both


#### 2. States with two layers

- Here is exmaple code for one state New_Mexico
- For each state, manually check the matching results to see if they're ok
- Once finishing all 44 states, combine them 

In [49]:
images = []
bp_name_raw = []

In [50]:
for i in urls_2[28:29]:
  print(i)
  source = urllib.request.urlopen(i).read()
  soup = bs.BeautifulSoup(source)
  dist_css = "dd dl dd i a"
  dist = soup.select(dist_css)
  # urls_css = "center small a"
  # urls = soup.select(urls_css)
  extracted_urls = [i["href"] for i in dist]
  st_urls = ["https://ballotpedia.org" + url_suffix for url_suffix in extracted_urls]
  print(st_urls)
  for i in st_urls:
    #if '27s' in i:
    if '27' in i:
        source2 = urllib.request.urlopen(i).read()
        soup2 = bs.BeautifulSoup(source2)
        img_css = ".image-candidate-thumbnail"
        img = soup2.select(img_css)
        img = img[0:5]
        #print(img)
        images.extend(list([i['src'] for i in img]))
        n_css = "td.votebox-results-cell--text"
        nm = soup2.select(n_css)
        nm = nm[0:5]
        #print(nm)
        bp_name_raw.extend(list([url.text for url in nm]))

https://ballotpedia.org/United_States_House_of_Representatives_elections_in_New_Mexico,_2022
['https://ballotpedia.org/New_Mexico%27s_1st_Congressional_District_election,_2022', 'https://ballotpedia.org/New_Mexico%27s_2nd_Congressional_District_election,_2022', 'https://ballotpedia.org/New_Mexico%27s_3rd_Congressional_District_election,_2022', 'https://ballotpedia.org/Race_rating_definitions_and_methods', 'https://ballotpedia.org/Primary_election_competitiveness_in_state_and_federal_government,_2022', 'https://ballotpedia.org/Presidential_election_in_New_Mexico,_2020', 'https://ballotpedia.org/The_Cook_Political_Report%27s_Partisan_Voter_Index']


In [51]:
len(images)

15

In [52]:
len(bp_name_raw)

15

In [53]:
h_dic = {'bp_name_raw':bp_name_raw,'bp_url':images}

In [54]:
df = pd.DataFrame(h_dic)

In [55]:
df2 = df['bp_name_raw'].str.rsplit("(", n=1, expand=True)

In [56]:
df2 = df2.rename(columns={0:'name'})

In [57]:
df3 =df2['name'].str.rsplit("(", n=1, expand=True)
df3 = df3.rename(columns={0:'name'})

In [58]:
df3['bp_name'] = df3['name'].str.replace("\n", "").replace("\t", "")

In [59]:
df3['bp_name']

0       Melanie Ann Stansbury 
1      Michelle Garcia Holmes 
2           Victoria Gonzales 
3        Melanie Ann Stansbury
4       Michelle Garcia Holmes
5             Gabriel Vasquez 
6              Yvette Herrell 
7                 Eliseo Luna 
8              Gabriel Vasquez
9               Darshan Patel 
10     Teresa Leger Fernandez 
11    Alexis Martinez Johnson 
12      Teresa Leger Fernandez
13     Alexis Martinez Johnson
14     Teresa Leger Fernandez 
Name: bp_name, dtype: object

In [60]:
df['bp_name']=df3['bp_name']

In [61]:
df['bp_url_filename'] = df['bp_url'].str.replace("https://s3.amazonaws.com/ballotpedia-api4/files/thumbs/100/100/", '')

In [62]:
df['bp_name'] = df['bp_name'].str.strip()

In [63]:
df = df.drop_duplicates(subset=['bp_name'], keep='first')

In [64]:
df

Unnamed: 0,bp_name_raw,bp_url,bp_name,bp_url_filename
0,Melanie Ann Stansbury (D) \n\t\t\t\t\t\t\t\t\...,https://s3.amazonaws.com/ballotpedia-api4/file...,Melanie Ann Stansbury,_Melanie-Stansbury_.jpg
1,Michelle Garcia Holmes (R) \n\t\t\t\t\t\t\t\t...,https://s3.amazonaws.com/ballotpedia-api4/file...,Michelle Garcia Holmes,w80182230_headshot.jpg
2,Victoria Gonzales (Independent) \n\t\t\t\t\t\...,https://s3.amazonaws.com/ballotpedia-api4/file...,Victoria Gonzales,_Melanie-Stansbury_.jpg
5,Gabriel Vasquez (D) \n\t\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Gabriel Vasquez,GabrielVasquez.jpg
6,Yvette Herrell (R) \n\t\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Yvette Herrell,YvetteHerrell.jpg
7,Eliseo Luna (D) (Write-in),https://s3.amazonaws.com/ballotpedia-api4/file...,Eliseo Luna,EliseoLuna2.jpg
9,Darshan Patel,https://s3.amazonaws.com/ballotpedia-api4/file...,Darshan Patel,Darshan_Patel_2.jpg
10,Teresa Leger Fernandez (D) \n\t\t\t\t\t\t\t\t...,https://s3.amazonaws.com/ballotpedia-api4/file...,Teresa Leger Fernandez,Teresa-Leger-Fernandez.PNG
11,Alexis Martinez Johnson (R) \n\t\t\t\t\t\t\t\...,https://s3.amazonaws.com/ballotpedia-api4/file...,Alexis Martinez Johnson,80182230_alexis_professional.jpg


In [65]:
NM = h.loc[h['cand_office_st']=='NM']

In [66]:
NM.shape

(7, 6)

In [67]:
NM

Unnamed: 0,wmpid,cand_name,full_name,cand_office_st,cand_office_dist,cand_party_affiliation
225,WMPID451,"GARCIA HOLMES, MICHELLE",Michelle Garcia Holmes,NM,1.0,REP
226,WMPID453,"LEGER FERNANDEZ, TERESA",Teresa Leger,NM,3.0,DEM
2092,WMPID3359,"STANSBURY, MELANIE",Melanie Stansbury,NM,1.0,DEM
2108,WMPID4714,"CHICK, CAMERON ALTON MR. SR",Cameron Alton Sr Chick,NM,1.0,IND
2113,WMPID4014,"VASQUEZ, GABRIEL",Gabriel Vasquez,NM,2.0,DEM
2115,WMPID2867,"MARTINEZ JOHNSON, ALEXIS",Alexis Martinez Johnson,NM,3.0,REP
3512,WMPID1184,"HERRELL, STELLA YVETTE",Yvette Herrell,NM,2.0,REP


In [68]:
nm = fuzzy_merge(NM, df, 'full_name', 'bp_name', threshold=90)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [69]:
nm

Unnamed: 0,wmpid,cand_name,full_name,cand_office_st,cand_office_dist,cand_party_affiliation,matches
225,WMPID451,"GARCIA HOLMES, MICHELLE",Michelle Garcia Holmes,NM,1.0,REP,Michelle Garcia Holmes
226,WMPID453,"LEGER FERNANDEZ, TERESA",Teresa Leger,NM,3.0,DEM,Teresa Leger Fernandez
2092,WMPID3359,"STANSBURY, MELANIE",Melanie Stansbury,NM,1.0,DEM,Melanie Ann Stansbury
2108,WMPID4714,"CHICK, CAMERON ALTON MR. SR",Cameron Alton Sr Chick,NM,1.0,IND,
2113,WMPID4014,"VASQUEZ, GABRIEL",Gabriel Vasquez,NM,2.0,DEM,Gabriel Vasquez
2115,WMPID2867,"MARTINEZ JOHNSON, ALEXIS",Alexis Martinez Johnson,NM,3.0,REP,Alexis Martinez Johnson
3512,WMPID1184,"HERRELL, STELLA YVETTE",Yvette Herrell,NM,2.0,REP,Yvette Herrell


In [70]:
nm = nm.merge(df, left_on='matches', right_on='bp_name', indicator=True, how='left')

In [72]:
nm.head()

Unnamed: 0,wmpid,cand_name,full_name,cand_office_st,cand_office_dist,cand_party_affiliation,matches,bp_name_raw,bp_url,bp_name,bp_url_filename,_merge
0,WMPID451,"GARCIA HOLMES, MICHELLE",Michelle Garcia Holmes,NM,1.0,REP,Michelle Garcia Holmes,Michelle Garcia Holmes (R) \n\t\t\t\t\t\t\t\t...,https://s3.amazonaws.com/ballotpedia-api4/file...,Michelle Garcia Holmes,w80182230_headshot.jpg,both
1,WMPID453,"LEGER FERNANDEZ, TERESA",Teresa Leger,NM,3.0,DEM,Teresa Leger Fernandez,Teresa Leger Fernandez (D) \n\t\t\t\t\t\t\t\t...,https://s3.amazonaws.com/ballotpedia-api4/file...,Teresa Leger Fernandez,Teresa-Leger-Fernandez.PNG,both
2,WMPID3359,"STANSBURY, MELANIE",Melanie Stansbury,NM,1.0,DEM,Melanie Ann Stansbury,Melanie Ann Stansbury (D) \n\t\t\t\t\t\t\t\t\...,https://s3.amazonaws.com/ballotpedia-api4/file...,Melanie Ann Stansbury,_Melanie-Stansbury_.jpg,both
3,WMPID4714,"CHICK, CAMERON ALTON MR. SR",Cameron Alton Sr Chick,NM,1.0,IND,,,,,,left_only
4,WMPID4014,"VASQUEZ, GABRIEL",Gabriel Vasquez,NM,2.0,DEM,Gabriel Vasquez,Gabriel Vasquez (D) \n\t\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Gabriel Vasquez,GabrielVasquez.jpg,both


In [73]:
nm._merge.value_counts()

both          6
left_only     1
right_only    0
Name: _merge, dtype: int64

In [74]:
nm.shape

(7, 12)

#### Combine 44 states' results

In [1487]:
# res2 = pd.concat([al, az, ar, ca, co, ct, fl, ga, hi, xid, il, xin, ia, ks, ky,
#                  la, me, md, ma, mi, mn, ms, mo, mt, ne, nv, nh, nj, nm, ny, nc, oh, ok,
#                  xor, pa, ri, sc, tn, tx, ut, va, wa, wv, wi
#                  ])

In [1488]:
#res2.shape

(1265, 12)

In [1489]:
#res2.head()

Unnamed: 0,wmpid,cand_name,full_name,cand_office_st,cand_office_dist,cand_party_affiliation,matches,bp_name_raw,bp_url,bp_name,bp_url_filename,_merge
0,WMPID21,"CARL, JERRY LEE, JR",Jerry Carl,AL,1.0,REP,Jerry Carl,Jerry Carl (R) \n\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Jerry Carl,Jerry-Carl.PNG,both
1,WMPID24,"HARVEY-HALL, PHYLLIS",Phyllis Harvey-Hall,AL,2.0,DEM,Phyllis Harvey-Hall,Phyllis Harvey-Hall (D) \n,https://s3.amazonaws.com/ballotpedia-api4/file...,Phyllis Harvey-Hall,phyllis.jpg,both
2,WMPID27,"SEWELL, TERRI A.",Terri A Sewell,AL,7.0,DEM,Terri Sewell,Terri Sewell (D) \n\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Terri Sewell,Terri_Sewell.jpg,both
3,WMPID4534,"SMITH, JONATHAN",Jonathan Smith,AL,2.0,LIB,,,,,,left_only
4,WMPID769,"ROGERS, MICHAEL",Mike D Rogers,AL,3.0,REP,Mike Rogers,Mike Rogers (R) \n\t\t\t\t\t\t\t\t\t\t\t,https://s3.amazonaws.com/ballotpedia-api4/file...,Mike Rogers,Mike-Rogers.jpg,both


In [1491]:
#res2.shape

(1265, 12)

### Combine all results

In [1509]:
#res = pd.concat([res1, res2])

In [1510]:
#res.head()

Unnamed: 0,wmpid,cand_name,full_name,cand_office_st,cand_office_dist,cand_party_affiliation,matches,bp_name_raw,bp_url,bp_name,bp_url_filename,_merge
0,WMPID2014,"PALIN, SARAH",Sarah Palin,AK,0.0,REP,Sarah Palin,Sarah Palin,https://s3.amazonaws.com/ballotpedia-api4/file...,Sarah Palin,Sarah_Palin.PNG,both
1,WMPID4003,"BEGICH, NICHOLAS III",Nicholas III Begich,AK,0.0,REP,Nicholas Begich,Nicholas Begich,https://s3.amazonaws.com/ballotpedia-api4/file...,Nicholas Begich,Nick-Begich.PNG,both
2,WMPID4308,"PELTOLA, MARY",Mary Peltola,AK,0.0,DEM,Mary Peltola,Mary Peltola,https://s3.amazonaws.com/ballotpedia-api4/file...,Mary Peltola,Mary-Peltola.PNG,both
3,WMPID4360,"SWEENEY, TARA M",Tara Sweeney,AK,0.0,REP,,,,,,left_only
0,WMPID5148,"ROGERS, DAVID L.",David Rogers,DE,0.0,OTH,David Rogers,David Rogers (Nonpartisan Party) \n\t\t\t\t\t...,https://s3.amazonaws.com/ballotpedia-api4/file...,David Rogers,DavidRogers.jpg,both


In [1511]:
#res._merge.value_counts(dropna=False)

both          903
left_only     383
right_only      0
Name: _merge, dtype: int64

In [1512]:
#res.to_csv("./data/bp2022_house_scraped_face.csv", index=False)