In [1]:
import numpy as np
import pandas as pd
import json
import pickle


In [3]:
# Floor-speech ideal points; later cells use this object's .keys() and index it
# by bioguide ID.
# NOTE(review): pickle.load can execute arbitrary code while unpickling -- only
# load a file that was produced by this project's own pipeline.
with open('speeches_results/bid_to_tbip_floor_speeches.pkl', 'rb') as speeches_pkl:
    # The context manager closes the handle; the bare open() call never did.
    bid_to_tbip_speeches = pickle.load(speeches_pkl)
print(len(bid_to_tbip_speeches))

490


In [4]:
# Tweet ideal points; later cells use this object's .keys() and index it by
# bioguide ID.
# NOTE(review): pickle.load can execute arbitrary code while unpickling -- only
# load a file that was produced by this project's own pipeline.
with open('tweets_results/bid_to_tbip_tweets.pkl', 'rb') as tweets_pkl:
    # The context manager closes the handle; the bare open() call never did.
    bid_to_tbip_tweets = pickle.load(tweets_pkl)
print(len(bid_to_tbip_tweets))

471


In [26]:
# Every bioguide ID that has an ideal point from at least one text source
# (floor speeches or tweets), sorted so later iteration order is deterministic.
speech_bids = set(bid_to_tbip_speeches.keys())
tweet_bids = set(bid_to_tbip_tweets.keys())
bioguide_ids_with_speech_or_tweet_tbip = sorted(speech_bids | tweet_bids)
print(len(bioguide_ids_with_speech_or_tweet_tbip))

509


In [27]:
## House members data for Congresses 115/116.
## Crucially, these workbooks contain caucus memberships and leadership
## positions (along with a host of other info); the rest of this code refers to
## them as the "caucus data files".

# sheet_name=None loads every sheet into a dict keyed by sheet name; the member
# table is then selected by name and rows without a bioguide ID are dropped.
sheets_115 = pd.read_excel('supporting_data_files/H115_members.xlsx',
                           sheet_name=None,
                           engine='openpyxl')
members_115 = sheets_115['H115_members (1)']
legis_data_115 = members_115[members_115['bioguide_id'].notnull()]

sheets_116 = pd.read_excel('supporting_data_files/H116_members.xlsx',
                           sheet_name=None,
                           engine='openpyxl')
members_116 = sheets_116['H116_members']
legis_data_116 = members_116[members_116['bioguide_id'].notnull()]

In [28]:
# Bioguide IDs present in each Congress's caucus data file, in file order.
all_caucus_data_115_ids = legis_data_115['bioguide_id'].tolist()
all_caucus_data_116_ids = legis_data_116['bioguide_id'].tolist()
print(len(all_caucus_data_115_ids))
print(len(all_caucus_data_116_ids))

450
437


In [30]:
# Keep only the legislators with an ideal point who also appear in at least one
# caucus data file; the sorted order of the ideal-point ID list is preserved.
s = set(all_caucus_data_115_ids) | set(all_caucus_data_116_ids)
final_bids_to_consider = [
    bid for bid in bioguide_ids_with_speech_or_tweet_tbip if bid in s
]
print(len(final_bids_to_consider))

503


### Loading and adding basic biographical information about legislators

In [31]:
# Index the legislator records by bioguide ID. The file is opened in a `with`
# block so the handle is closed (the bare open() call leaked it).
with open('supporting_data_files/legislator-info-1990-2020.json') as legis_info_file:
    legis_info = json.load(legis_info_file)
legis_id_to_info = {record['id']['bioguide']: record for record in legis_info}
# The raw list is no longer needed once the lookup exists.
del legis_info

In [32]:
# Per-legislator biography lookups, all keyed by bioguide ID.
bid_to_name, bid_to_gender, bid_to_party = {}, {}, {}
bid_to_birth_year = {}
bid_to_seniority = {}  # number of terms in the US House
bid_to_state_district = {}

In [35]:
# Fill the biography lookups from the legislator-info JSON and the caucus files.
ids_in_115 = set(all_caucus_data_115_ids)  # set membership instead of repeated list scans
for bid in final_bids_to_consider:
    x = legis_id_to_info[bid]
    # Name field: prefer the 'ballotpedia' identifier, fall back to 'wikipedia'.
    if 'ballotpedia' in x['id']:
        bid_to_name[bid] = x['id']['ballotpedia']
    else:
        bid_to_name[bid] = x['id']['wikipedia']
    # NOTE(review): party is read from the FIRST recorded term, so a later party
    # switch would not be reflected -- confirm this is intended.
    bid_to_party[bid] = x['terms'][0]['party']
    bid_to_gender[bid] = x['bio']['gender']
    # Birth year comes from the caucus file (115th preferred, else 116th). Both
    # branches are cast to int; previously only the 116th branch was.
    born_source = legis_data_115 if bid in ids_in_115 else legis_data_116
    bid_to_birth_year[bid] = int(list(born_source[born_source['bioguide_id']==bid]['born'])[0])
    # House terms whose end year is 2021 or earlier.
    terms = [z for z in x['terms'] if z['type']=='rep' and int(z['end'][:4])<=2021]
    bid_to_seniority[bid] = len(terms)
    last_term = terms[-1]
    state = last_term['state']
    try:
        d = last_term['district']
    except KeyError:
        # The old bare `except: print(bid); break` left every lookup silently
        # truncated, which only surfaced as a KeyError in a later cell.
        raise KeyError('latest House term of ' + bid + ' has no district')
    # District label: 'ST-AL' for district 0, otherwise a zero-padded number.
    if d==0:
        bid_to_state_district[bid] = state + '-AL'
    elif d<10:
        bid_to_state_district[bid] = state + '-0' + str(d)
    else:
        bid_to_state_district[bid] = state + '-' + str(d)

### Loading and adding data on % of district pop. that voted for Dem/GOP candidate in 2016 presidential election

In [46]:
# District-level presidential vote share, GE 2016, keyed by bioguide ID:
# first dict is for the Dem candidate, second for the GOP candidate.
bid_to_district_pres_vs, bid_to_district_gop_vs = {}, {}

In [47]:
# Daily Kos presidential results by congressional district. skiprows=1 skips the
# file's first line, so the second line supplies the column names.
daily_kos_results_path = 'supporting_data_files/Daily Kos Elections 2008, 2012 & 2016 presidential election results for congressional districts used in 2020 elections - Results.csv'
house_election_shares_df = pd.read_csv(daily_kos_results_path, skiprows=1)

In [48]:
# Map each district label (column CD) to the Clinton and Trump columns.
district_labels = house_election_shares_df['CD']
district_to_pres_vs = dict(zip(district_labels, house_election_shares_df['Clinton']))
district_to_gop_vs = dict(zip(district_labels, house_election_shares_df['Trump']))
# Only the two lookups are needed from here on.
del house_election_shares_df

In [49]:
# Attach the district-level presidential vote shares to each legislator via the
# legislator's district label.
for bid in final_bids_to_consider:
    sd = bid_to_state_district[bid]
    dem_share, gop_share = district_to_pres_vs[sd], district_to_gop_vs[sd]
    bid_to_district_pres_vs[bid] = dem_share
    bid_to_district_gop_vs[bid] = gop_share

### Loading and adding data for DW-NOMINATE scores, caucus memberships, and leadership positions

In [54]:
# DW-NOMINATE dimensions, keyed by bioguide ID.
bid_to_dwnom1, bid_to_dwnom2 = {}, {}

# 115th Congress: presence flag, caucus memberships, leadership/committee roles.
bid_to_cong_presence_115 = {}
bid_to_progressive_115, bid_to_bluedog_115, bid_to_newdem_115 = {}, {}, {}
bid_to_problemsolvers_115, bid_to_freedom_115, bid_to_rsc_115 = {}, {}, {}
bid_to_gop_lead_115, bid_to_dem_lead_115 = {}, {}
bid_to_top_comm_115, bid_to_comm_chair_115 = {}, {}

# 116th Congress: same set of lookups.
bid_to_cong_presence_116 = {}
bid_to_progressive_116, bid_to_bluedog_116, bid_to_newdem_116 = {}, {}, {}
bid_to_problemsolvers_116, bid_to_freedom_116, bid_to_rsc_116 = {}, {}, {}
bid_to_gop_lead_116, bid_to_dem_lead_116 = {}, {}
bid_to_top_comm_116, bid_to_comm_chair_116 = {}, {}

In [57]:
# Independent shallow copies of the caucus-file ID lists.
all_116_ids = list(all_caucus_data_116_ids)
all_115_ids = list(all_caucus_data_115_ids)

In [58]:
def _membership_flag(member_rows, column):
    """Return 1 if the first row of `member_rows[column]` equals 1, else 0.

    `member_rows` is None when the legislator is absent from that Congress's
    caucus file; the flag is then 0.
    """
    if member_rows is None:
        return 0
    return 1 if list(member_rows[column])[0] == 1 else 0


# One entry per 0/1 flag: (115th column, 115th lookup, 116th column, 116th lookup).
# The column spellings differ between the two caucus workbooks.
caucus_flag_specs = [
    ('progressive', bid_to_progressive_115, 'Progressive', bid_to_progressive_116),
    ('bluedog', bid_to_bluedog_115, 'BlueDog', bid_to_bluedog_116),
    ('newdems', bid_to_newdem_115, 'NewDemocrat', bid_to_newdem_116),
    ('freedom', bid_to_freedom_115, 'Freedom', bid_to_freedom_116),
    ('rsc', bid_to_rsc_115, 'RSC', bid_to_rsc_116),
    ('problemsolvers', bid_to_problemsolvers_115, 'ProblemSolvers', bid_to_problemsolvers_116),
    ('GOPleadership', bid_to_gop_lead_115, 'GOPLeadership', bid_to_gop_lead_116),
    ('DEMleadership', bid_to_dem_lead_115, 'DemLeadership', bid_to_dem_lead_116),
    ('CommitteeChair', bid_to_comm_chair_115, 'CommitteeChair', bid_to_comm_chair_116),
    ('TopCommittee', bid_to_top_comm_115, 'TopCommittee', bid_to_top_comm_116),
]

ids_115, ids_116 = set(all_115_ids), set(all_116_ids)
for bid in final_bids_to_consider:
    # Rebind both frames on every iteration so rows from the previous legislator
    # can never leak into this one.
    df115 = legis_data_115[legis_data_115['bioguide_id']==bid] if bid in ids_115 else None
    df116 = legis_data_116[legis_data_116['bioguide_id']==bid] if bid in ids_116 else None
    present_115 = 1 if df115 is not None else 0
    present_116 = 1 if df116 is not None else 0

    # DW-NOMINATE: prefer the 116th-Congress row, fall back to the 115th.
    nominate_rows = df116 if df116 is not None else df115
    if nominate_rows is None:
        # Previously this case reused the previous legislator's rows.
        raise ValueError(bid + ' is in neither caucus data file')
    bid_to_dwnom1[bid] = list(nominate_rows['nominate_dim1'])[0]
    bid_to_dwnom2[bid] = list(nominate_rows['nominate_dim2'])[0]
    bid_to_cong_presence_115[bid] = present_115
    bid_to_cong_presence_116[bid] = present_116

    # A flag is 1 only when the legislator is present in that Congress AND the
    # column value equals 1; anything else (including a missing value) is 0.
    for col_115, lookup_115, col_116, lookup_116 in caucus_flag_specs:
        lookup_115[bid] = _membership_flag(df115, col_115)
        lookup_116[bid] = _membership_flag(df116, col_116)

### Loading and adding data for each legislator's % vote share in the 2016/2018 House elections in their district

In [87]:
# House general-election returns; only rows from 2016 onward are kept.
all_house_results = pd.read_csv('supporting_data_files/1976-2018-house3.csv',
                                encoding='ISO-8859-1')
house_elec_results = all_house_results[all_house_results['year'] >= 2016]


In [88]:
def _term_starting_in(terms, year):
    """Return the first term whose 'start' date begins with `year` (a 4-character
    string), or None when there is no such term."""
    for t in terms:
        if t['start'][:4] == year:
            return t
    return None


def _house_vote_share(results, election_year, term, party, name):
    """Percent vote share for one legislator in one House election.

    Rows of `results` are restricted to the term's state and district and to
    `election_year`. Rows whose 'party' contains `party` are used; if there are
    none, rows whose 'candidate' equals `name` upper-cased are used instead.
    Returns 100 * sum(candidatevotes) / sum(totalvotes), or NaN when no row
    matches or the total is zero.

    NOTE(review): when several rows match, totalvotes is summed once per row --
    confirm this is the intended share for multi-row matches.
    """
    rows = results[(results['state_po'] == term['state'])
                   & (results['district'] == term['district'])
                   & (results['year'] == election_year)]
    matched = rows[rows['party'].str.contains(party, na=False)]
    if not len(matched):
        matched = rows[rows['candidate'] == name.upper()]
    if not len(matched):
        return np.nan
    total_votes = sum(matched['totalvotes'])
    if total_votes == 0:
        return np.nan
    return 100 * (sum(matched['candidatevotes']) / total_votes)


bid_to_house_elec_vote_share_2016 = {}
bid_to_house_elec_vote_share_2018 = {}
for bid in final_bids_to_consider:
    party = bid_to_party[bid].upper()
    terms = legis_id_to_info[bid]['terms']
    name = bid_to_name[bid].split(' (')[0]

    # (present in Congress?, term start year, election year, target lookup)
    for present, start_year, election_year, shares in (
        (bid_to_cong_presence_115[bid], '2017', 2016, bid_to_house_elec_vote_share_2016),
        (bid_to_cong_presence_116[bid], '2019', 2018, bid_to_house_elec_vote_share_2018),
    ):
        voteshare = np.nan
        if present:
            # Looked up afresh for every legislator: the old loop kept `term`
            # from a previous legislator when no term started in this year, and
            # so read results for someone else's district.
            term = _term_starting_in(terms, start_year)
            if term is not None:
                # Filtering on the election year matters for 2016 too: the
                # results frame holds both 2016 and 2018 rows, and the old 2016
                # branch summed them together.
                voteshare = _house_vote_share(house_elec_results, election_year,
                                              term, party, name)
            if np.isnan(voteshare):
                # Report and continue; the old try/except/break stopped the whole
                # loop and left both lookups incomplete.
                print('no ' + str(election_year) + ' House result for ' + bid + ' (' + party + ')')
        shares[bid] = voteshare
del house_elec_results

### Loading and adding district-specific data

In [104]:
# CityLab district density data
# (from https://github.com/theatlantic/citylab-data/blob/master/citylab-congress/citylab_cdi.csv)
citylab_cdi = pd.read_csv('supporting_data_files/citylab_cdi.csv')
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
citylab_cdi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 435 entries, 0 to 434
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   CD                435 non-null    object 
 1   Cluster           435 non-null    object 
 2   Very low density  435 non-null    float64
 3   Low density       435 non-null    float64
 4   Medium density    435 non-null    float64
 5   High density      435 non-null    float64
dtypes: float64(4), object(2)
memory usage: 20.5+ KB
None


In [105]:
# District label (CD) -> density category (Cluster).
district_to_density = dict(zip(citylab_cdi['CD'], citylab_cdi['Cluster']))

In [106]:
# District-level census table for the 115th Congress.
census_115 = pd.read_csv('supporting_data_files/CensusMerge_115.csv')
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
census_115.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 437 entries, 0 to 436
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      437 non-null    object 
 1   Geographic.Area.Name    437 non-null    object 
 2   TotalPop                437 non-null    int64  
 3   MalePop                 437 non-null    int64  
 4   FemalePop               437 non-null    int64  
 5   MedianAge               437 non-null    float64
 6   SeniorPop               437 non-null    int64  
 7   WhitePop                437 non-null    int64  
 8   WhitePercent            437 non-null    float64
 9   BlackPop                437 non-null    int64  
 10  BlackPercent            437 non-null    float64
 11  AmIndianPop             437 non-null    int64  
 12  AmIndianPercent         437 non-null    float64
 13  AsianPop                437 non-null    int64  
 14  AsianPercent            437 non-null    fl

In [107]:
# District-level census table for the 116th Congress.
census_116 = pd.read_csv('supporting_data_files/CensusMerge_116.csv')
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
census_116.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 437 entries, 0 to 436
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      437 non-null    object 
 1   Geographic.Area.Name    437 non-null    object 
 2   TotalPop                437 non-null    int64  
 3   TotalMale               437 non-null    int64  
 4   TotalFemale             437 non-null    int64  
 5   MedianAge               437 non-null    float64
 6   SeniorPop               437 non-null    int64  
 7   WhitePop                437 non-null    int64  
 8   WhitePercent            437 non-null    float64
 9   BlackPop                437 non-null    int64  
 10  BlackPercent            437 non-null    float64
 11  AmIndianPop             437 non-null    int64  
 12  AmIndianPercent         437 non-null    float64
 13  AsianPop                437 non-null    int64  
 14  AsianPercentage         437 non-null    fl

In [108]:
# State lookup table; the next cell reads its `State` and `Code` columns.
state_code = pd.read_csv('supporting_data_files/state_abbr_code.csv')

In [109]:
# Map each value of the State column to the matching value of the Code column.
state_to_code = dict(zip(state_code['State'], state_code['Code']))

In [110]:
def census_geographical_region_to_district(s, state_codes=None):
    """Convert a census geographic-area name into a district label.

    The state is the text after the first ', ' in `s` and is translated through
    `state_codes`. If `s` contains 'at Large' the label is '<code>-AL';
    otherwise the third whitespace-separated token of `s` is parsed as the
    district number and zero-padded to two digits ('<code>-07', '<code>-12').

    Args:
        s: area name, presumably of the form
            'Congressional District 7 (116th Congress), Ohio' -- TODO confirm
            against the census CSV.
        state_codes: optional mapping from state name to code. Defaults to the
            notebook-level `state_to_code`, so existing one-argument calls
            behave as before.

    Raises:
        KeyError: if the state name is not in the mapping.
        IndexError, ValueError: if `s` does not have the expected shape.
    """
    if state_codes is None:
        # Fall back to the notebook-level lookup.
        state_codes = state_to_code
    state_name = s.split(', ')[1]
    abbreviation = state_codes[state_name]
    if 'at Large' in s:
        return abbreviation + '-AL'
    district_number = int(s.split()[2])
    separator = '-0' if district_number < 10 else '-'
    return abbreviation + separator + str(district_number)

In [111]:
def get_district_to_data_dics_from_census_data_csv(df):
    """Build district-label -> value lookups from a census DataFrame.

    The two census files spell some columns differently (e.g. 'TotalMale' vs
    'MalePop', 'AsianPercent' vs 'AsianPercentage'); each such field is read
    from whichever spelling the frame has. The last two rows of every column
    are dropped before use (NOTE(review): presumably non-district rows --
    confirm against the CSVs). The number of districts is printed.

    Returns:
        A 14-tuple of dicts keyed by district label, in this order: male %,
        female %, senior %, median age, white %, black %, Asian %,
        American Indian %, Hispanic %, unemployment rate, median income,
        mean income, difference in median income, difference in mean income.

    Raises:
        KeyError: if none of the accepted spellings of a column is present.
    """
    def column(*candidate_names):
        # First spelling that exists in the frame wins. This replaces bare
        # `except:` fallbacks, which would also have hidden unrelated errors.
        for candidate in candidate_names:
            if candidate in df.columns:
                return list(df[candidate])[:-2]
        raise KeyError(candidate_names[-1])

    def percent_of(part, whole):
        return [round(100*(x/y), 2) for x, y in zip(part, whole)]

    geog_area = column('Geographic.Area.Name')
    total_pop = column('TotalPop')

    # Population shares are derived from counts; the rest are read directly.
    male_percs = percent_of(column('TotalMale', 'MalePop'), total_pop)
    female_percs = percent_of(column('TotalFemale', 'FemalePop'), total_pop)
    senior_percs = percent_of(column('SeniorPop'), total_pop)

    districts = [census_geographical_region_to_district(g) for g in geog_area]
    print(len(districts))

    # Listed in the order of the returned tuple.
    per_district_values = [
        male_percs,
        female_percs,
        senior_percs,
        column('MedianAge'),
        column('WhitePercent'),
        column('BlackPercent'),
        column('AsianPercent', 'AsianPercentage'),
        column('AmIndianPercent'),
        column('HispanicPercent', 'HispanicPercentage'),
        column('UnemployedRate', 'UnemploymentRate'),
        column('DistrictMedianIncome'),
        column('DistrictMeanIncome'),
        column('DifferenceMedianIncome'),
        column('DifferenceMeanIncome'),
    ]
    return tuple(dict(zip(districts, values)) for values in per_district_values)

In [112]:
# Unpack the 14 district lookups for the 115th-Congress census table.
(
    district_to_male_percs_115,
    district_to_female_percs_115,
    district_to_senior_percs_115,
    district_to_median_ages_115,
    district_to_white_percs_115,
    district_to_black_percs_115,
    district_to_asian_percs_115,
    district_to_amindian_percs_115,
    district_to_hispanic_percs_115,
    district_to_unemployment_rates_115,
    district_to_median_incomes_115,
    district_to_mean_incomes_115,
    district_to_diff_median_incomes_115,
    district_to_diff_mean_incomes_115,
) = get_district_to_data_dics_from_census_data_csv(census_115)


435


In [113]:
# Unpack the 14 district lookups for the 116th-Congress census table.
(
    district_to_male_percs_116,
    district_to_female_percs_116,
    district_to_senior_percs_116,
    district_to_median_ages_116,
    district_to_white_percs_116,
    district_to_black_percs_116,
    district_to_asian_percs_116,
    district_to_amindian_percs_116,
    district_to_hispanic_percs_116,
    district_to_unemployment_rates_116,
    district_to_median_incomes_116,
    district_to_mean_incomes_116,
    district_to_diff_median_incomes_116,
    district_to_diff_mean_incomes_116,
) = get_district_to_data_dics_from_census_data_csv(census_116)


435


In [114]:
# District attributes per legislator, keyed by bioguide ID.
bid_to_district_density = {}  # how urban/rural a district is
bid_to_district_male_perc, bid_to_district_female_perc = {}, {}
bid_to_district_senior_perc, bid_to_district_median_age = {}, {}
bid_to_district_white_perc, bid_to_district_black_perc = {}, {}
bid_to_district_asian_perc, bid_to_district_amindian_perc = {}, {}
bid_to_district_hispanic_perc = {}
bid_to_district_unemp_rate = {}
bid_to_district_median_income, bid_to_district_mean_income = {}, {}
bid_to_district_diff_median_income, bid_to_district_diff_mean_income = {}, {}

In [115]:
# (target lookup, 116th-Congress source, 115th-Congress source), in the order
# the values are written for each legislator.
census_field_sources = [
    (bid_to_district_female_perc, district_to_female_percs_116, district_to_female_percs_115),
    (bid_to_district_male_perc, district_to_male_percs_116, district_to_male_percs_115),
    (bid_to_district_senior_perc, district_to_senior_percs_116, district_to_senior_percs_115),
    (bid_to_district_median_age, district_to_median_ages_116, district_to_median_ages_115),
    (bid_to_district_white_perc, district_to_white_percs_116, district_to_white_percs_115),
    (bid_to_district_black_perc, district_to_black_percs_116, district_to_black_percs_115),
    (bid_to_district_asian_perc, district_to_asian_percs_116, district_to_asian_percs_115),
    (bid_to_district_amindian_perc, district_to_amindian_percs_116, district_to_amindian_percs_115),
    (bid_to_district_hispanic_perc, district_to_hispanic_percs_116, district_to_hispanic_percs_115),
    (bid_to_district_unemp_rate, district_to_unemployment_rates_116, district_to_unemployment_rates_115),
    (bid_to_district_median_income, district_to_median_incomes_116, district_to_median_incomes_115),
    (bid_to_district_diff_median_income, district_to_diff_median_incomes_116, district_to_diff_median_incomes_115),
    (bid_to_district_mean_income, district_to_mean_incomes_116, district_to_mean_incomes_115),
    (bid_to_district_diff_mean_income, district_to_diff_mean_incomes_116, district_to_diff_mean_incomes_115),
]

for bid in final_bids_to_consider:
    sd = bid_to_state_district[bid]
    bid_to_district_density[bid] = district_to_density[sd]
    # Members present in the 116th Congress use the 116th census table;
    # everyone else uses the 115th.
    in_116 = bid_to_cong_presence_116[bid]
    for target, source_116, source_115 in census_field_sources:
        target[bid] = source_116[sd] if in_116 else source_115[sd]

### Adding ideal point values data

In [117]:
def standardize(x):
    """Center x on its mean and scale by its standard deviation.

    Both statistics are computed with NaN entries ignored (np.nanmean /
    np.nanstd); NaN entries of x stay NaN in the result.
    """
    center = np.nanmean(x)
    spread = np.nanstd(x)
    return (x - center) / spread

In [138]:
bid_to_speech_tbip, bid_to_twitter_tbip = {}, {}
for bid in final_bids_to_consider:
    # Speech ideal points are stored with their sign flipped; tweet ideal points
    # are stored as-is. A legislator missing from a source gets NaN.
    has_speech = bid in bid_to_tbip_speeches
    has_tweets = bid in bid_to_tbip_tweets
    bid_to_speech_tbip[bid] = -1* bid_to_tbip_speeches[bid] if has_speech else np.nan
    bid_to_twitter_tbip[bid] = bid_to_tbip_tweets[bid] if has_tweets else np.nan

In [141]:
import os

In [145]:
vote_source_dir = 'tbip/data/congs_115-116_votes/'
vote_data_dir = os.path.join(vote_source_dir, "clean")
vote_param_dir = os.path.join(vote_source_dir, "fits/params")
# Vote ideal points are standardized over every entry of the saved array.
vote_ideal_points_1d = standardize(np.load(os.path.join(vote_param_dir,
                                                        "ideal_point_loc.npy")))
# One ID per line. Opened in a `with` block so the handle is closed (the bare
# open() call leaked it).
with open(os.path.join(vote_data_dir, 'rep_map.txt')) as rep_map_file:
    voting_reps_map = [line.rstrip() for line in rep_map_file]

# The position of a bioguide ID in rep_map.txt is used as its index into the
# ideal-point array; the sign is flipped.
bid_to_stan_vote_tbip = {}
for bid in final_bids_to_consider:
    bid_to_stan_vote_tbip[bid] = -1*vote_ideal_points_1d[voting_reps_map.index(bid)]

# Collect the raw values in final_bids_to_consider order explicitly, so pairing
# the standardized values back with bioguide IDs does not depend on the
# insertion order of the source dicts.
stan_speech_ideal_points = standardize(
    np.array([bid_to_speech_tbip[bid] for bid in final_bids_to_consider], dtype=float))
stan_tweet_ideal_points = standardize(
    np.array([bid_to_twitter_tbip[bid] for bid in final_bids_to_consider], dtype=float))

bid_to_stan_speech_tbip = dict(zip(final_bids_to_consider, stan_speech_ideal_points))
bid_to_stan_tweet_tbip = dict(zip(final_bids_to_consider, stan_tweet_ideal_points))

print(len(bid_to_stan_vote_tbip))
print(len(bid_to_stan_speech_tbip))
print(len(bid_to_stan_tweet_tbip))

503
503
503


In [149]:
# Output columns, in order, with the per-legislator lookup feeding each one.
final_df_columns = [
    ('Name', bid_to_name),
    ('Gender', bid_to_gender),
    ('Party', bid_to_party),
    ('Born', bid_to_birth_year),
    ('Number_of_House_Terms', bid_to_seniority),

    ('Present_Cong115', bid_to_cong_presence_115),
    ('Present_Cong116', bid_to_cong_presence_116),

    ('House_Election_Candidate_Vote_Share_2016', bid_to_house_elec_vote_share_2016),
    ('House_Election_Candidate_Vote_Share_2018', bid_to_house_elec_vote_share_2018),

    ('District', bid_to_state_district),
    ('District_Presidential_VoteShare_Dem2016', bid_to_district_pres_vs),
    ('District_Presidential_VoteShare_GOP2016', bid_to_district_gop_vs),
    ('District_Density', bid_to_district_density),
    ('District_Percent_Female', bid_to_district_female_perc),
    ('District_Percent_Male', bid_to_district_male_perc),
    ('District_Percent_Senior', bid_to_district_senior_perc),
    ('District_Median_Age', bid_to_district_median_age),
    ('District_Percent_White', bid_to_district_white_perc),
    ('District_Percent_Black', bid_to_district_black_perc),
    ('District_Percent_Asian', bid_to_district_asian_perc),
    ('District_Percent_Hispanic', bid_to_district_hispanic_perc),
    ('District_Percent_AmericanIndian', bid_to_district_amindian_perc),
    ('District_Mean_Income', bid_to_district_mean_income),
    ('District_Mean_Minus_National_Mean_Income', bid_to_district_diff_mean_income),
    ('District_Median_Income', bid_to_district_median_income),
    ('District_Median_Minus_National_Median_Income', bid_to_district_diff_median_income),
    ('District_Unemployment_Rate', bid_to_district_unemp_rate),

    ('DW-Nominate_1', bid_to_dwnom1),
    ('DW-Nominate_2', bid_to_dwnom2),
    ('TBIP_Floor_Speeches', bid_to_speech_tbip),
    ('TBIP_Tweets', bid_to_twitter_tbip),
    ('Standardized_Vote_Ideal_Point', bid_to_stan_vote_tbip),
    ('Standardized_Speech_Ideal_Point', bid_to_stan_speech_tbip),
    ('Standardized_Tweet_Ideal_Point', bid_to_stan_tweet_tbip),

    ('Progressive_Caucus_Cong115', bid_to_progressive_115),
    ('NewDems_Caucus_Cong115', bid_to_newdem_115),
    ('BlueDog_Caucus_Cong115', bid_to_bluedog_115),
    ('ProblemSolvers_Caucus_Cong115', bid_to_problemsolvers_115),
    ('RSC_Caucus_Cong115', bid_to_rsc_115),
    ('Freedom_Caucus_Cong115', bid_to_freedom_115),
    ('GOP_Leadership_Cong115', bid_to_gop_lead_115),
    ('DEM_Leadership_Cong115', bid_to_dem_lead_115),
    ('CommitteeChair_Cong115', bid_to_comm_chair_115),
    ('TopCommittee_Cong115', bid_to_top_comm_115),

    ('Progressive_Caucus_Cong116', bid_to_progressive_116),
    ('NewDems_Caucus_Cong116', bid_to_newdem_116),
    ('BlueDog_Caucus_Cong116', bid_to_bluedog_116),
    ('ProblemSolvers_Caucus_Cong116', bid_to_problemsolvers_116),
    ('RSC_Caucus_Cong116', bid_to_rsc_116),
    ('Freedom_Caucus_Cong116', bid_to_freedom_116),
    ('GOP_Leadership_Cong116', bid_to_gop_lead_116),
    ('DEM_Leadership_Cong116', bid_to_dem_lead_116),
    ('CommitteeChair_Cong116', bid_to_comm_chair_116),
    ('TopCommittee_Cong116', bid_to_top_comm_116),
]

# Row order is the key order of bid_to_name. Every other column is looked up BY
# bioguide ID instead of taking list(d.values()), so a lookup filled in a
# different order (e.g. after re-running cells out of sequence) cannot silently
# put values on the wrong legislator's row.
final_df_bids = list(bid_to_name.keys())
final_df = pd.DataFrame()
final_df['Bioguide_ID'] = final_df_bids
for column_name, lookup in final_df_columns:
    if len(lookup) != len(final_df_bids):
        raise ValueError(column_name + ' has ' + str(len(lookup)) + ' entries, expected '
                         + str(len(final_df_bids)))
    final_df[column_name] = [lookup[bid] for bid in final_df_bids]

In [150]:
# Write the assembled table to disk; index=False leaves out the row index.
output_csv_path = 'legislator_info_and_tbip_congresses_115_and_116.csv'
final_df.to_csv(output_csv_path, index=False)

In [151]:
# Print column names, non-null counts and dtypes of the assembled table.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503 entries, 0 to 502
Data columns (total 55 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Bioguide_ID                                   503 non-null    object 
 1   Name                                          503 non-null    object 
 2   Gender                                        503 non-null    object 
 3   Party                                         503 non-null    object 
 4   Born                                          503 non-null    int64  
 5   Number_of_House_Terms                         503 non-null    int64  
 6   Present_Cong115                               503 non-null    int64  
 7   Present_Cong116                               503 non-null    int64  
 8   House_Election_Candidate_Vote_Share_2016      418 non-null    float64
 9   House_Election_Candidate_Vote_Share_2018      420 non-null    flo