In [3]:
import pandas as pd

## Build Datasets:

In [4]:
# Get VoteView data:
voteview_df = pd.read_csv("member_ideology_house_all_years.csv")
voteview_df = voteview_df[voteview_df["chamber"] == "House"].drop(["occupancy", "last_means", "bioguide_id"], axis=1)
voteview_df.head(5)

Unnamed: 0,congress,chamber,icpsr,state_icpsr,district_code,state_abbrev,party_code,bioname,born,died,nominate_dim1,nominate_dim2,nominate_log_likelihood,nominate_geo_mean_probability,nominate_number_of_votes,nominate_number_of_errors,conditional,nokken_poole_dim1,nokken_poole_dim2
1,1,House,379,44,2.0,GA,4000,"BALDWIN, Abraham",1754.0,1807.0,-0.165,-0.373,-28.55029,0.758,103.0,12.0,,-0.429,-0.817
2,1,House,4854,44,1.0,GA,4000,"JACKSON, James",1757.0,1806.0,-0.32,-0.181,-24.89986,0.776,98.0,9.0,,-0.559,-0.052
3,1,House,6071,44,3.0,GA,4000,"MATHEWS, George",1739.0,1812.0,-0.428,-0.317,-12.62728,0.88,99.0,2.0,,-0.413,-0.232
4,1,House,1538,52,6.0,MD,5000,"CARROLL, Daniel",1730.0,1796.0,0.116,-0.74,-23.47008,0.783,96.0,11.0,,0.114,-0.779
5,1,House,2010,52,3.0,MD,4000,"CONTEE, Benjamin",1755.0,1815.0,-0.08,-0.387,-21.88695,0.788,92.0,13.0,,-0.093,-0.411


In [5]:
age_mask = voteview_df["born"] > 1990
voteview_df[age_mask]

Unnamed: 0,congress,chamber,icpsr,state_icpsr,district_code,state_abbrev,party_code,bioname,born,died,nominate_dim1,nominate_dim2,nominate_log_likelihood,nominate_geo_mean_probability,nominate_number_of_votes,nominate_number_of_errors,conditional,nokken_poole_dim1,nokken_poole_dim2
40005,117,House,22109,47,11.0,NC,200,"CAWTHORN, Madison",1995.0,,0.555,-0.243,-172.26812,0.8082,809.0,84.0,,0.554,-0.246
40265,118,House,22321,43,10.0,FL,100,"FROST, Maxwell",1997.0,,-0.497,-0.75,-16.07722,0.96842,501.0,4.0,,-0.497,-0.752


In [6]:
# Get IRS data:
saipe_df = pd.read_excel("irs.xls",skiprows=2)
saipe_df.head(5)

Unnamed: 0,State FIPS code,Name,Year,Total exemptions,Poor exemptions,Age 65 and over exemptions,Age 65 and over poor exemptions,Child exemptions,Poor child exemptions,Total exemptions under age 65,Poor exemptions under age 65,Median AGI,Mean AGI
0,1,Alabama,1989,3367441,634363,213228,30357,1145821,260472,3154213,604006,18271,26487
1,1,Alabama,1990,3427361,676616,223095,34114,1175534,281392,3204266,642502,18719,27300
2,1,Alabama,1991,3471467,720465,229666,36826,1193227,304653,3241801,683639,18929,27941
3,1,Alabama,1992,3494455,711496,227683,35158,1210348,305209,3266772,676338,19508,29397
4,1,Alabama,1993,3518338,728239,227765,35485,1220397,314438,3290573,692754,19775,29846


In [7]:
# get a unique mapping of all parties the US has ever had registered:
parties_df = pd.read_csv("HSall_parties.csv")
parties_df = parties_df.groupby(['party_code','party_name'])["n_members"].sum().reset_index().rename(columns={'n_members':'count_all_time'})

parties = {party_code:parties_df[parties_df["party_code"] == party_code]["party_name"].item() for party_code in parties_df["party_code"].unique()}

parties

{1: 'Federalist',
 13: 'Democrat-Republican',
 22: 'Adams',
 26: 'Anti Masonic',
 29: 'Whig',
 37: 'Constitutional Unionist',
 44: 'Nullifier',
 46: 'States Rights',
 100: 'Democrat',
 108: 'Anti-Lecompton Democrat',
 112: 'Conservative',
 114: 'Readjuster',
 117: 'Readjuster Democrat',
 200: 'Republican',
 203: 'Unconditional Unionist',
 206: 'Unionist',
 208: 'Liberal Republican',
 213: 'Progressive Republican',
 300: 'Free Soil',
 310: 'American',
 326: 'National Greenbacker',
 328: 'Independent',
 329: 'Ind. Democrat',
 331: 'Ind. Republican',
 340: 'Populist',
 347: 'Prohibitionist',
 354: 'Silver Republican',
 355: 'Union',
 356: 'Union Labor',
 370: 'Progressive',
 380: 'Socialist',
 402: 'Liberal',
 403: 'Law and Order',
 522: 'American Labor',
 523: 'American Labor',
 537: 'Farmer-Labor',
 555: 'Jackson',
 603: 'Ind. Whig',
 1060: 'Silver',
 1111: 'Liberty',
 1116: 'Conservative Republican',
 1275: 'Anti-Jackson',
 1346: 'Jackson Republican',
 3333: 'Opposition',
 3334: 'Oppos

## Merge Datasets:

We can merge the datasets on the session of congress, which is a two year period starting every odd year on January 4th. We'll take each pair of years from IRS and get the average of their values, and use the resulting row in our merged table:

In [8]:
# Get session of congress from year:
congress = 101
saipe_df["congress"] = saipe_df["Year"].apply(lambda x: congress+((x-1989)//2))
saipe_df[["Year", "congress"]].head(5)

Unnamed: 0,Year,congress
0,1989,101
1,1990,101
2,1991,102
3,1992,102
4,1993,103


In [9]:
# Average values by congressional session and combine rows (using the average)
saipe_df_clean = pd.DataFrame()
for congress in saipe_df["congress"].unique():
    df_congress = saipe_df[saipe_df["congress"] == congress].groupby("Name").mean()
    df_congress.reset_index(inplace=True)
    df_congress["year_range"] = df_congress["Year"].apply(lambda x: f"{int(x-0.5)}-{int(x+1.5)}") # Fix year formatting
    df_congress["state_FIPS"] = df_congress["State FIPS code"].astype(int) # cast to int instead of float
    df_congress["congress"] = df_congress["congress"].astype(int)
    df_congress.drop(["Year","State FIPS code"],axis=1,inplace=True) # Remove outdated year and FIPS code columns

    new_columns = {old_name:old_name.lower() for old_name in df_congress.columns} # lowercase columns
    df_congress = df_congress.rename(columns=new_columns)
    df_congress = df_congress[["congress", "year_range", "name", "state_fips", 'total exemptions', 'poor exemptions',
       'age 65 and over exemptions', 'age 65 and over poor exemptions',
       'child exemptions', 'poor child exemptions',
       'total exemptions under age 65', 'poor exemptions under age 65',
       'median agi', 'mean agi']] # reorganize dataframe
    
    saipe_df_clean = pd.concat([saipe_df_clean,df_congress],ignore_index=True)

saipe_df_clean.head(20)
    

Unnamed: 0,congress,year_range,name,state_fips,total exemptions,poor exemptions,age 65 and over exemptions,age 65 and over poor exemptions,child exemptions,poor child exemptions,total exemptions under age 65,poor exemptions under age 65,median agi,mean agi
0,101,1989-1991,Alabama,1,3397401.0,655489.5,218161.5,32235.5,1160677.5,270932.0,3179239.5,623254.0,18495.0,26893.5
1,101,1989-1991,Alaska,2,465887.0,45027.5,15669.0,1372.0,158544.0,15505.0,450218.0,43655.5,28586.0,37396.0
2,101,1989-1991,Arizona,4,3056550.0,556156.5,285017.5,30618.5,1001303.5,231076.0,2771532.5,525538.0,19989.0,28816.5
3,101,1989-1991,Arkansas,5,1965089.0,420454.0,157183.5,25159.5,650393.0,167440.0,1807905.5,395294.5,17121.0,24032.0
4,101,1989-1991,California,6,24348109.5,3807503.0,1894756.0,173023.5,7753444.0,1532522.5,22453353.5,3634479.5,23399.0,35051.0
5,101,1989-1991,Colorado,8,2844090.5,375506.0,203112.0,24746.0,879856.0,126733.0,2640978.5,350760.0,22453.5,31183.0
6,101,1989-1991,Connecticut,9,2818204.5,197648.0,278651.5,23635.0,783455.0,51121.5,2539553.0,174013.0,28962.0,41897.5
7,101,1989-1991,Delaware,10,590881.5,60925.5,49779.0,4447.0,180326.0,22523.0,541102.5,56478.5,24484.0,34379.0
8,101,1989-1991,District of Columbia,11,476815.0,68913.5,42786.0,3321.0,142782.5,27138.5,434029.0,65592.5,21078.5,33009.5
9,101,1989-1991,Florida,12,10759982.5,1761068.0,1394630.5,137534.5,3003811.5,626380.5,9365352.0,1623533.5,19084.0,30008.5


### Perform left merge on IRS data (1989-2020) with vote_view data (1789-2023):

In [10]:
columns = [
    # Rep info
    'congress', 'bioname', 'party_code', 'party_name', 'born', 'died',
    'year_range', 'name', 'state_abbrev', 'icpsr', 'state_icpsr', 
    'district_code', 'state_fips',
    
    # Rep ideology
    'nominate_dim1', 'nominate_dim2', 'nominate_log_likelihood',
    'nominate_geo_mean_probability', 'nominate_number_of_votes',
    'nominate_number_of_errors', 'conditional', 'nokken_poole_dim1',
    'nokken_poole_dim2', 
    
    # State demographics
    'total exemptions', 'poor exemptions', 'age 65 and over exemptions',
    'age 65 and over poor exemptions', 'child exemptions',
    'poor child exemptions', 'total exemptions under age 65',
    'poor exemptions under age 65', 'median agi', 'mean agi',
]


df = saipe_df_clean.join(
    voteview_df.set_index("congress"),
    how="left",
    on="congress",
).reset_index().drop(["chamber","index"], axis=1)

df["party_name"] = df["party_code"].apply(lambda x: parties[x])

df=df[columns]

df

Unnamed: 0,congress,bioname,party_code,party_name,born,died,year_range,name,state_abbrev,icpsr,...,total exemptions,poor exemptions,age 65 and over exemptions,age 65 and over poor exemptions,child exemptions,poor child exemptions,total exemptions under age 65,poor exemptions under age 65,median agi,mean agi
0,101,"DICKINSON, William Louis",200,Republican,1925.0,2008.0,1989-1991,Alabama,AL,10717,...,3397401.0,655489.5,218161.5,32235.5,1160677.5,270932.0,3179239.5,623254.0,18495.0,26893.5
1,101,"BEVILL, Tom",100,Democrat,1921.0,2005.0,1989-1991,Alabama,AL,11000,...,3397401.0,655489.5,218161.5,32235.5,1160677.5,270932.0,3179239.5,623254.0,18495.0,26893.5
2,101,"NICHOLS, William Flynt",100,Democrat,1918.0,1988.0,1989-1991,Alabama,AL,11037,...,3397401.0,655489.5,218161.5,32235.5,1160677.5,270932.0,3179239.5,623254.0,18495.0,26893.5
3,101,"FLIPPO, Ronnie Gene",100,Democrat,1937.0,,1989-1991,Alabama,AL,14419,...,3397401.0,655489.5,218161.5,32235.5,1160677.5,270932.0,3179239.5,623254.0,18495.0,26893.5
4,101,"ERDREICH, Ben",100,Democrat,1938.0,,1989-1991,Alabama,AL,15022,...,3397401.0,655489.5,218161.5,32235.5,1160677.5,270932.0,3179239.5,623254.0,18495.0,26893.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
363319,116,"GALLAGHER, Michael",200,Republican,1984.0,,2019-2021,Wyoming,WI,21720,...,504999.5,75663.0,76729.0,12889.0,138266.5,20716.5,428270.5,62774.0,47628.5,88636.5
363320,116,"STEIL, Bryan",200,Republican,1981.0,,2019-2021,Wyoming,WI,21970,...,504999.5,75663.0,76729.0,12889.0,138266.5,20716.5,428270.5,62774.0,47628.5,88636.5
363321,116,"TIFFANY, Thomas P.",200,Republican,1957.0,,2019-2021,Wyoming,WI,21989,...,504999.5,75663.0,76729.0,12889.0,138266.5,20716.5,428270.5,62774.0,47628.5,88636.5
363322,116,"KIND, Ron",100,Democrat,1963.0,,2019-2021,Wyoming,WI,29769,...,504999.5,75663.0,76729.0,12889.0,138266.5,20716.5,428270.5,62774.0,47628.5,88636.5
