In [1]:
import pandas as pd

In [2]:
# Member-level ideology table, loaded from a local CSV export.
# Data source / field documentation: https://voteview.com/articles/data_help_members
df = pd.read_csv(filepath_or_buffer="member_ideology_house_all_years.csv")

In [28]:
# Build a lookup with one row per (party_code, party_name) pair found in the
# parties file; count_all_time is n_members summed over every row of that pair.
parties_df = (
    pd.read_csv("HSall_parties.csv")
    .groupby(["party_code", "party_name"])["n_members"]
    .sum()
    .reset_index()
    .rename(columns={"n_members": "count_all_time"})
)
parties_df.head(5)

Unnamed: 0,party_code,party_name,count_all_time
0,1,Federalist,847
1,13,Democrat-Republican,1976
2,22,Adams,268
3,26,Anti Masonic,77
4,29,Whig,1190


In [8]:
# Keep only the rows whose chamber is "House", then preview the first rows
# and list the available columns.
df = df.loc[df["chamber"].eq("House")]
display(df.head(), df.columns)


Unnamed: 0,congress,chamber,icpsr,state_icpsr,district_code,state_abbrev,party_code,occupancy,last_means,bioname,...,died,nominate_dim1,nominate_dim2,nominate_log_likelihood,nominate_geo_mean_probability,nominate_number_of_votes,nominate_number_of_errors,conditional,nokken_poole_dim1,nokken_poole_dim2
1,1,House,379,44,2.0,GA,4000,0.0,1.0,"BALDWIN, Abraham",...,1807.0,-0.165,-0.373,-28.55029,0.758,103.0,12.0,,-0.429,-0.817
2,1,House,4854,44,1.0,GA,4000,0.0,1.0,"JACKSON, James",...,1806.0,-0.32,-0.181,-24.89986,0.776,98.0,9.0,,-0.559,-0.052
3,1,House,6071,44,3.0,GA,4000,0.0,1.0,"MATHEWS, George",...,1812.0,-0.428,-0.317,-12.62728,0.88,99.0,2.0,,-0.413,-0.232
4,1,House,1538,52,6.0,MD,5000,0.0,1.0,"CARROLL, Daniel",...,1796.0,0.116,-0.74,-23.47008,0.783,96.0,11.0,,0.114,-0.779
5,1,House,2010,52,3.0,MD,4000,0.0,1.0,"CONTEE, Benjamin",...,1815.0,-0.08,-0.387,-21.88695,0.788,92.0,13.0,,-0.093,-0.411


Index(['congress', 'chamber', 'icpsr', 'state_icpsr', 'district_code',
       'state_abbrev', 'party_code', 'occupancy', 'last_means', 'bioname',
       'bioguide_id', 'born', 'died', 'nominate_dim1', 'nominate_dim2',
       'nominate_log_likelihood', 'nominate_geo_mean_probability',
       'nominate_number_of_votes', 'nominate_number_of_errors', 'conditional',
       'nokken_poole_dim1', 'nokken_poole_dim2'],
      dtype='object')

### Bio Fields:
- **icpsr**: Integer 1-99999. This is an ID code which identifies the member in question. In general, each member receives a single ICPSR identifier applicable to their entire career. A small number of members have received more than one; this can occur for members who have switched parties, as well as for members who subsequently become president. Creating a new identifier allows a new NOMINATE estimate to be produced for separate appearances of a member in different roles.
- **state_icpsr**: Integer 0-99. Identifier for the state represented by the member.
- **district_code**: Integer 0-99. Identifier for the district that the member represents within their state (e.g. 3 for the Alabama 3rd Congressional District). Senate members are given district_code 0. Members who represent historical "at-large" districts are assigned 99, 98, or 1 in various circumstances.
- **state_abbrev**: String. Two-character postal abbreviation for state (e.g. MO for Missouri).
- **party_code**: Integer 1-9999. Identifying code for the member's party. Please see [documentation](https://voteview.com/articles/data_help_parties) for Party Data for more information about which party_code identifiers refer to which parties.
- **occupancy**: Integer 0+. ICPSR occupancy code. This item is considered legacy or incomplete information and has not been verified. In general, members receive 0 if they are the only occupant, 1 if they are the first occupant, 2 if they are the second occupant, etc.
- **last_means**: Integer 1-5. ICPSR Attain-Office Code. This is an indicator that reflects the member's last means of attaining office. This item is considered legacy or incomplete information and has not been verified. Members received 1 if they were elected in a general election, 2 if elected by special election, 3 if directly elected by a state legislature, and 5 if appointed.
- **bioname**: String. Name of the member, surname first. For most members, agrees with the Biographical Directory of Congress.
- **bioguide_id**: String. Member identifier in the Biographical Directory of Congress.
- **born**: Integer. Year of member's birth.
- **died**: Integer. Year of member's death.
### Ideological Fields:

We present two main estimates of a legislator's ideology: NOMINATE and Nokken-Poole. NOMINATE estimates assume that members occupy a static ideological position across the course of their career. Nokken-Poole estimates assume that each congress is completely separate for the purposes of estimating a member's ideology. We expect that most users of our data will primarily make use of the nominate_dim1 field, which reports the first dimension (often interpreted as economic liberalism-conservatism) of members as estimated by NOMINATE.

For more information on these scores, please see discussion in our journal article:

Boche, Adam, Jeffrey B. Lewis, Aaron Rudkin, and Luke Sonnet. "The new Voteview.com: preserving and continuing Keith Poole’s infrastructure for scholars, students and observers of Congress". Public Choice 176(1-2). Available online: [https://link.springer.com/article/10.1007/s11127-018-0546-0](https://link.springer.com/article/10.1007/s11127-018-0546-0)


- **nominate_dim1**: NOMINATE first dimension estimate.
- **nominate_dim2**: NOMINATE second dimension estimate.
- **nominate_log_likelihood**: Log-likelihood of the NOMINATE estimate.
- **nominate_geo_mean_probability**: Geometric mean probability of NOMINATE estimate.
- **nominate_number_of_votes**: Number of votes cast by the member during a given congress.
- **conditional**: Integer 0-1. A 1 indicates NOMINATE was estimated conditionally for a given member. 0 otherwise. Conditional estimation implies that an estimate is provisional and subject to updates when the next full estimation of NOMINATE scores occurs.
- **nokken_poole_dim1**: Nokken-Poole First dimension estimate.
- **nokken_poole_dim2**: Nokken-Poole Second dimension estimate.


In [22]:
# Inspect the Biographical Directory identifier column on its own.
df.loc[:, "bioguide_id"]

1        B000084
2        J000017
3        M000234
4        C000187
5        C000710
          ...   
40589    S001213
40590    T000165
40591    F000471
40592    V000135
40593    H001096
Name: bioguide_id, Length: 40467, dtype: object

#### We can drop some of these:
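- The legacy fields (occupancy, last_means) are described in the documentation as legacy/incomplete and unverified, so we do not rely on them.
- bioguide_id is only an external identifier for the Biographical Directory of Congress; we do not need it for this analysis.
- We can replace died with "age", which we will compute soon.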

In [10]:
# Column-level overview used to decide what to drop or clean:
#   1. number of missing values in each column,
#   2. the dtype pandas inferred for each column.
# The cell's recorded output shows both tables, so both displays are kept here
# to make a fresh "Restart & Run All" reproduce it.
display(df.isna().sum())
display(df.dtypes)

congress                             0
chamber                              0
icpsr                                0
state_icpsr                          0
district_code                        0
state_abbrev                         0
party_code                           0
occupancy                         1815
last_means                        1815
bioname                              0
bioguide_id                          3
born                               237
died                              8069
nominate_dim1                      153
nominate_dim2                      153
nominate_log_likelihood            763
nominate_geo_mean_probability      763
nominate_number_of_votes           763
nominate_number_of_errors          763
conditional                      40467
nokken_poole_dim1                  250
nokken_poole_dim2                  250
dtype: int64

congress                           int64
chamber                           object
icpsr                              int64
state_icpsr                        int64
district_code                    float64
state_abbrev                      object
party_code                         int64
occupancy                        float64
last_means                       float64
bioname                           object
bioguide_id                       object
born                             float64
died                             float64
nominate_dim1                    float64
nominate_dim2                    float64
nominate_log_likelihood          float64
nominate_geo_mean_probability    float64
nominate_number_of_votes         float64
nominate_number_of_errors        float64
conditional                      float64
nokken_poole_dim1                float64
nokken_poole_dim2                float64
dtype: object

In [29]:
# Remove the columns we decided not to use. DataFrame.drop returns a new frame
# and leaves the original untouched, so the result must be assigned back for
# the columns to actually be gone in later cells.
# errors="ignore" keeps the cell safe to re-run after the columns are removed.
df = df.drop(columns=["occupancy", "last_means", "bioguide_id"], errors="ignore")
df

Unnamed: 0,congress,chamber,icpsr,state_icpsr,district_code,state_abbrev,party_code,bioname,born,died,nominate_dim1,nominate_dim2,nominate_log_likelihood,nominate_geo_mean_probability,nominate_number_of_votes,nominate_number_of_errors,conditional,nokken_poole_dim1,nokken_poole_dim2
1,1,House,379,44,2.0,GA,4000,"BALDWIN, Abraham",1754.0,1807.0,-0.165,-0.373,-28.55029,0.75800,103.0,12.0,,-0.429,-0.817
2,1,House,4854,44,1.0,GA,4000,"JACKSON, James",1757.0,1806.0,-0.320,-0.181,-24.89986,0.77600,98.0,9.0,,-0.559,-0.052
3,1,House,6071,44,3.0,GA,4000,"MATHEWS, George",1739.0,1812.0,-0.428,-0.317,-12.62728,0.88000,99.0,2.0,,-0.413,-0.232
4,1,House,1538,52,6.0,MD,5000,"CARROLL, Daniel",1730.0,1796.0,0.116,-0.740,-23.47008,0.78300,96.0,11.0,,0.114,-0.779
5,1,House,2010,52,3.0,MD,4000,"CONTEE, Benjamin",1755.0,1815.0,-0.080,-0.387,-21.88695,0.78800,92.0,13.0,,-0.093,-0.411
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40589,118,House,21970,25,1.0,WI,200,"STEIL, Bryan",1981.0,,0.410,0.085,-48.27340,0.90814,501.0,17.0,,0.446,0.290
40590,118,House,21989,25,7.0,WI,200,"TIFFANY, Thomas P.",1957.0,,0.641,-0.222,-44.02421,0.91539,498.0,14.0,,0.811,-0.206
40591,118,House,22115,25,5.0,WI,200,"FITZGERALD, Scott",1963.0,,0.607,0.164,-49.27568,0.90597,499.0,20.0,,0.581,0.033
40592,118,House,22370,25,3.0,WI,200,"VAN ORDEN, Derrick",1969.0,,0.381,0.017,-86.36110,0.83901,492.0,37.0,,0.390,0.013
