### Bring in other data

In [1]:
import pickle
pkl_file = open('speeches_bills_members.pkl', 'rb')
df_all = pickle.load(pkl_file)

In [83]:
print("Number of bills: ", len(df_all['bill_slug'].unique()))
print("Number of votes: ", len(df_all['votes_api_url'].unique()))
print("Proportion", round(df_all['vote_with_party'].value_counts()[0]/df_all['vote_with_party'].value_counts()[1] * 100, 2), "%" )

Number of bills:  624
Number of votes:  2355
Proportion 9.92 %


In [2]:
df_all['dw_nominate_members'] = df_all['dw_nominate_members'].map(abs)
df_all['dw_nominate_voters'] = df_all['dw_nominate_voters'].map(abs)
df_all['members_seniority'] = df_all['members_seniority'].map(int)
df_all['members_seniority_sponsor'] = df_all['members_seniority_sponsor'].map(int)

In [99]:
df_all[['congress', 'members_congress', 'members_congress_sponsor']] = df_all[['congress', 'members_congress', 'members_congress_sponsor']].astype(object)

In [106]:
df_all[['members_missed_votes_pct', 'missed_votes_pct','members_seniority', 'members_seniority_sponsor', 'members_votes_with_party_pct','votes_with_party_pct','members_votes_with_party_pct_sponsor']] = (df_all[['members_missed_votes_pct', 'missed_votes_pct','members_seniority', 'members_seniority_sponsor', 
 'members_votes_with_party_pct','votes_with_party_pct','members_votes_with_party_pct_sponsor']].astype(float))

In [107]:
independent_vars = (['mentions_non_partisan_agency_no_median',
                 'uses_statistics_median',
                 'crs_sim_avg_median'])

predictor = ['vote_with_party']

moderators = (['cosponsored_by_mt1_party',
             'cosponsored_ratio_mr40pc',
             'primary_suject_no_months_mentioned',
             'subject_nicheness', 'dw_nominate_members',           
         'days_ago', 'dw_nominate_voters',
         'members_yrs_until_nxt_election',
         'members_age',
         'swing_state',
         'swing_state2',
         'member_diff_party_state',
              'votes_time_of_day',
        'members_gender',
     'members_leadership_role',
     'members_missed_votes_pct',
     'members_party',
     'members_senate_class',
     'members_seniority',
     'members_state_rank',
     'members_votes_with_party_pct',
     'members_senate_class_sponsor',
     'members_seniority_sponsor',
     'members_state_rank_sponsor',
     'members_leadership_role_sponsor',
     'members_votes_with_party_pct_sponsor',
     'members_party_sponsor',
     'members_gender_sponsor',
      'sponsor_party',
    'votes_chamber'])

control_vars = (['democratic_majority_position',
             'democratic_no',
             'democratic_not_voting',
             'democratic_present',
             'democratic_yes',
             'independent_no',
             'independent_not_voting',
             'independent_present',
             'independent_yes',
            'republican_majority_position',
             'republican_no',
             'republican_not_voting',
             'republican_present',
             'republican_yes',
            'votes_total_no',
             'votes_total_not_voting',
             'votes_total_yes',
               ])

In [108]:
df_modelling = df_all[[*independent_vars, *predictor, *moderators, *control_vars]]

In [110]:
import pandas as pd
# Making dummies
object_cols = list(df_modelling.select_dtypes(include=[object]).columns)
object_cols.append('votes_time_of_day')
print(object_cols)


df_modelling = (pd.get_dummies(df_modelling, prefix=object_cols, 
        columns=object_cols, dtype = int))

['members_gender', 'members_leadership_role', 'members_party', 'members_senate_class', 'members_state_rank', 'members_senate_class_sponsor', 'members_state_rank_sponsor', 'members_leadership_role_sponsor', 'members_party_sponsor', 'members_gender_sponsor', 'sponsor_party', 'votes_chamber', 'democratic_majority_position', 'republican_majority_position', 'votes_time_of_day']


In [112]:
moderators_dum = []
for moderator in moderators:
    for col in df_modelling:
        if moderator in col:
            moderators_dum.append(col)

In [113]:
moderators_dum.remove('members_gender_M')
moderators_dum.remove('members_gender_sponsor_M')

## Modelling

### Predictors

- Similarity with CRS reports
- Number of mentions non-partisan research
- Number of statistics

### What influences vote?

Data structured as vote by each member being the object of interest.
 
- Bill: co-sponsorship, subject nicheness
- Voter: swing-state, years until next election, DW-nominate scale, senator / representative, gender, age
- Vote: how long ago


In [114]:
# Correlations between predictors
from scipy.stats.stats import pearsonr

independent_vars_mean = (['crs_sim_avg_median', 'mentions_non_partisan_agency_no_median', 
              'uses_statistics_median'])
correlations_dict = {}
for var in independent_vars_mean:
    for variable in independent_vars_mean:
        corr = pearsonr(df_modelling[var], df_modelling[variable])
        if var != variable:
            correlations_dict[(var, variable)] = corr

In [115]:
correlations_dict

{('crs_sim_avg_median',
  'mentions_non_partisan_agency_no_median'): (0.11544000841379777, 0.0),
 ('crs_sim_avg_median', 'uses_statistics_median'): (0.08573367839487948, 0.0),
 ('mentions_non_partisan_agency_no_median',
  'crs_sim_avg_median'): (0.11544000841379777, 0.0),
 ('mentions_non_partisan_agency_no_median',
  'uses_statistics_median'): (0.07078307872149951, 0.0),
 ('uses_statistics_median', 'crs_sim_avg_median'): (0.08573367839487948, 0.0),
 ('uses_statistics_median',
  'mentions_non_partisan_agency_no_median'): (0.07078307872149951, 0.0)}

In [116]:
df_modelling = df_modelling.loc[:,~df_modelling.columns.duplicated()]

In [117]:
from pandas.api.types import is_numeric_dtype

for col in df_modelling.columns:
    if is_numeric_dtype(df_modelling[col]):
        df_modelling[col] = df_modelling[col].fillna(df_modelling[col].mean(),axis=0)
    elif is_string_dtype(df_modelling[col]):
        df_modelling[col] = df_modelling[col].fillna('',axis=0)
    else:
        df_modelling[col] = df_modelling[col].astype(str)
        df_modelling[col] = df_modelling[col].fillna('',axis=0)

In [118]:
# Generate interaction variables
from sklearn.preprocessing import PolynomialFeatures
df_interactions = pd.DataFrame()
for ind_var in independent_vars:
    for var in moderators_dum:
        try:
            x_t = PolynomialFeatures(2, interaction_only=True, include_bias=False).fit(df_modelling[[ind_var, var]])
            features = pd.DataFrame(x_t.transform(df_modelling[[ind_var, var]]), columns=x_t.get_feature_names([ind_var, var]))
            if df_interactions.empty is True:
                df_interactions = features.iloc[:,-1]
            else:
                df_interactions = pd.concat([df_interactions, features.iloc[:,-1]], axis = 1)
        except(ValueError):
            print(ind_var, var)
            

In [119]:
df_modelling_interactions = pd.concat([df_modelling,df_interactions], axis = 1)

In [120]:
# Test train split
import pandas as pd
from sklearn.model_selection import train_test_split

y = df_modelling_interactions['vote_with_party']
X = df_modelling_interactions.drop(columns = 'vote_with_party')

# Test, train split
X_tr, X_holdout, y_tr, y_holdout = train_test_split(X, y, test_size=0.3,random_state=42)

In [121]:
print(df_all.shape)
print(df_modelling_interactions.shape)
print(X.shape)
print(y.shape)

(781506, 773)
(781506, 347)
(781506, 346)
(781506,)


In [122]:
# Pickle data

import pickle
output = open('X_tr.pkl', 'wb')
pickle.dump(X_tr, output)

output.close()

output = open('y_tr.pkl', 'wb')
pickle.dump(y_tr, output)

output.close()

import pickle
output = open('X_holdout.pkl', 'wb')
pickle.dump(X_holdout, output)

output.close()

output = open('y_holdout.pkl', 'wb')
pickle.dump(y_holdout, output)

output.close()

In [123]:
# Load pickle
import pickle
pkl_file = open('X_tr.pkl', 'rb')
X_tr = pickle.load(pkl_file)

pkl_file = open('y_tr.pkl', 'rb')
y_tr = pickle.load(pkl_file)

pkl_file = open('X_holdout.pkl', 'rb')
X_holdout = pickle.load(pkl_file)

pkl_file = open('y_holdout.pkl', 'rb')
y_holdout = pickle.load(pkl_file)

In [124]:
from sklearn.preprocessing import StandardScaler

# Scale for Logistic regression
scaler = StandardScaler()

In [128]:
# Create pipelines and parameter grids for each algorithm

from imblearn.pipeline import make_pipeline, Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

from tempfile import mkdtemp
import numpy as np
# Pipelines
cachedir = mkdtemp() # cache in memory

lr = LogisticRegression(fit_intercept = True)
pipeline_lr = (Pipeline([
                    ('scale',scaler),
                    ('logistic_regression', lr)]))

parameters_lr = [{'logistic_regression__C': [0.0001, 0.001, 0.1]}]

In [129]:
# Grid search through the parameters to find the best model
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
import numpy as np

algorithm_names = ['logistic_regression']
parameter_list = [parameters_lr]
pipeline_list = [pipeline_lr]

model_results = []
for i in range(0,len(pipeline_list)):
    print('Current algorithm: ', algorithm_names[i])
    gscv = GridSearchCV(pipeline_list[i], parameter_list[i], cv=3, scoring = 'f1', verbose = 5, refit = True)
    gscv.fit(X_tr, y_tr.ravel()) 
    print("Best parameters: ", gscv.best_params_)
    print("Best estimator", gscv.best_estimator_)
    mean = gscv.cv_results_['mean_test_score'][gscv.best_index_]
    std = gscv.cv_results_['std_test_score'][gscv.best_index_]
    try:
        intercept = gscv.best_estimator_.named_steps[algorithm_names[i]].intercept_
    except(AttributeError):
        intercept = np.nan
    try:
        coefficients = gscv.best_estimator_.named_steps[algorithm_names[i]].coef_
    except(AttributeError): 
        coefficients = np.zeros(len(X_tr.columns), dtype=float)
        coefficients.fill(np.nan)
    print("Mean f1 of best estimator:", mean)
    print("Std f1 of best estimator:",  std)
    # Creating a list of model dictionaries
    model_results.append({'algorithm': algorithm_names[i],
                        'model': gscv.best_estimator_, 
                        'coefficients': dict(zip(X_tr.columns, coefficients)),
                        'intercept': intercept,
                        'mean': mean,
                        'std': std,
                        'LB_r2': (mean - 2*std),
                        'UB_r2': (mean + 2*std)})


Current algorithm:  logistic_regression
Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] logistic_regression__C=0.0001 ...................................
[CV]  logistic_regression__C=0.0001, score=0.9540437510657421, total=  49.7s
[CV] logistic_regression__C=0.0001 ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   59.0s remaining:    0.0s


[CV]  logistic_regression__C=0.0001, score=0.9542348048975724, total=  50.5s
[CV] logistic_regression__C=0.0001 ...................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.0min remaining:    0.0s


[CV]  logistic_regression__C=0.0001, score=0.9544929277988352, total=  50.1s
[CV] logistic_regression__C=0.001 ....................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.0min remaining:    0.0s


[CV]  logistic_regression__C=0.001, score=0.9538609179255045, total= 1.2min
[CV] logistic_regression__C=0.001 ....................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  4.3min remaining:    0.0s


[CV]  logistic_regression__C=0.001, score=0.9540733462416553, total= 1.2min
[CV] logistic_regression__C=0.001 ....................................
[CV]  logistic_regression__C=0.001, score=0.9542477617890666, total= 1.2min
[CV] logistic_regression__C=0.1 ......................................
[CV]  logistic_regression__C=0.1, score=0.9538272954087488, total= 4.3min
[CV] logistic_regression__C=0.1 ......................................
[CV]  logistic_regression__C=0.1, score=0.9540746135793476, total= 4.5min
[CV] logistic_regression__C=0.1 ......................................
[CV]  logistic_regression__C=0.1, score=0.9542058396944358, total= 4.5min


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 20.7min finished


Best parameters:  {'logistic_regression__C': 0.0001}
Best estimator Pipeline(memory=None,
     steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logistic_regression', LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])
Mean f1 of best estimator: 0.9542571603921001
Std f1 of best estimator: 0.00018405555382526332


In [130]:
# Calculate p-values for coefficients
import numpy as np
import pandas as pd
from scipy import stats

params = np.append(gscv.best_estimator_.named_steps['logistic_regression'].intercept_, gscv.best_estimator_.named_steps['logistic_regression'].coef_[0])

lr = gscv.best_estimator_.named_steps['logistic_regression']
predictions = lr.predict(X_holdout)

In [131]:
X_holdout.reset_index(inplace = True, drop = True)
newX = pd.DataFrame({"Constant":np.ones(len(X_holdout))}).join(pd.DataFrame(X_holdout))
MSE = (sum((y_holdout-predictions)**2))/(len(newX)-len(newX.columns))

In [132]:
dot = np.dot(newX.T,newX)

In [133]:
dot2 = np.array(dot, dtype = float)

In [134]:
inv = np.linalg.pinv(dot2)

In [135]:
var_b = float(MSE)*inv.diagonal()
sd_b = np.sqrt(var_b)
ts_b = params / sd_b

  
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until


In [136]:
p_values =[2*(1-stats.t.cdf(np.abs(i),(len(newX)-1))) for i in ts_b]

In [137]:
sd_b = np.round(sd_b,3)
ts_b = np.round(ts_b,3)
p_values = np.round(p_values,3)
params = np.round(params,4)

In [138]:
features = list(X_tr.columns)
features.insert(0,'intercept')

In [139]:
myDF3 = pd.DataFrame()
myDF3['Features'],myDF3["Coefficients"],myDF3["Standard Errors"],myDF3["t values"],myDF3["Probabilites"] = [features, params,sd_b,ts_b,p_values]

In [140]:
pd.set_option("display.max_colwidth",-1)

In [141]:
myDF3.to_csv('coefficient_table_mean_ind_best.csv')

In [142]:
#myDF3.loc[(myDF3['Probabilites'] <= 0.05) & (myDF3['Features'].isin(independent_vars))]
#myDF3.loc[(myDF3['Features'].isin(independent_vars))]
#myDF3.loc[(myDF3['Probabilites'] <= 0.05) & (myDF3['Features'].isin(moderators))]
myDF3.loc[(myDF3['Probabilites'] <= 0.05) & (myDF3['Features'].isin(df_interactions.columns))].sort_values(by = ['Coefficients'])

Unnamed: 0,Features,Coefficients,Standard Errors,t values,Probabilites
298,crs_sim_avg_median members_missed_votes_pct,-0.055,0.004,-13.197,0.0
313,crs_sim_avg_median members_seniority_sponsor,-0.0137,0.001,-10.511,0.0
326,crs_sim_avg_median members_seniority_sponsor,-0.0137,0.001,-10.511,0.0
227,uses_statistics_median members_seniority,-0.0091,0.001,-15.804,0.0
182,uses_statistics_median days_ago,-0.0088,0.0,-92.712,0.0
128,mentions_non_partisan_agency_no_median members_missed_votes_pct,-0.0074,0.001,-6.484,0.0
312,crs_sim_avg_median members_seniority,-0.0062,0.003,-2.252,0.024
213,uses_statistics_median members_missed_votes_pct,-0.0056,0.001,-5.881,0.0
270,crs_sim_avg_median members_age,-0.0049,0.002,-2.149,0.032
100,mentions_non_partisan_agency_no_median members_age,-0.0031,0.001,-5.635,0.0


In [143]:
def two_thirds_same_sign(lst):
    part_len_list = 2/3 * len(lst)
    no_pos = float(sum(i >= 0 for i in lst))
    if (no_pos >= part_len_list):
        return 'positive coefficients'
    elif ((len(lst) - no_pos) >= part_len_list):
        return 'negative coefficients'
    else:
        return 'no clear relationship'

In [144]:
# Create lists of moderators that are significant in 2/3 of their interactions and of those significant interactions
# 2/3 of the coefficients have the same sign
sig_moderators_pos, sig_moderators_neg  = [], []
for mod in moderators:
    features_int = [i for i, x in enumerate(myDF3['Features']) if mod in x]
    p_values = {}
    coeffs = []
    for idx in features_int[1:]:
        p_values[idx] = myDF3.loc[idx, 'Probabilites']
    p_values_sig = [idx for idx, p in p_values.items() if p <= 0.05]
    for idx in p_values_sig:
        coeffs.append(myDF3.loc[idx, 'Coefficients'])    
        if (len(p_values_sig) >= 2/3 * len(p_values)) & (two_thirds_same_sign(coeffs) == 'positive coefficients'):
            sig_moderators_pos.append(mod)
        elif (len(p_values_sig) >= 2/3 * len(p_values)) & (two_thirds_same_sign(coeffs) == 'negative coefficients'):
            sig_moderators_neg.append(mod)
        

In [145]:
print("Significant and positive moderators: ", sig_moderators_neg)
print("Significant and negative moderators", sig_moderators_pos)

Significant and positive moderators:  ['members_age', 'members_age', 'members_age', 'members_missed_votes_pct', 'members_missed_votes_pct', 'members_missed_votes_pct', 'members_seniority', 'members_seniority', 'members_seniority', 'members_seniority', 'members_seniority']
Significant and negative moderators ['primary_suject_no_months_mentioned', 'primary_suject_no_months_mentioned', 'days_ago', 'days_ago', 'members_seniority_sponsor', 'members_seniority_sponsor', 'members_seniority_sponsor']


In [146]:
for mod in list(set(sig_moderators_neg)):
    print(myDF3.loc[(myDF3['Features'].str.contains(mod))])

                                                            Features  \
16   members_missed_votes_pct                                          
128  mentions_non_partisan_agency_no_median members_missed_votes_pct   
213  uses_statistics_median members_missed_votes_pct                   
298  crs_sim_avg_median members_missed_votes_pct                       

     Coefficients  Standard Errors  t values  Probabilites  
16  -0.1207        0.000           -520.785   0.0           
128 -0.0074        0.001           -6.484     0.0           
213 -0.0056        0.001           -5.881     0.0           
298 -0.0550        0.004           -13.197    0.0           
                                                             Features  \
17   members_seniority                                                  
19   members_seniority_sponsor                                          
142  mentions_non_partisan_agency_no_median members_seniority           
143  mentions_non_partisan_agency_no_media

In [150]:
sig_same_sign = myDF3.loc[(myDF3['Features'].str.contains(mod))]
sig_same_sign.to_csv('/home/ubuntu/Notebooks/results.csv')

In [151]:
# F1 score on holdout data
from sklearn.metrics import f1_score
f1 = f1_score(y_holdout, predictions)
print(f1)

0.9466530873432896
