In [1]:
%autosave 0

In [2]:
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
import statsmodels.formula.api as sm
from sklearn import linear_model
from sklearn import metrics
import pandas as pd
import numpy as np
import os

In [3]:
data = pd.read_csv(os.getcwd() + '\data\merged_train.csv')
# data.head()

**Util method to print score of a model**

In [4]:
def print_scores(scores):
    score_str = ''
    score_str += 'Test R-Squared: {}'.format(scores['test_r2']) + '\n'
    score_str += 'Root Mean Squared Error: {}'.format( [(i ** 2) ** 0.25 for i in scores['test_neg_mean_squared_error']] )
    return score_str

**Training and evaluating model**

In [5]:
def build_model(x, y, clf = linear_model.LinearRegression(), folds = 3):
    x, y = data.iloc[:, columns], data['Democratic']
    pipeline = Pipeline([
        ('scaler',StandardScaler()),
        ('clf', clf)
    ])
    scores = cross_validate(pipeline, x , y, cv=folds, scoring=('r2', 'explained_variance', 'neg_mean_squared_error'),
                            return_train_score=True)
    return print_scores(scores)

**Choosing columns**

In [6]:
columns = np.array([3, 4, 5, 6, 7, 10, 11, 12, 14, 15], dtype=np.intp)
# columns = np.array([i for i in range(3, 16)], dtype=np.intp)

**Choosing model**

In [7]:
clf = linear_model.LassoLarsIC(criterion='bic')
# clf = linear_model.Lasso(alpha = 25)
# clf = linear_model.Ridge(alpha = 25)
# clf = linear_model.LinearRegression()
# clf = linear_model.LassoLarsCV(max_iter=3)

**View results**

In [8]:
print(build_model(data.iloc[:, columns], data['Democratic'], clf = clf))

Test R-Squared: [0.81532004 0.95497886 0.88086461]
Root Mean Squared Error: [37944.73516851128, 13241.790203858016, 21874.27475307561]


In [9]:
# Test R-Squared: [0.79550104 0.77117976 0.8171098 ]
# Root Mean Squared Error: [67374.26161023966, 57667.56299252465, 40041.77976825247]

In [10]:
print(build_model(data.iloc[:, columns], data['Republican'], clf = clf))

Test R-Squared: [0.81532004 0.95497886 0.88086461]
Root Mean Squared Error: [37944.73516851128, 13241.790203858016, 21874.27475307561]


In [11]:
# Test R-Squared: [0.88488865 0.9415004  0.95100121]
# Root Mean Squared Error: [8423.569590883622, 5243.099944959566, 4642.007438335223]