# This file is used to preprocess and train a logistic regression curve to predict the outcome of the 2021 elections

In [52]:
import pandas as pd

# Read the original data in
df = pd.read_csv('../data/data_election_2020.csv')

# Let's see the columns we are working with

print(list(df.columns))

# Let's also see the data balance

print(df['majority'].value_counts())

# So we have a heavy imbalance in the data we will have to address this later

['state', 'county', 'majority', 'trump16', 'clinton16', 'otherpres16', 'romney12', 'obama12', 'otherpres12', 'demsen16', 'repsen16', 'othersen16', 'demhouse16', 'rephouse16', 'otherhouse16', 'total_population', 'cvap', 'white_pct', 'black_pct', 'hispanic_pct', 'nonwhite_pct', 'foreignborn_pct', 'female_pct', 'age29andunder_pct', 'age65andolder_pct', 'median_hh_inc', 'clf_unemploy_pct', 'lesshs_pct', 'lesscollege_pct', 'lesshs_whites_pct', 'lesscollege_whites_pct', 'rural_pct']
Trump    2524
Biden     503
Name: majority, dtype: int64


Now lets conduct some preprocessing on the data

In [53]:
# We are going to preprocess this data to be suited for a logistic regression curve

# We will start by dropping the columns we don't need

df = df.drop('county', axis=1)

# Check for missing values

print(df.isna().sum())
print(len(df))

state                        0
majority                     0
trump16                      0
clinton16                    0
otherpres16                  0
romney12                     0
obama12                      0
otherpres12                  0
demsen16                  1120
repsen16                  1120
othersen16                1120
demhouse16                 221
rephouse16                 221
otherhouse16               221
total_population             3
cvap                         3
white_pct                    3
black_pct                    3
hispanic_pct                 3
nonwhite_pct                 3
foreignborn_pct              3
female_pct                   3
age29andunder_pct            3
age65andolder_pct            3
median_hh_inc                3
clf_unemploy_pct             3
lesshs_pct                   3
lesscollege_pct              3
lesshs_whites_pct            3
lesscollege_whites_pct       3
rural_pct                    1
dtype: int64
3027


I want to check if there are any cols that are mostly na

In [54]:
len(df[df.isna().any(axis=1)])
print(list(df.columns))
print(df.head())

# 1154 rows with na values

['state', 'majority', 'trump16', 'clinton16', 'otherpres16', 'romney12', 'obama12', 'otherpres12', 'demsen16', 'repsen16', 'othersen16', 'demhouse16', 'rephouse16', 'otherhouse16', 'total_population', 'cvap', 'white_pct', 'black_pct', 'hispanic_pct', 'nonwhite_pct', 'foreignborn_pct', 'female_pct', 'age29andunder_pct', 'age65andolder_pct', 'median_hh_inc', 'clf_unemploy_pct', 'lesshs_pct', 'lesscollege_pct', 'lesshs_whites_pct', 'lesscollege_whites_pct', 'rural_pct']
     state majority  trump16  clinton16  otherpres16  romney12  obama12  \
0  Alabama    Trump    18172       5936          865     17379     6363   
1  Alabama    Trump    72883      18458         3874     66016    18424   
2  Alabama    Trump     5454       4871          144      5550     5912   
3  Alabama    Trump     6738       1874          207      6132     2202   
4  Alabama    Trump    22859       2156          573     20757     2970   

   otherpres12  demsen16  repsen16  ...  female_pct  age29andunder_pct  \
0  

Before we address the na values -> We should apply one hot encoding to the categorical variables to numerically transform them

In [55]:
from sklearn.preprocessing import OneHotEncoder
categorical_columns = ['state', 'majority']
enc = OneHotEncoder(handle_unknown='ignore')
encoded_data = enc.fit_transform(df[categorical_columns])
encoded_df = pd.DataFrame(encoded_data.toarray(), columns=enc.get_feature_names_out(categorical_columns))
df = df.drop(columns=categorical_columns)
df = pd.concat([df, encoded_df], axis=1)
# We can also drop majority since if majority trump == 0 then we can assume they voted for biden
df = df.drop('majority_Trump', axis=1)

Judging by the col na count. My strategy for dealing with na values will be as follows:
- Drop the cols with 1/3 na
- Impute with strategy 'median' for the rest 

In [56]:
from sklearn.impute import SimpleImputer

columns_to_drop = [
    'demsen16',
    'repsen16',
    'othersen16',
]

df = df.drop(columns_to_drop, axis=1)

imputer = SimpleImputer(strategy='median')
imputer.fit(df)
imputed_df = imputer.transform(df)
imputed_df = pd.DataFrame(imputed_df, columns=df.columns)

Let's standardise the data

In [57]:
imputed_df.head()

Unnamed: 0,trump16,clinton16,otherpres16,romney12,obama12,otherpres12,demhouse16,rephouse16,otherhouse16,total_population,...,state_Tennessee,state_Texas,state_Utah,state_Vermont,state_Virginia,state_Washington,state_West Virginia,state_Wisconsin,state_Wyoming,majority_Biden
0,18172.0,5936.0,865.0,17379.0,6363.0,190.0,7544.0,14315.0,2258.0,55049.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,72883.0,18458.0,3874.0,66016.0,18424.0,898.0,0.0,76995.0,1991.0,199510.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5454.0,4871.0,144.0,5550.0,5912.0,47.0,5297.0,4286.0,463.0,26614.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6738.0,1874.0,207.0,6132.0,2202.0,86.0,1971.0,6670.0,15.0,22572.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,22859.0,2156.0,573.0,20757.0,2970.0,279.0,2390.0,22367.0,47.0,57704.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [73]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
y = imputed_df['majority_Biden']
X = imputed_df.drop('majority_Biden', axis=1)
scaled_df = scaler.fit_transform(X)
scaled_df = pd.DataFrame(scaled_df, columns=X.columns)
print(scaled_df)

       trump16  clinton16  otherpres16  romney12   obama12  otherpres12  \
0    -0.050101  -0.187409    -0.211717 -0.049035 -0.197572    -0.220460   
1     1.205277  -0.033304     0.185099  1.032118 -0.036067     0.016656   
2    -0.341923  -0.200515    -0.306800 -0.311982 -0.203612    -0.268352   
3    -0.312461  -0.237399    -0.298492 -0.299045 -0.253291    -0.255290   
4     0.057446  -0.233928    -0.250225  0.026055 -0.243007    -0.190653   
...        ...        ...          ...       ...       ...          ...   
3022 -0.188187  -0.220698    -0.095666 -0.181320 -0.218850    -0.052000   
3023 -0.377099  -0.170450    -0.142218 -0.327364 -0.199581    -0.152473   
3024 -0.325861  -0.245669    -0.178880 -0.288308 -0.260977    -0.184959   
3025 -0.400274  -0.253914    -0.276864 -0.368355 -0.272145    -0.238545   
3026 -0.397474  -0.256782    -0.300206 -0.372645 -0.277126    -0.245243   

      demhouse16  rephouse16  otherhouse16  total_population  ...  \
0      -0.155222   -0.136321  

In [74]:
# Lets split into train test split noting the imbalance 
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y) 
print(X_train)
print(y_train)


       trump16  clinton16  otherpres16  romney12   obama12  otherpres12  \
1171   10364.0    24478.0       3335.0    9344.0   27072.0       1342.0   
181     2111.0     2773.0        392.0    2285.0    2733.0        163.0   
971    10710.0     4706.0        789.0    9931.0    5228.0        257.0   
2612    1780.0      354.0         62.0    1756.0     538.0         22.0   
1576    1723.0      318.0        130.0    1688.0     471.0         69.0   
...        ...        ...          ...       ...       ...          ...   
188   333243.0   373695.0      44453.0  318127.0  329063.0      14717.0   
1575     278.0       30.0         14.0     240.0      49.0          9.0   
2260    7386.0     7732.0        277.0    7071.0    9091.0        130.0   
1487    4879.0     1265.0        277.0    4006.0    1906.0        142.0   
619     4807.0     3071.0        647.0    3876.0    4507.0        151.0   

      demhouse16  rephouse16  otherhouse16  total_population  ...  \
1171     30352.0         0.0  

In [80]:
# Fitting logistic regression to the training set

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

classifier = LogisticRegression(random_state=0, max_iter=30000, verbose=1) # A lot of max_iterations because it was not converging
classifier.fit(X_train, y_train)

# Evaluating the model
y_pred = classifier.predict(X_test)
print("confusion_matrix")
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Accuracy
print("Accuracy") # Gonna optimse on this or f1 score
print(accuracy_score(y_test, y_pred))

# Precision
print("Precision")
print(precision_score(y_test, y_pred))

# Recall
print("Recall")
print(recall_score(y_test, y_pred))

# F1 Score
print("F1 Score")
print(f1_score(y_test, y_pred))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


confusion_matrix
[[622   9]
 [ 24 102]]
Accuracy
0.9564068692206077
Precision
0.918918918918919
Recall
0.8095238095238095
F1 Score
0.860759493670886


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   16.6s finished
