In [33]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('clean_data.csv')

In [19]:
df.columns

Index(['Unnamed: 0', 'contest', 'full_name', 'ledb_candid', 'fips', 'geo_name',
       'year', 'votes', 'vote_share', 'incumbent', 'winner', 'n_winners',
       'prob_democrat', 'prob_republican', 'pid_est', 'prob_male',
       'prob_female', 'gender_est', 'prob_black', 'prob_white',
       'prob_hispanic', 'prob_asian', 'prob_other', 'race_est',
       'contributor.cfscore', 'percent_women', 'percent_white',
       'percent_black', 'percent_hispanic', 'percent_asian_american',
       'candidate_count', 'cpi_prevYear', 'unemployment_prevYear',
       'pid_est_knn', 'gender_est_knn', 'race_est_knn', 'female', 'c_female',
       'c_democrats', 'c_republicans', 'c_white', 'c_hispanic', 'c_black',
       'c_asian', 'c_other'],
      dtype='object')

***Features engineering***

In [15]:
df['c_female'] = df['gender_est'].map(lambda val: 1 if val == 'F' else 0)
df['c_democrats'] = df['pid_est'].map(lambda val: 1 if val == 'D' else 0)
df['c_republicans'] = df['pid_est'].map(lambda val: 1 if val == 'R' else 0)
df['c_white'] = df['race_est'].map(lambda val: 1 if val == 'caucasian' else 0)
df['c_hispanic'] = df['race_est'].map(lambda val: 1 if val == 'hispanic' else 0)
df['c_black'] = df['race_est'].map(lambda val: 1 if val == 'black' else 0)
df['c_asian'] = df['race_est'].map(lambda val: 1 if val == 'asian' else 0)
df['c_other'] = df['race_est'].map(lambda val: 1 if val == 'other' else 0)

***Features matrix***

In [62]:
features_m1 = ['incumbent']

features_m2 = ['incumbent', 'candidate_count']

features_m3 = ['incumbent', 'candidate_count', 'female',
               'c_democrats', 'c_republicans',
               'c_white', 'c_hispanic', 'c_black', 'c_asian', 'c_other']

features_m3 = ['incumbent', 'candidate_count', 'female',
               'c_democrats', 'c_republicans',
               'c_white', 'c_hispanic', 'c_black', 'c_asian', 'c_other',
               'percent_women', 'percent_white']

features_m4 = ['incumbent', 'candidate_count', 'female',
               'c_democrats', 'c_republicans',
               'c_white', 'c_hispanic', 'c_black', 'c_asian', 'c_other',
               'percent_women', 'percent_white',
               'cpi_prevYear', 'unemployment_prevYear']

features_m5 = ['incumbent', 'candidate_count', 'pid_est_knn', 'gender_est_knn', 'race_est_knn']

***Logistic regression and hyperparameter***

In [56]:
def logistic(df, features):
    split_index = int(0.8 * len(df))
    train_df = df.iloc[:split_index]
    test_df = df.iloc[split_index:]

    X_train = train_df[features]
    y_train = train_df['winner']

    X_test = test_df[features]
    y_test = test_df['winner']

    param_grid = {
        'C': [0.01, 0.1, 1, 10, 100], #inverse of the regularization strength, prevent overfitting
        'penalty': ['l1', 'l2'], #Lasso-like, Ridge-like, mix 'elasticnet'
        'solver': ['liblinear', 'saga'] #solved optimization. Other: 'lbfgs', 'newton-cg', 'sag'
    }

    # GridSearchCV with cross-validation
    grid = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5, scoring='accuracy')
    # cv is cross validation
    grid.fit(X_train, y_train)
    print("Best Parameters:", grid.best_params_)

    model = LogisticRegression()
    model.fit(X_train, y_train)

    y_pred = grid.best_estimator_.predict(X_test)

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

In [57]:
logistic(df, features_m1)

Best Parameters: {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy: 0.8788167938931297
              precision    recall  f1-score   support

        lose       0.92      0.90      0.91       717
         win       0.80      0.82      0.81       331

    accuracy                           0.88      1048
   macro avg       0.86      0.86      0.86      1048
weighted avg       0.88      0.88      0.88      1048



In [58]:
logistic(df, features_m2)

Best Parameters: {'C': 1, 'penalty': 'l1', 'solver': 'saga'}
Accuracy: 0.8778625954198473
              precision    recall  f1-score   support

        lose       0.91      0.91      0.91       717
         win       0.80      0.81      0.81       331

    accuracy                           0.88      1048
   macro avg       0.86      0.86      0.86      1048
weighted avg       0.88      0.88      0.88      1048



In [59]:
logistic(df, features_m3)



Best Parameters: {'C': 1, 'penalty': 'l2', 'solver': 'saga'}
Accuracy: 0.8807251908396947
              precision    recall  f1-score   support

        lose       0.91      0.91      0.91       717
         win       0.81      0.82      0.81       331

    accuracy                           0.88      1048
   macro avg       0.86      0.86      0.86      1048
weighted avg       0.88      0.88      0.88      1048





In [60]:
logistic(df, features_m4)

Best Parameters: {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy: 0.8702290076335878
              precision    recall  f1-score   support

        lose       0.90      0.91      0.91       717
         win       0.80      0.79      0.79       331

    accuracy                           0.87      1048
   macro avg       0.85      0.85      0.85      1048
weighted avg       0.87      0.87      0.87      1048



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# features with KNN

logistic(df, features_m5)

Best Parameters: {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.8759541984732825
              precision    recall  f1-score   support

        lose       0.91      0.91      0.91       717
         win       0.80      0.81      0.80       331

    accuracy                           0.88      1048
   macro avg       0.86      0.86      0.86      1048
weighted avg       0.88      0.88      0.88      1048



variance and covariance: we want high variance => help feature selections

EDA (variance covairance / histograms / )

hyperparameters

report

bias audit

 search