# This file is used to preprocess and train a logistic regression curve to predict the outcome of the 2021 elections

In [2]:
import pandas as pd

# Read the original data in
df = pd.read_csv('../data/data_election_2020.csv')

# Let's see the columns we are working with

print(list(df.columns))

# Let's also see the data balance

print(df['majority'].value_counts())

# So we have a heavy imbalance in the data we will have to address this later

['state', 'county', 'majority', 'trump16', 'clinton16', 'otherpres16', 'romney12', 'obama12', 'otherpres12', 'demsen16', 'repsen16', 'othersen16', 'demhouse16', 'rephouse16', 'otherhouse16', 'total_population', 'cvap', 'white_pct', 'black_pct', 'hispanic_pct', 'nonwhite_pct', 'foreignborn_pct', 'female_pct', 'age29andunder_pct', 'age65andolder_pct', 'median_hh_inc', 'clf_unemploy_pct', 'lesshs_pct', 'lesscollege_pct', 'lesshs_whites_pct', 'lesscollege_whites_pct', 'rural_pct']
Trump    2524
Biden     503
Name: majority, dtype: int64


Now lets conduct some preprocessing on the data

In [3]:
# We are going to preprocess this data to be suited for a logistic regression curve

# We will start by dropping the columns we don't need

df = df.drop('county', axis=1)

# Check for missing values

print(df.isna().sum())
print(len(df))

state                        0
majority                     0
trump16                      0
clinton16                    0
otherpres16                  0
romney12                     0
obama12                      0
otherpres12                  0
demsen16                  1120
repsen16                  1120
othersen16                1120
demhouse16                 221
rephouse16                 221
otherhouse16               221
total_population             3
cvap                         3
white_pct                    3
black_pct                    3
hispanic_pct                 3
nonwhite_pct                 3
foreignborn_pct              3
female_pct                   3
age29andunder_pct            3
age65andolder_pct            3
median_hh_inc                3
clf_unemploy_pct             3
lesshs_pct                   3
lesscollege_pct              3
lesshs_whites_pct            3
lesscollege_whites_pct       3
rural_pct                    1
dtype: int64
3027


I want to check if there are any cols that are mostly na

In [4]:
len(df[df.isna().any(axis=1)])
print(list(df.columns))
print(df.head())

# 1154 rows with na values

['state', 'majority', 'trump16', 'clinton16', 'otherpres16', 'romney12', 'obama12', 'otherpres12', 'demsen16', 'repsen16', 'othersen16', 'demhouse16', 'rephouse16', 'otherhouse16', 'total_population', 'cvap', 'white_pct', 'black_pct', 'hispanic_pct', 'nonwhite_pct', 'foreignborn_pct', 'female_pct', 'age29andunder_pct', 'age65andolder_pct', 'median_hh_inc', 'clf_unemploy_pct', 'lesshs_pct', 'lesscollege_pct', 'lesshs_whites_pct', 'lesscollege_whites_pct', 'rural_pct']
     state majority  trump16  clinton16  otherpres16  romney12  obama12  \
0  Alabama    Trump    18172       5936          865     17379     6363   
1  Alabama    Trump    72883      18458         3874     66016    18424   
2  Alabama    Trump     5454       4871          144      5550     5912   
3  Alabama    Trump     6738       1874          207      6132     2202   
4  Alabama    Trump    22859       2156          573     20757     2970   

   otherpres12  demsen16  repsen16  ...  female_pct  age29andunder_pct  \
0  

Before we address the na values -> We should apply one hot encoding to the categorical variables to numerically transform them

In [5]:
from sklearn.preprocessing import OneHotEncoder
categorical_columns = ['state', 'majority']
enc = OneHotEncoder(handle_unknown='ignore')
encoded_data = enc.fit_transform(df[categorical_columns])
encoded_df = pd.DataFrame(encoded_data.toarray(), columns=enc.get_feature_names_out(categorical_columns))
df = df.drop(columns=categorical_columns)
df = pd.concat([df, encoded_df], axis=1)
# We can also drop majority since if majority trump == 0 then we can assume they voted for biden
df = df.drop('majority_Trump', axis=1)

Judging by the col na count. My strategy for dealing with na values will be as follows:
- Drop the cols with 1/3 na
- Impute with strategy 'median' for the rest 

In [6]:
from sklearn.impute import SimpleImputer

columns_to_drop = [
    'demsen16',
    'repsen16',
    'othersen16',
]

df = df.drop(columns_to_drop, axis=1)

imputer = SimpleImputer(strategy='median')
imputer.fit(df)
imputed_df = imputer.transform(df)
imputed_df = pd.DataFrame(imputed_df, columns=df.columns)

Let's standardise the data

In [6]:
imputed_df.head()

Unnamed: 0,trump16,clinton16,otherpres16,romney12,obama12,otherpres12,demhouse16,rephouse16,otherhouse16,total_population,...,state_Tennessee,state_Texas,state_Utah,state_Vermont,state_Virginia,state_Washington,state_West Virginia,state_Wisconsin,state_Wyoming,majority_Biden
0,18172.0,5936.0,865.0,17379.0,6363.0,190.0,7544.0,14315.0,2258.0,55049.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,72883.0,18458.0,3874.0,66016.0,18424.0,898.0,0.0,76995.0,1991.0,199510.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5454.0,4871.0,144.0,5550.0,5912.0,47.0,5297.0,4286.0,463.0,26614.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6738.0,1874.0,207.0,6132.0,2202.0,86.0,1971.0,6670.0,15.0,22572.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,22859.0,2156.0,573.0,20757.0,2970.0,279.0,2390.0,22367.0,47.0,57704.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
y = imputed_df['majority_Biden']
X = imputed_df.drop('majority_Biden', axis=1)
scaled_df = scaler.fit_transform(X)
scaled_df = pd.DataFrame(scaled_df, columns=X.columns)
print(scaled_df)

       trump16  clinton16  otherpres16  romney12   obama12  otherpres12  \
0    -0.050101  -0.187409    -0.211717 -0.049035 -0.197572    -0.220460   
1     1.205277  -0.033304     0.185099  1.032118 -0.036067     0.016656   
2    -0.341923  -0.200515    -0.306800 -0.311982 -0.203612    -0.268352   
3    -0.312461  -0.237399    -0.298492 -0.299045 -0.253291    -0.255290   
4     0.057446  -0.233928    -0.250225  0.026055 -0.243007    -0.190653   
...        ...        ...          ...       ...       ...          ...   
3022 -0.188187  -0.220698    -0.095666 -0.181320 -0.218850    -0.052000   
3023 -0.377099  -0.170450    -0.142218 -0.327364 -0.199581    -0.152473   
3024 -0.325861  -0.245669    -0.178880 -0.288308 -0.260977    -0.184959   
3025 -0.400274  -0.253914    -0.276864 -0.368355 -0.272145    -0.238545   
3026 -0.397474  -0.256782    -0.300206 -0.372645 -0.277126    -0.245243   

      demhouse16  rephouse16  otherhouse16  total_population  ...  \
0      -0.155222   -0.136321  

In [8]:
# Lets split into train test split noting the imbalance 
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y) 
print(X_train)
print(y_train)


      trump16  clinton16  otherpres16  romney12   obama12  otherpres12  \
1553   3320.0      787.0        267.0    3029.0    1219.0        175.0   
1527  17468.0     6250.0       1010.0   13248.0    8829.0        505.0   
2105  13362.0     3250.0        609.0   11177.0    4662.0          0.0   
2903   9666.0     2918.0        632.0    8135.0    4484.0        297.0   
395    4549.0     2837.0        126.0    4182.0    3167.0         47.0   
...       ...        ...          ...       ...       ...          ...   
306    4125.0     1744.0        146.0    3570.0    1845.0         59.0   
3019   3437.0      719.0        373.0    3136.0    1223.0        172.0   
381     751.0      594.0         45.0     735.0     729.0         26.0   
925    2574.0      461.0        158.0    2553.0     593.0         54.0   
2173  83197.0   153251.0      32784.0   93974.0  135291.0       7758.0   

      demhouse16  rephouse16  otherhouse16  total_population  ...  \
1553      1098.0      3149.0         157.0

In [9]:
# Fitting logistic regression to the training set

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

classifier = LogisticRegression(random_state=0, max_iter=30000, verbose=1) # A lot of max_iterations because it was not converging
classifier.fit(X_train, y_train)

# Evaluating the model
y_pred = classifier.predict(X_test)
print("confusion_matrix")
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Accuracy
print("Accuracy") # Gonna optimse on this or f1 score
print(accuracy_score(y_test, y_pred))

# Precision
print("Precision")
print(precision_score(y_test, y_pred))

# Recall
print("Recall")
print(recall_score(y_test, y_pred))

# F1 Score
print("F1 Score")
print(f1_score(y_test, y_pred))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


confusion_matrix
[[625   6]
 [ 24 102]]
Accuracy
0.9603698811096433
Precision
0.9444444444444444
Recall
0.8095238095238095
F1 Score
0.8717948717948718


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.0s finished


In [11]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

# Define models
models = {
    "RandomForestClassifier": RandomForestClassifier(),
    "GradientBoostingClassifier": GradientBoostingClassifier(),
    "AdaBoostClassifier": AdaBoostClassifier(),
    "LogisticRegression": LogisticRegression(),
    "SVC": SVC(),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "SGDClassifier": SGDClassifier(),
    "GaussianNB": GaussianNB(),
    "MLPClassifier": MLPClassifier()
}

best_model = None
best_accuracy = 0
best_model_name = ""

# Iterate over models
for model_name, model in models.items():
    clf = GridSearchCV(model, {}, cv=5, scoring='accuracy')
    clf.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    print(f"{model_name} achieved accuracy: {accuracy}")

    # Check if this model has the best accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = clf.best_estimator_
        best_model_name = model_name

print(f"Best Model: {best_model_name}")
print(f"Best Accuracy: {best_accuracy}")


RandomForestClassifier achieved accuracy: 0.9577278731836195
GradientBoostingClassifier achieved accuracy: 0.9616908850726552
AdaBoostClassifier achieved accuracy: 0.9392338177014531


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegression achieved accuracy: 0.9577278731836195
SVC achieved accuracy: 0.8956406869220608
KNeighborsClassifier achieved accuracy: 0.9299867899603699
DecisionTreeClassifier achieved accuracy: 0.9286657859973579
SGDClassifier achieved accuracy: 0.9379128137384413
GaussianNB achieved accuracy: 0.8771466314398944
MLPClassifier achieved accuracy: 0.9550858652575958
Best Model: GradientBoostingClassifier
Best Accuracy: 0.9616908850726552


In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import RFECV

# Assuming X_train, X_test, y_train, y_test are already defined

# Define the parameter grid for Logistic Regression
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 200, 300]
}

# Create a Logistic Regression model
logreg = LogisticRegression()

# Create GridSearchCV object
grid_search = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Predict on the test set using the best model
y_pred = best_model.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)

print(f"Best Model Parameters: {grid_search.best_params_}")
print(f"Best Model Accuracy: {accuracy}")
#Best Model Parameters: {'C': 10, 'max_iter': 100, 'solver': 'liblinear'}
#Best Model Accuracy: 0.9616908850726552

Fitting 5 folds for each of 90 candidates, totalling 450 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Model Parameters: {'C': 10, 'max_iter': 100, 'solver': 'liblinear'}
Best Model Accuracy: 0.9616908850726552




In [9]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Assuming X_train, X_test, y_train, y_test are already defined

# Create a Logistic Regression model
logreg = LogisticRegression(C=10, max_iter=100, solver='liblinear')

# Create RFECV object with logistic regression as the estimator
rfecv = RFECV(estimator=logreg, step=1, cv=5, scoring='accuracy', verbose=1)

# Fit RFECV to the training data
rfecv.fit(X_train, y_train)

# Print the optimal number of features
print(f"Optimal number of features: {rfecv.n_features_}")

# Predict on the test set using the model with selected features
y_pred = rfecv.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)

print(f"Selected Model Accuracy: {accuracy}")


Fitting estimator with 75 features.
Fitting estimator with 74 features.
Fitting estimator with 73 features.
Fitting estimator with 72 features.
Fitting estimator with 71 features.
Fitting estimator with 70 features.




Fitting estimator with 69 features.
Fitting estimator with 68 features.
Fitting estimator with 67 features.
Fitting estimator with 66 features.
Fitting estimator with 65 features.
Fitting estimator with 64 features.
Fitting estimator with 63 features.
Fitting estimator with 62 features.
Fitting estimator with 61 features.
Fitting estimator with 60 features.
Fitting estimator with 59 features.
Fitting estimator with 58 features.
Fitting estimator with 57 features.
Fitting estimator with 56 features.
Fitting estimator with 55 features.
Fitting estimator with 54 features.
Fitting estimator with 53 features.
Fitting estimator with 52 features.
Fitting estimator with 51 features.
Fitting estimator with 50 features.
Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 fe



Fitting estimator with 74 features.
Fitting estimator with 73 features.




Fitting estimator with 72 features.
Fitting estimator with 71 features.
Fitting estimator with 70 features.




Fitting estimator with 69 features.
Fitting estimator with 68 features.




Fitting estimator with 67 features.
Fitting estimator with 66 features.




Fitting estimator with 65 features.
Fitting estimator with 64 features.
Fitting estimator with 63 features.




Fitting estimator with 62 features.
Fitting estimator with 61 features.
Fitting estimator with 60 features.




Fitting estimator with 59 features.
Fitting estimator with 58 features.
Fitting estimator with 57 features.
Fitting estimator with 56 features.
Fitting estimator with 55 features.
Fitting estimator with 54 features.




Fitting estimator with 53 features.
Fitting estimator with 52 features.
Fitting estimator with 51 features.




Fitting estimator with 50 features.
Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 fe



Fitting estimator with 73 features.
Fitting estimator with 72 features.




Fitting estimator with 71 features.
Fitting estimator with 70 features.




Fitting estimator with 69 features.
Fitting estimator with 68 features.




Fitting estimator with 67 features.
Fitting estimator with 66 features.




Fitting estimator with 65 features.
Fitting estimator with 64 features.
Fitting estimator with 63 features.
Fitting estimator with 62 features.
Fitting estimator with 61 features.




Fitting estimator with 60 features.
Fitting estimator with 59 features.
Fitting estimator with 58 features.
Fitting estimator with 57 features.
Fitting estimator with 56 features.
Fitting estimator with 55 features.
Fitting estimator with 54 features.
Fitting estimator with 53 features.
Fitting estimator with 52 features.
Fitting estimator with 51 features.
Fitting estimator with 50 features.
Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 fe



Fitting estimator with 74 features.
Fitting estimator with 73 features.




Fitting estimator with 72 features.
Fitting estimator with 71 features.




Fitting estimator with 70 features.
Fitting estimator with 69 features.
Fitting estimator with 68 features.




Fitting estimator with 67 features.
Fitting estimator with 66 features.
Fitting estimator with 65 features.
Fitting estimator with 64 features.
Fitting estimator with 63 features.
Fitting estimator with 62 features.




Fitting estimator with 61 features.
Fitting estimator with 60 features.
Fitting estimator with 59 features.
Fitting estimator with 58 features.
Fitting estimator with 57 features.
Fitting estimator with 56 features.
Fitting estimator with 55 features.
Fitting estimator with 54 features.
Fitting estimator with 53 features.
Fitting estimator with 52 features.
Fitting estimator with 51 features.
Fitting estimator with 50 features.
Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 fe



Fitting estimator with 72 features.
Fitting estimator with 71 features.




Fitting estimator with 70 features.
Fitting estimator with 69 features.




Fitting estimator with 68 features.




Fitting estimator with 67 features.
Fitting estimator with 66 features.
Fitting estimator with 65 features.




Optimal number of features: 64
Selected Model Accuracy: 0.9603698811096433




In [13]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Assuming X_train, X_test, y_train, y_test are already defined

# Step 1: Run RFECV
logreg = LogisticRegression(C=10, max_iter=100, solver='liblinear')
rfecv = RFECV(estimator=logreg, step=1, cv=5, scoring='accuracy', verbose=1)
rfecv.fit(X_train, y_train)
print(f"Optimal number of features: {rfecv.n_features_}")

# Step 2: Reduce X_train and X_test to the selected features
X_train_rfe = X_train.iloc[:, rfecv.support_]
X_test_rfe = X_test.iloc[:, rfecv.support_]

# Step 3: Run GridSearchCV on the reduced dataset
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 200, 300]
}
grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(X_train_rfe, y_train)

# Evaluate the best grid search model on the test data
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_rfe)
accuracy = accuracy_score(y_test, y_pred)

print(f"Best Model Parameters: {grid_search.best_params_}")
print(f"Best Model Accuracy: {accuracy}")


Fitting estimator with 75 features.
Fitting estimator with 74 features.
Fitting estimator with 73 features.
Fitting estimator with 72 features.
Fitting estimator with 71 features.
Fitting estimator with 70 features.




Fitting estimator with 69 features.
Fitting estimator with 68 features.
Fitting estimator with 67 features.
Fitting estimator with 66 features.
Fitting estimator with 65 features.
Fitting estimator with 64 features.
Fitting estimator with 63 features.
Fitting estimator with 62 features.
Fitting estimator with 61 features.
Fitting estimator with 60 features.
Fitting estimator with 59 features.
Fitting estimator with 58 features.
Fitting estimator with 57 features.
Fitting estimator with 56 features.
Fitting estimator with 55 features.
Fitting estimator with 54 features.
Fitting estimator with 53 features.
Fitting estimator with 52 features.
Fitting estimator with 51 features.
Fitting estimator with 50 features.
Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 fe



Fitting estimator with 74 features.
Fitting estimator with 73 features.
Fitting estimator with 72 features.




Fitting estimator with 71 features.
Fitting estimator with 70 features.
Fitting estimator with 69 features.




Fitting estimator with 68 features.
Fitting estimator with 67 features.
Fitting estimator with 66 features.




Fitting estimator with 65 features.
Fitting estimator with 64 features.
Fitting estimator with 63 features.




Fitting estimator with 62 features.
Fitting estimator with 61 features.
Fitting estimator with 60 features.
Fitting estimator with 59 features.
Fitting estimator with 58 features.
Fitting estimator with 57 features.
Fitting estimator with 56 features.




Fitting estimator with 55 features.
Fitting estimator with 54 features.
Fitting estimator with 53 features.
Fitting estimator with 52 features.




Fitting estimator with 51 features.
Fitting estimator with 50 features.
Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 fe



Fitting estimator with 74 features.
Fitting estimator with 73 features.
Fitting estimator with 72 features.




Fitting estimator with 71 features.
Fitting estimator with 70 features.
Fitting estimator with 69 features.




Fitting estimator with 68 features.
Fitting estimator with 67 features.
Fitting estimator with 66 features.




Fitting estimator with 65 features.
Fitting estimator with 64 features.
Fitting estimator with 63 features.
Fitting estimator with 62 features.




Fitting estimator with 61 features.
Fitting estimator with 60 features.
Fitting estimator with 59 features.
Fitting estimator with 58 features.
Fitting estimator with 57 features.
Fitting estimator with 56 features.
Fitting estimator with 55 features.
Fitting estimator with 54 features.
Fitting estimator with 53 features.
Fitting estimator with 52 features.
Fitting estimator with 51 features.
Fitting estimator with 50 features.
Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 fe



Fitting estimator with 74 features.
Fitting estimator with 73 features.
Fitting estimator with 72 features.




Fitting estimator with 71 features.
Fitting estimator with 70 features.
Fitting estimator with 69 features.




Fitting estimator with 68 features.
Fitting estimator with 67 features.
Fitting estimator with 66 features.
Fitting estimator with 65 features.
Fitting estimator with 64 features.
Fitting estimator with 63 features.




Fitting estimator with 62 features.
Fitting estimator with 61 features.
Fitting estimator with 60 features.
Fitting estimator with 59 features.
Fitting estimator with 58 features.
Fitting estimator with 57 features.
Fitting estimator with 56 features.
Fitting estimator with 55 features.
Fitting estimator with 54 features.
Fitting estimator with 53 features.
Fitting estimator with 52 features.
Fitting estimator with 51 features.
Fitting estimator with 50 features.
Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 fe



Fitting estimator with 72 features.
Fitting estimator with 71 features.




Fitting estimator with 70 features.
Fitting estimator with 69 features.




Fitting estimator with 68 features.
Fitting estimator with 67 features.




Fitting estimator with 66 features.
Fitting estimator with 65 features.




Optimal number of features: 64
Fitting 5 folds for each of 90 candidates, totalling 450 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Model Parameters: {'C': 0.1, 'max_iter': 100, 'solver': 'lbfgs'}
Best Model Accuracy: 0.9669749009247027


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, make_scorer

# Assuming X_train, X_test, y_train, y_test are already defined

# Define F1 scorer
f1_scorer = make_scorer(f1_score)

# Step 1: Run RFECV with F1 score
logreg = LogisticRegression(C=10, max_iter=100, solver='liblinear')
rfecv = RFECV(estimator=logreg, step=1, cv=5, scoring=f1_scorer, verbose=1)
rfecv.fit(X_train, y_train)
print(f"Optimal number of features: {rfecv.n_features_}")

# Print selected features
selected_features = X_train.columns[rfecv.support_]
print(f"Selected features: {selected_features}")

# Step 2: Reduce X_train and X_test to the selected features
X_train_rfe = X_train[selected_features]
X_test_rfe = X_test[selected_features]

# Step 3: Run GridSearchCV on the reduced dataset with F1 score
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 200, 300]
}
grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=5, scoring=f1_scorer, verbose=1)
grid_search.fit(X_train_rfe, y_train)

# Evaluate the best grid search model on the test data
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_rfe)
f1 = f1_score(y_test, y_pred)

print(f"Best Model Parameters: {grid_search.best_params_}")
print(f"Best Model F1 Score: {f1}")


Fitting estimator with 75 features.
Fitting estimator with 74 features.
Fitting estimator with 73 features.
Fitting estimator with 72 features.
Fitting estimator with 71 features.
Fitting estimator with 70 features.




Fitting estimator with 69 features.
Fitting estimator with 68 features.
Fitting estimator with 67 features.
Fitting estimator with 66 features.
Fitting estimator with 65 features.
Fitting estimator with 64 features.
Fitting estimator with 63 features.
Fitting estimator with 62 features.
Fitting estimator with 61 features.
Fitting estimator with 60 features.
Fitting estimator with 59 features.
Fitting estimator with 58 features.
Fitting estimator with 57 features.
Fitting estimator with 56 features.
Fitting estimator with 55 features.
Fitting estimator with 54 features.
Fitting estimator with 53 features.
Fitting estimator with 52 features.
Fitting estimator with 51 features.
Fitting estimator with 50 features.
Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 fe



Fitting estimator with 74 features.
Fitting estimator with 73 features.
Fitting estimator with 72 features.
Fitting estimator with 71 features.
Fitting estimator with 70 features.




Fitting estimator with 69 features.
Fitting estimator with 68 features.
Fitting estimator with 67 features.




Fitting estimator with 66 features.
Fitting estimator with 65 features.
Fitting estimator with 64 features.




Fitting estimator with 63 features.
Fitting estimator with 62 features.
Fitting estimator with 61 features.




Fitting estimator with 60 features.
Fitting estimator with 59 features.
Fitting estimator with 58 features.




Fitting estimator with 57 features.
Fitting estimator with 56 features.
Fitting estimator with 55 features.




Fitting estimator with 54 features.
Fitting estimator with 53 features.
Fitting estimator with 52 features.
Fitting estimator with 51 features.
Fitting estimator with 50 features.
Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 fe



Fitting estimator with 74 features.
Fitting estimator with 73 features.
Fitting estimator with 72 features.




Fitting estimator with 71 features.
Fitting estimator with 70 features.
Fitting estimator with 69 features.




Fitting estimator with 68 features.
Fitting estimator with 67 features.
Fitting estimator with 66 features.




Fitting estimator with 65 features.
Fitting estimator with 64 features.
Fitting estimator with 63 features.
Fitting estimator with 62 features.




Fitting estimator with 61 features.
Fitting estimator with 60 features.
Fitting estimator with 59 features.
Fitting estimator with 58 features.
Fitting estimator with 57 features.
Fitting estimator with 56 features.
Fitting estimator with 55 features.
Fitting estimator with 54 features.
Fitting estimator with 53 features.
Fitting estimator with 52 features.
Fitting estimator with 51 features.
Fitting estimator with 50 features.
Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 fe



Fitting estimator with 74 features.
Fitting estimator with 73 features.




Fitting estimator with 72 features.
Fitting estimator with 71 features.
Fitting estimator with 70 features.




Fitting estimator with 69 features.
Fitting estimator with 68 features.
Fitting estimator with 67 features.




Fitting estimator with 66 features.
Fitting estimator with 65 features.
Fitting estimator with 64 features.




Fitting estimator with 63 features.
Fitting estimator with 62 features.
Fitting estimator with 61 features.
Fitting estimator with 60 features.
Fitting estimator with 59 features.
Fitting estimator with 58 features.
Fitting estimator with 57 features.
Fitting estimator with 56 features.
Fitting estimator with 55 features.
Fitting estimator with 54 features.
Fitting estimator with 53 features.
Fitting estimator with 52 features.
Fitting estimator with 51 features.
Fitting estimator with 50 features.
Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 fe



Fitting estimator with 72 features.
Fitting estimator with 71 features.




Fitting estimator with 70 features.
Fitting estimator with 69 features.




Fitting estimator with 68 features.
Fitting estimator with 67 features.




Fitting estimator with 66 features.
Fitting estimator with 65 features.




Optimal number of features: 64
Selected features: Index(['trump16', 'clinton16', 'otherpres16', 'romney12', 'obama12',
       'otherpres12', 'demhouse16', 'rephouse16', 'otherhouse16',
       'total_population', 'cvap', 'white_pct', 'black_pct', 'hispanic_pct',
       'nonwhite_pct', 'foreignborn_pct', 'female_pct', 'age29andunder_pct',
       'age65andolder_pct', 'clf_unemploy_pct', 'lesshs_pct',
       'lesscollege_pct', 'lesshs_whites_pct', 'lesscollege_whites_pct',
       'rural_pct', 'state_Alabama', 'state_Arizona', 'state_Arkansas',
       'state_California', 'state_Colorado', 'state_Florida', 'state_Georgia',
       'state_Idaho', 'state_Indiana', 'state_Iowa', 'state_Kansas',
       'state_Kentucky', 'state_Louisiana', 'state_Maine', 'state_Maryland',
       'state_Michigan', 'state_Minnesota', 'state_Mississippi',
       'state_Missouri', 'state_Montana', 'state_Nebraska', 'state_Nevada',
       'state_New Hampshire', 'state_New Jersey', 'state_New Mexico',
       'state_New 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Model Parameters: {'C': 0.1, 'max_iter': 100, 'solver': 'lbfgs'}
Best Model F1 Score: 0.899598393574297


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
