In [89]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [73]:
# Load the elections CSV; the relative path is resolved against the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='mayoralElectionsv2.csv')

In [74]:
# Preview the first rows to sanity-check the load.
# Modelling notes:
#   - each row is one election
#   - the target label comes from the `winner` column
#   - `race_est` is a candidate for one-hot encoding
data.head(n=5)

Unnamed: 0,contest,fips,geo_name,year,month,votes,candidate_count,ledb_candid,incumbent,vote_share,...,W_race_est,W_gender_est,W_contributor.cfscore,W_bonica.cid,RU_ledb_candid,RU_vote_share,RU_votes,RU_pid_est,RU_race_est,RU_gender_est
0,3651000_1993_11_new york_NY_Mayor_mayor_1,3651000,new york,1993,11.0,876896,5,35567,1,0.48,...,caucasian,M,0.84,4667066000.0,35567.0,0.48,876896.0,D,black,M
1,3651000_1997_11_new york_NY_Mayor_mayor_1,3651000,new york,1997,11.0,783815,6,35608,1,0.58,...,caucasian,M,0.55,3300166000.0,35611.0,0.4,549335.0,D,caucasian,F
2,3651000_2017_11_new york_NY_Mayor_mayor_1,3651000,new york,2017,11.0,760112,7,34761,1,0.66,...,caucasian,M,,,35600.0,0.28,316948.0,R,caucasian,F
3,3651000_2005_11_new york_NY_Mayor_mayor_1,3651000,new york,2005,11.0,753089,8,35590,1,0.58,...,caucasian,M,,,35572.0,0.39,503219.0,D,caucasian,M
4,3651000_2009_11_new york_NY_Mayor_mayor_1,3651000,new york,2009,11.0,585470,12,35590,1,0.51,...,caucasian,M,,,35625.0,0.46,534876.0,,black,M


In [75]:
# Count the missing values in every column.
data.isnull().sum()

contest                     0
fips                        0
geo_name                    0
year                        0
month                     163
votes                       0
candidate_count             0
ledb_candid                 0
incumbent                   0
vote_share                  0
winner                      0
n_winners                   0
prob_democrat              85
prob_republican            85
pid_est                    85
prob_male                  92
prob_female                92
gender_est                 92
prob_black                  4
prob_white                  4
prob_hispanic               4
prob_asian                  4
prob_other                 25
race_est                    4
contributor.cfscore       791
bonica.cid                523
percent_women              93
percent_white              93
percent_black              93
percent_hispanic           93
percent_asian_american     93
W_ledb_candid               0
W_vote_share                0
W_votes   

In [None]:
# Build the binary target: 1 where `winner` equals 'lose', 0 otherwise.
# The vectorised comparison replaces a per-element lambda; missing values
# compare unequal and therefore still map to 0.
data['challenger_win'] = data['winner'].eq('lose').astype('int64')
print(data['challenger_win'].value_counts())
# The derived 0/1 column can never hold NaN, so checking it is vacuous;
# report missing values in the source column instead.
print(f"total null value: {data['winner'].isna().sum()}")

# Keep only contested elections (more than a single candidate) whose
# `gender_est` is known. The explicit `.copy()` makes `clean_data` an
# independent frame, which avoids pandas' SettingWithCopyWarning on the
# column assignment below.
clean_data = (
    data[data['candidate_count'] != 1]
    .dropna(subset=['gender_est'])
    .copy()
)

# Binary indicator: 1 when `gender_est` is 'F', 0 for any other value.
clean_data['female'] = clean_data['gender_est'].eq('F').astype('int64')


challenger_win
0    2018
1     380
Name: count, dtype: int64
total null value: 0


In [None]:
# Temporal hold-out evaluation: fit on the earliest 80% of rows (ordered by
# year) and evaluate on the most recent 20%.
# A stable sort keeps rows that share a year in their original relative
# order. The default quicksort gives no such guarantee, so the rows landing
# on either side of the split boundary could otherwise vary by environment.
df_sorted = clean_data.sort_values('year', kind='stable')
split_index = int(0.8 * len(df_sorted))
train_df = df_sorted.iloc[:split_index]
test_df = df_sorted.iloc[split_index:]

features = ['candidate_count', 'female']

X_train, y_train = train_df[features], train_df['challenger_win']
X_test, y_test = test_df[features], test_df['challenger_win']

# Fit a logistic regression with scikit-learn's default settings.
model = LogisticRegression()
model.fit(X_train, y_train)

# Score the held-out (latest) rows.
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
# zero_division=0 reports 0.0 for a class that is never predicted instead of
# emitting UndefinedMetricWarning.
# NOTE(review): if the positive class is never predicted, accuracy only
# mirrors the majority-class rate — compare against that baseline.
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.8070652173913043
              precision    recall  f1-score   support

           0       0.81      1.00      0.89       298
           1       0.00      0.00      0.00        70

    accuracy                           0.81       368
   macro avg       0.40      0.50      0.45       368
weighted avg       0.66      0.81      0.72       368



In [90]:
# Time-aware cross-validation of a scaled logistic regression.
# Rows are ordered by year so each TimeSeriesSplit fold trains on earlier
# rows and validates on later ones. The stable sort keeps same-year rows in
# their original relative order, so fold membership is reproducible (the
# default quicksort does not guarantee the order of ties).
df_sorted = clean_data.sort_values('year', kind='stable')

X = df_sorted[['candidate_count', 'female']]
y = df_sorted['challenger_win']

tscv = TimeSeriesSplit(n_splits=5)

# The scaler sits inside the pipeline so it is re-fit on each training fold
# rather than on the full dataset.
pipe = make_pipeline(StandardScaler(), LogisticRegression())

# One accuracy value per time-ordered fold.
scores = cross_val_score(pipe, X, y, cv=tscv, scoring='accuracy')

print("Cross-validated scores (time-aware):", scores)
print("\nMean accuracy:", scores.mean())

Cross-validated scores (time-aware): [0.81699346 0.78431373 0.80392157 0.74183007 0.80718954]

Mean accuracy: 0.7908496732026145
