In [1]:
import pandas as pd
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.datasets import make_classification
import sklearn as skl
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [2]:
df1 = pd.read_csv("countypres_2000-2020.csv")
df1.head()

Unnamed: 0,year,state,state_po,county_name,county_fips,office,candidate,party,candidatevotes,totalvotes,version,mode
0,2000,ALABAMA,AL,AUTAUGA,1001.0,US PRESIDENT,AL GORE,DEMOCRAT,4942,17208,20220315,TOTAL
1,2000,ALABAMA,AL,AUTAUGA,1001.0,US PRESIDENT,GEORGE W. BUSH,REPUBLICAN,11993,17208,20220315,TOTAL
2,2000,ALABAMA,AL,AUTAUGA,1001.0,US PRESIDENT,RALPH NADER,GREEN,160,17208,20220315,TOTAL
3,2000,ALABAMA,AL,AUTAUGA,1001.0,US PRESIDENT,OTHER,OTHER,113,17208,20220315,TOTAL
4,2000,ALABAMA,AL,BALDWIN,1003.0,US PRESIDENT,AL GORE,DEMOCRAT,13997,56480,20220315,TOTAL


In [3]:
df1.groupby("county_name").sum("candidatevotes")

Unnamed: 0_level_0,year,county_fips,candidatevotes,totalvotes,version
county_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ABBEVILLE,92720,2070046.0,63179,533602,930134490
ACADIA,40200,440020.0,157827,525144,404406300
ACCOMACK,56360,1428028.0,89735,433788,566168820
ADA,40200,320020.0,1093493,3663353,404406300
ADAIR,191100,2643095.0,175951,678479,1920929925
...,...,...,...,...,...
YUBA,42220,128415.0,128042,459845,424626615
YUMA,104640,295462.0,303325,1791519,1051456380
ZAPATA,42220,1018605.0,18939,67181,424626615
ZAVALA,42220,1018647.0,21813,77588,424626615


In [4]:
df1.dtypes

year                int64
state              object
state_po           object
county_name        object
county_fips       float64
office             object
candidate          object
party              object
candidatevotes      int64
totalvotes          int64
version             int64
mode               object
dtype: object

## Draft of a random forest model

Which model did you choose and why? 
- We chose a Random forest model because of it´s high accuracy and interpretability. It can easily handle non linear data and outliers. The input will be in the form of tabular data (no images or natural language). A random forest model with a sufficient number of estimators and tree depth should be able to perform at a similar capacity to most deep learning models.
- We want a classifiter that predict which political partie will win given certain conditions (features).
- We need to consider if the data classes have imbalance to apply an oversampling or undersampling technique. 

How are you training your model?
- The data will have over 3000 rows and 40 feature columns.
- It will require preprocessing with (dummies or one Hot Encoder) for the categorical variables. Depending on the lenght of unique values, we might need to bucket certain data. 
- After having everything in numerical values, we will standarized the data. 
- We will keep the default percentage of training and testing data (75% for training and 25% for testing)

What is the model's accuracy?

How does this model work?

In [5]:
#Make classification is used to create sample values and contains many parameters that change the shape and values of the sample dataset.
# Generate dummy dataset
X, y = make_classification(n_samples=500, 
                           n_features=40, 
                           n_informative=20, 
                           n_redundant=10, 
                           n_repeated=10, random_state=0)

# Creating a DataFrame with the dummy data
df = pd.DataFrame(X)
df["Target"] = y
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,Target
0,10.344344,7.649411,3.378619,3.824800,2.667444,-7.847167,3.781608,-0.617573,2.667444,-0.583381,...,6.762299,2.625292,4.615096,-0.559935,-0.583381,0.717844,-2.125050,-0.192304,-0.559935,0
1,9.741622,2.025169,10.019709,3.534773,3.153033,0.397565,3.423823,2.020849,3.153033,1.326533,...,-0.436811,0.431407,3.778800,1.006584,1.326533,1.117772,-0.206758,0.600709,1.006584,0
2,-4.879177,-14.744674,-4.242443,-5.501274,0.145295,6.158059,-1.285861,2.126544,0.145295,1.995793,...,-3.252943,-0.260362,3.082679,-0.037846,1.995793,3.451668,1.627228,1.978573,-0.037846,1
3,-1.297167,3.182710,-5.280273,-0.906036,2.287886,7.039455,0.830466,-0.337647,2.287886,0.624581,...,12.523048,-5.603030,3.437454,3.281823,0.624581,0.355897,-4.126634,-1.931356,3.281823,1
4,-2.504354,-4.866865,-4.240301,-2.685165,2.656403,-3.452812,-0.301580,-1.600371,2.656403,1.329280,...,-2.870052,4.638311,0.489695,2.703581,1.329280,-0.021179,-1.317965,-0.046273,2.703581,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,4.015876,-5.874155,-9.629924,-2.901383,-0.077923,-3.809564,3.875644,0.713467,-0.077923,-1.740902,...,8.673136,-1.340524,0.785441,1.623186,-1.740902,1.164151,-1.202229,-1.185764,1.623186,1
496,0.359509,-2.990328,-1.378215,-0.822323,0.692914,-2.133670,4.674686,-0.532673,0.692914,-0.262559,...,6.070022,1.734302,-1.298261,0.683140,-0.262559,-1.013309,-2.775215,3.047970,0.683140,0
497,1.483440,12.496205,8.245836,1.293406,-2.877999,-5.091807,3.235619,-3.964450,-2.877999,-7.088909,...,13.694965,-1.679868,2.947754,-2.110653,-7.088909,-2.725354,-0.975097,3.365363,-2.110653,1
498,-2.725804,-6.429842,-7.318481,1.588513,2.147521,-15.084149,1.561703,-0.348801,2.147521,-5.271931,...,-4.107854,0.723882,-4.183060,-2.454524,-5.271931,0.963426,0.435168,-3.190520,-2.454524,0


In [6]:
#Use sklearn to split dataset for train and test
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=0)

In [7]:
#Create scaler instance
X_scaler=skl.preprocessing.StandardScaler()

#Fit the scaler
X_scaler.fit(X_train)

#Scale the data
X_train_scaled=X_scaler.transform(X_train)
X_test_scaled=X_scaler.transform(X_test)

In [8]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")
print(classification_report(y_test,y_pred))


 Random forest predictive accuracy: 0.888
              precision    recall  f1-score   support

           0       0.92      0.87      0.89        68
           1       0.85      0.91      0.88        57

    accuracy                           0.89       125
   macro avg       0.89      0.89      0.89       125
weighted avg       0.89      0.89      0.89       125



In [9]:
#Evaluate the model
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,59,9
Actual 1,5,52


#### Rank the importance of features

In [10]:
# Calculate feature importance in the Random Forest model.
#importances = sorted(rf_model.feature_importances_,reverse=True)
importances = sorted(zip(rf_model.feature_importances_, df.columns), reverse=True)
# We can sort the features by their importance.
importances


[(0.12309293815297041, 27),
 (0.06124787561208183, 0),
 (0.05354596181785168, 36),
 (0.04224746089547724, 30),
 (0.03345720722640135, 15),
 (0.033327570158318394, 14),
 (0.03136513012993313, 10),
 (0.029800634262678274, 28),
 (0.02882392643327256, 25),
 (0.02842188794367885, 38),
 (0.02597184775570589, 1),
 (0.023705755473814236, 2),
 (0.022750545594957622, 12),
 (0.022013898045612275, 35),
 (0.021797575922229626, 31),
 (0.021763840374593706, 11),
 (0.021111251255617948, 24),
 (0.02042227241321946, 19),
 (0.01969468149377124, 26),
 (0.01950067643597791, 5),
 (0.01890402988286799, 9),
 (0.018637657526125662, 21),
 (0.018330762291540716, 16),
 (0.018288207578814837, 4),
 (0.017761801495922068, 13),
 (0.017740549363284966, 17),
 (0.017034110043636107, 37),
 (0.016719030744147552, 32),
 (0.01650280129441761, 8),
 (0.016089261619396764, 3),
 (0.01569072698162837, 22),
 (0.01522501781018302, 18),
 (0.0149636146240438, 34),
 (0.014961231445833074, 33),
 (0.014925810148190066, 6),
 (0.01443455