In [1]:
import pandas as pd
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.datasets import make_classification
import sklearn as skl
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [2]:
df1 = pd.read_csv("countypres_2000-2020.csv")
df1.head()

Unnamed: 0,year,state,state_po,county_name,county_fips,office,candidate,party,candidatevotes,totalvotes,version,mode
0,2000,ALABAMA,AL,AUTAUGA,1001.0,US PRESIDENT,AL GORE,DEMOCRAT,4942,17208,20220315,TOTAL
1,2000,ALABAMA,AL,AUTAUGA,1001.0,US PRESIDENT,GEORGE W. BUSH,REPUBLICAN,11993,17208,20220315,TOTAL
2,2000,ALABAMA,AL,AUTAUGA,1001.0,US PRESIDENT,RALPH NADER,GREEN,160,17208,20220315,TOTAL
3,2000,ALABAMA,AL,AUTAUGA,1001.0,US PRESIDENT,OTHER,OTHER,113,17208,20220315,TOTAL
4,2000,ALABAMA,AL,BALDWIN,1003.0,US PRESIDENT,AL GORE,DEMOCRAT,13997,56480,20220315,TOTAL


In [3]:
df1.groupby("county_name").sum("candidatevotes")

Unnamed: 0_level_0,year,county_fips,candidatevotes,totalvotes,version
county_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ABBEVILLE,92720,2070046.0,63179,533602,930134490
ACADIA,40200,440020.0,157827,525144,404406300
ACCOMACK,56360,1428028.0,89735,433788,566168820
ADA,40200,320020.0,1093493,3663353,404406300
ADAIR,191100,2643095.0,175951,678479,1920929925
...,...,...,...,...,...
YUBA,42220,128415.0,128042,459845,424626615
YUMA,104640,295462.0,303325,1791519,1051456380
ZAPATA,42220,1018605.0,18939,67181,424626615
ZAVALA,42220,1018647.0,21813,77588,424626615


In [4]:
df1.dtypes

year                int64
state              object
state_po           object
county_name        object
county_fips       float64
office             object
candidate          object
party              object
candidatevotes      int64
totalvotes          int64
version             int64
mode               object
dtype: object

## Draft of a random forest model

Which model did you choose and why? 
- We chose a Random forest model because of it´s high accuracy and interpretability. It can easily handle non linear data and outliers. The input will be in the form of tabular data (no images or natural language). A random forest model with a sufficient number of estimators and tree depth should be able to perform at a similar capacity to most deep learning models.
- We want a classifiter that predict which political partie will win given certain conditions (features).
- We need to consider if the data classes have imbalance to apply an oversampling or undersampling technique. 

How are you training your model?
- The data will have over 3000 rows and 40 feature columns.
- It will require preprocessing with (dummies or one Hot Encoder) for the categorical variables. Depending on the lenght of unique values, we might need to bucket certain data. 
- After having everything in numerical values, we will standarized the data. 
- We will keep the default percentage of training and testing data (75% for training and 25% for testing)

What is the model's accuracy?

How does this model work?

In [5]:
#Make classification is used to create sample values and contains many parameters that change the shape and values of the sample dataset.
# Generate dummy dataset
X, y = make_classification(n_samples=5000, 
                           n_features=1500, 
                           n_informative=70, 
                           n_redundant=50, 
                           n_repeated=100, random_state=0)

# Creating a DataFrame with the dummy data
df = pd.DataFrame(X)
df["Target"] = y
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1491,1492,1493,1494,1495,1496,1497,1498,1499,Target
0,0.989085,0.500873,-1.685798,1.338509,1.258163,-0.271056,0.057597,13.131085,-0.244760,0.704654,...,1.931575,-0.313087,0.212345,13.131085,1.724555,-0.504809,-1.769902,-1.052819,-0.355090,1
1,-0.567461,1.428491,-0.271988,-0.063751,-0.123489,0.415629,-1.139705,8.106955,1.738568,0.199502,...,0.422669,-4.552081,1.304262,8.106955,1.171122,-0.674735,-0.435309,2.207393,0.608734,1
2,0.701813,-4.304956,0.866710,0.220649,-0.929059,1.769314,0.318098,-20.900089,-1.919268,-0.190972,...,0.497719,-3.487645,0.575302,-20.900089,0.061258,1.041518,-0.875797,-0.898026,0.771743,0
3,1.006486,11.377563,0.206928,0.168132,-1.552809,3.095840,-0.688531,-32.904548,-0.467552,-1.423661,...,-0.479751,11.002565,0.046308,-32.904548,-1.041259,0.545606,-2.551875,-0.563119,-0.774120,1
4,-1.529356,-6.243965,2.145334,1.526934,1.073742,0.718421,2.787043,27.540890,0.332299,-0.652535,...,-1.755574,-2.196618,-1.452632,27.540890,-0.917530,1.353355,1.305012,-0.157194,-1.089604,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.838070,0.284293,0.079567,-1.457521,-1.164346,0.002683,-0.185979,37.921995,-0.296202,-0.049619,...,-1.511429,-0.768005,0.446856,37.921995,1.124037,-0.923567,0.759832,-0.919581,0.339357,0
4996,0.054547,7.067449,1.550652,0.752789,1.611210,-0.728870,-0.812267,-2.016526,0.570826,0.332775,...,1.611145,-4.207494,1.660688,-2.016526,-1.161452,0.595307,-0.689318,0.125768,-0.288667,0
4997,-0.746221,1.315793,-1.623836,1.748128,0.648539,-1.063470,0.621603,4.944672,0.714235,1.619503,...,-0.605424,-7.096321,0.677670,4.944672,-0.121370,1.741232,1.516191,-0.597635,-0.712668,1
4998,-0.835172,7.999159,-1.586178,1.033821,-0.562425,1.189337,-0.129582,-38.135666,-1.040531,1.834073,...,0.292920,0.509776,0.539623,-38.135666,-0.805284,-2.045560,-1.182397,-0.068733,-1.763176,1


In [6]:
#Use sklearn to split dataset for train and test
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=0)

In [7]:
#Create scaler instance
X_scaler=skl.preprocessing.StandardScaler()

#Fit the scaler
X_scaler.fit(X_train)

#Scale the data
X_train_scaled=X_scaler.transform(X_train)
X_test_scaled=X_scaler.transform(X_test)

In [8]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")
print(classification_report(y_test,y_pred))


 Random forest predictive accuracy: 0.843
              precision    recall  f1-score   support

           0       0.82      0.87      0.84       611
           1       0.87      0.82      0.84       639

    accuracy                           0.84      1250
   macro avg       0.84      0.84      0.84      1250
weighted avg       0.84      0.84      0.84      1250



In [9]:
#Evaluate the model
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,530,81
Actual 1,115,524


#### Rank the importance of features

In [10]:
# Calculate feature importance in the Random Forest model.
#importances = sorted(rf_model.feature_importances_,reverse=True)
importances = sorted(zip(rf_model.feature_importances_, df.columns), reverse=True)
# We can sort the features by their importance.
importances


[(0.00800268102315839, 464),
 (0.006702219906169833, 1073),
 (0.006604962454505939, 1406),
 (0.005873259061408845, 175),
 (0.005788671974185675, 319),
 (0.005521840908006064, 924),
 (0.005092032249757838, 705),
 (0.004782851323723337, 177),
 (0.004751182458849599, 1038),
 (0.004666898142080974, 504),
 (0.004658042068517551, 284),
 (0.004460219668119884, 1201),
 (0.004209441341605811, 832),
 (0.003788259571590421, 338),
 (0.0037597157567824816, 98),
 (0.003509672349127972, 1089),
 (0.003247101062093739, 270),
 (0.0032235235708436344, 243),
 (0.0031603072602911174, 1020),
 (0.003048479186462469, 957),
 (0.0030116819512448294, 567),
 (0.002995203600466077, 949),
 (0.002798308653006426, 515),
 (0.002796263406027685, 763),
 (0.0027929969137187073, 394),
 (0.0027818224706562403, 510),
 (0.0027456252370380305, 521),
 (0.002732083074526779, 948),
 (0.0027264743291509877, 786),
 (0.0027129877719124936, 756),
 (0.002708406803202696, 1164),
 (0.0026980494037060184, 1272),
 (0.0026881866325770953,