In [1]:
# Suppress any warning message
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Initital imported libraries 
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [3]:
# Load the data
file_path = Path('../Resources/ml_data_4.csv')
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,id_ur,county_desc,u_r_code,avg_zero,avg_one,avg_two,avg_three,perc_zero,perc_one,perc_two,perc_three,avg_speed,max_speed
0,20001R,Allen,R,3074.44,1268.89,818.132,2314.53,0.024317,0.058917,0.091379,0.0323,9.8,25
1,20001U,Allen,U,1496.76,1107.86,629.955,1688.42,0.032891,0.044437,0.078148,0.029157,9.8,25
2,20003R,Anderson,R,2491.65,431.837,712.834,1406.68,0.02024,0.11678,0.070746,0.03585,9.8,25
3,20003U,Anderson,U,1127.71,422.422,289.669,1064.2,0.025751,0.068746,0.100252,0.027288,9.8,25
4,20005R,Atchison,R,1128.0,449.469,285.401,1015.12,0.025514,0.064031,0.10084,0.028351,19.822222,100


### Split the Data into Training and Testing

In [4]:
# Create our features
X = pd.get_dummies(df.drop(["u_r_code","county_desc"], axis=1))

# Create our target
y = df["u_r_code"]

In [5]:
# Show statistical summary
X.describe()

Unnamed: 0,avg_zero,avg_one,avg_two,avg_three,perc_zero,perc_one,perc_two,perc_three,avg_speed,max_speed,...,id_ur_20195R,id_ur_20197R,id_ur_20199R,id_ur_20201R,id_ur_20203R,id_ur_20205R,id_ur_20205U,id_ur_20207R,id_ur_20209R,id_ur_20209U
count,166.0,166.0,166.0,166.0,166.0,166.0,166.0,166.0,166.0,166.0,...,166.0,166.0,166.0,166.0,166.0,166.0,166.0,166.0,166.0,166.0
mean,4851.178453,3698.012777,2621.773773,6297.73222,0.028673,0.05584,0.079184,0.035794,14.264592,67.921687,...,0.006024,0.006024,0.006024,0.006024,0.006024,0.006024,0.006024,0.006024,0.006024,0.006024
std,12026.892941,13088.082214,10833.982167,24647.125785,0.008596,0.023135,0.014945,0.007718,20.22175,184.695622,...,0.077615,0.077615,0.077615,0.077615,0.077615,0.077615,0.077615,0.077615,0.077615,0.077615
min,78.2132,42.8299,33.3424,73.6145,0.017451,0.028387,0.038595,0.023727,7.233333,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1088.6275,563.27125,361.587,707.3995,0.023178,0.040101,0.072115,0.029617,9.8,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1737.18,1020.5635,667.0865,1392.39,0.026057,0.047978,0.078022,0.034676,9.8,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4290.55,2283.6775,1547.4925,3831.3475,0.031346,0.067855,0.083888,0.040907,9.8,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,108244.0,119434.0,117348.0,242088.0,0.066679,0.144126,0.139503,0.058757,129.854545,1000.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
# Check the balance of our target values [u_r_codes]
y.value_counts()

R    105
U     61
Name: u_r_code, dtype: int64

In [7]:
# Import library to train, test, and split dataframe
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

### Balanced Random Forest Classifier

In [8]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
random_forest = BalancedRandomForestClassifier(n_estimators = 100)
random_forest = random_forest.fit(X_train, y_train)

In [9]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = random_forest.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.8444444444444444

In [10]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[24,  3],
       [ 3, 12]], dtype=int64)

In [11]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          R       0.89      0.89      0.80      0.89      0.84      0.72        27
          U       0.80      0.80      0.89      0.80      0.84      0.70        15

avg / total       0.86      0.86      0.83      0.86      0.84      0.71        42



In [12]:
# List the features sorted in descending order by feature importance
sorted(zip(random_forest.feature_importances_, X.columns), reverse=True)

[(0.15460822073045744, 'perc_three'),
 (0.12298748511755238, 'perc_zero'),
 (0.0837598656840172, 'perc_one'),
 (0.07886774815195238, 'perc_two'),
 (0.07862296546882606, 'avg_three'),
 (0.07807137684181495, 'avg_zero'),
 (0.07400186291558324, 'avg_two'),
 (0.07023921191342886, 'avg_one'),
 (0.040356594483081196, 'avg_speed'),
 (0.0260365352868419, 'max_speed'),
 (0.010279858517893615, 'id_ur_20191U'),
 (0.008559636683546527, 'id_ur_20099U'),
 (0.006455705204475016, 'id_ur_20029U'),
 (0.0061961879616872365, 'id_ur_20085U'),
 (0.0058371099285479625, 'id_ur_20175U'),
 (0.005683355865462679, 'id_ur_20145U'),
 (0.005150876959540274, 'id_ur_20117U'),
 (0.0049012311825778875, 'id_ur_20043U'),
 (0.004659950578646682, 'id_ur_20087U'),
 (0.004626110431382754, 'id_ur_20209R'),
 (0.004543363892485902, 'id_ur_20001U'),
 (0.004260278255545843, 'id_ur_20061U'),
 (0.003718363397006602, 'id_ur_20177R'),
 (0.0037085365863836887, 'id_ur_20173R'),
 (0.003379459861656484, 'id_ur_20091R'),
 (0.00335194997320

### Easy Ensemble AdaBoost Classifier

In [13]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
ensemble = EasyEnsembleClassifier(n_estimators = 100,random_state=1)
ensemble = ensemble.fit(X_train, y_train)
print(ensemble)

EasyEnsembleClassifier(n_estimators=100, random_state=1)


In [14]:
# Calculated the balanced accuracy score
y_pred = ensemble.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.825925925925926

In [15]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[23,  4],
       [ 3, 12]], dtype=int64)

In [16]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          R       0.88      0.85      0.80      0.87      0.83      0.69        27
          U       0.75      0.80      0.85      0.77      0.83      0.68        15

avg / total       0.84      0.83      0.82      0.83      0.83      0.68        42

