In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [3]:
# Load the data
file_path = Path('Resources/ml_data_2.csv')
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,id_ur,county_desc,u_r_code,avg_has_zero,avg_has_one,avg_has_two,avg_has_3more
0,20001R,Allen,R,3074.44,1268.89,818.132,2314.53
1,20001U,Allen,U,1496.76,1107.86,629.955,1688.42
2,20003R,Anderson,R,2491.65,431.837,712.834,1406.68
3,20003U,Anderson,U,1127.71,422.422,289.669,1064.2
4,20005R,Atchison,R,1128.0,449.469,285.401,1015.12


### Split the Data into Training and Testing

In [4]:
# Create our features
X = pd.get_dummies(df.drop(["u_r_code","county_desc"], axis=1))

# Create our target
y = df["u_r_code"]

In [5]:
X.describe()

Unnamed: 0,avg_has_zero,avg_has_one,avg_has_two,avg_has_3more,id_ur_20001R,id_ur_20001U,id_ur_20003R,id_ur_20003U,id_ur_20005R,id_ur_20005U,...,id_ur_20195R,id_ur_20197R,id_ur_20199R,id_ur_20201R,id_ur_20203R,id_ur_20205R,id_ur_20205U,id_ur_20207R,id_ur_20209R,id_ur_20209U
count,166.0,166.0,166.0,166.0,166.0,166.0,166.0,166.0,166.0,166.0,...,166.0,166.0,166.0,166.0,166.0,166.0,166.0,166.0,166.0,166.0
mean,4851.178453,3698.012777,2621.773773,6297.73222,0.006024,0.006024,0.006024,0.006024,0.006024,0.006024,...,0.006024,0.006024,0.006024,0.006024,0.006024,0.006024,0.006024,0.006024,0.006024,0.006024
std,12026.892941,13088.082214,10833.982167,24647.125785,0.077615,0.077615,0.077615,0.077615,0.077615,0.077615,...,0.077615,0.077615,0.077615,0.077615,0.077615,0.077615,0.077615,0.077615,0.077615,0.077615
min,78.2132,42.8299,33.3424,73.6145,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1088.6275,563.27125,361.587,707.3995,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1737.18,1020.5635,667.0865,1392.39,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4290.55,2283.6775,1547.4925,3831.3475,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,108244.0,119434.0,117348.0,242088.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
# Check the balance of our target values
#y['u_r_codes'].value_counts()
y.value_counts()

R    105
U     61
Name: u_r_code, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

### Balanced Random Forest Classifier

In [8]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
random_forest = BalancedRandomForestClassifier(n_estimators = 100)
random_forest = random_forest.fit(X_train, y_train)

In [9]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = random_forest.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.711111111111111

In [10]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[24,  3],
       [ 7,  8]], dtype=int64)

In [11]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          R       0.77      0.89      0.53      0.83      0.69      0.49        27
          U       0.73      0.53      0.89      0.62      0.69      0.46        15

avg / total       0.76      0.76      0.66      0.75      0.69      0.48        42



In [12]:
# List the features sorted in descending order by feature importance
sorted(zip(random_forest.feature_importances_, X.columns), reverse=True)

[(0.16075926789669084, 'avg_has_zero'),
 (0.15840517681761007, 'avg_has_3more'),
 (0.14162350619776598, 'avg_has_two'),
 (0.1373995811162129, 'avg_has_one'),
 (0.012392281524748248, 'id_ur_20085U'),
 (0.011825819353783767, 'id_ur_20193U'),
 (0.011600710575428594, 'id_ur_20191U'),
 (0.011288366300431698, 'id_ur_20189U'),
 (0.010048036984889043, 'id_ur_20005U'),
 (0.00922197538062168, 'id_ur_20149U'),
 (0.009029605011773674, 'id_ur_20001U'),
 (0.0082714872209311, 'id_ur_20067U'),
 (0.007415060304500803, 'id_ur_20173R'),
 (0.007249056104763791, 'id_ur_20029U'),
 (0.00706063001100705, 'id_ur_20099U'),
 (0.007037550238976629, 'id_ur_20181U'),
 (0.00624551242211284, 'id_ur_20121U'),
 (0.0062321120094838275, 'id_ur_20145U'),
 (0.00619340622218929, 'id_ur_20167U'),
 (0.006028183419966365, 'id_ur_20073U'),
 (0.005876805775442891, 'id_ur_20117U'),
 (0.00582929345416179, 'id_ur_20009U'),
 (0.005555545620288322, 'id_ur_20087U'),
 (0.005432061556373917, 'id_ur_20171U'),
 (0.00541796440499218, 'id_u

### Easy Ensemble AdaBoost Classifier

In [13]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
ensemble = EasyEnsembleClassifier(n_estimators = 100,random_state=1)
ensemble = ensemble.fit(X_train, y_train)
print(ensemble)

EasyEnsembleClassifier(n_estimators=100, random_state=1)


In [14]:
# Calculated the balanced accuracy score
y_pred = ensemble.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7259259259259259

In [15]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[23,  4],
       [ 6,  9]], dtype=int64)

In [16]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          R       0.79      0.85      0.60      0.82      0.71      0.52        27
          U       0.69      0.60      0.85      0.64      0.71      0.50        15

avg / total       0.76      0.76      0.69      0.76      0.71      0.51        42

