In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [4]:
columns = ['res_county','age_group','sex','race','ethnicity','hosp_yn']

target = ['hosp_yn']

In [5]:
df = pd.read_csv('virginia_covid.csv')
df = df.loc[:, columns].copy()
df.head()

Unnamed: 0,res_county,age_group,sex,race,ethnicity,hosp_yn
0,NORFOLK CITY,18 to 49 years,Female,Multiple/Other,Hispanic/Latino,No
1,FREDERICK,18 to 49 years,Female,White,Hispanic/Latino,No
2,FAIRFAX,18 to 49 years,Male,Black,Hispanic/Latino,No
3,CULPEPER,18 to 49 years,Male,White,Hispanic/Latino,No
4,ALBEMARLE,18 to 49 years,Male,White,Hispanic/Latino,No


In [6]:
# Split the data 

In [7]:
# Create our features
X = pd.get_dummies(df.drop(columns="hosp_yn"))

# Create our target
y = df.loc[:, target].copy()

In [8]:
X.describe()

Unnamed: 0,res_county_ACCOMACK,res_county_ALBEMARLE,res_county_ALEXANDRIA CITY,res_county_AMHERST,res_county_ARLINGTON,res_county_AUGUSTA,res_county_BEDFORD,res_county_BOTETOURT,res_county_BUCHANAN,res_county_CAMPBELL,...,sex_Female,sex_Male,race_American Indian/Alaska Native,race_Asian,race_Black,race_Multiple/Other,race_Native Hawaiian/Other Pacific Islander,race_White,ethnicity_Hispanic/Latino,ethnicity_Non-Hispanic/Latino
count,262838.0,262838.0,262838.0,262838.0,262838.0,262838.0,262838.0,262838.0,262838.0,262838.0,...,262838.0,262838.0,262838.0,262838.0,262838.0,262838.0,262838.0,262838.0,262838.0,262838.0
mean,0.006624,0.009272,0.023585,0.002656,0.029395,0.007906,0.007921,0.003455,0.002332,0.005323,...,0.538415,0.461585,5.3e-05,0.057899,0.237138,0.033922,5.7e-05,0.67093,0.077797,0.922203
std,0.081117,0.095843,0.151752,0.051464,0.16891,0.088564,0.088648,0.058674,0.048237,0.072762,...,0.498523,0.498523,0.007298,0.233552,0.425329,0.181029,0.007554,0.469876,0.267852,0.267852
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
y.value_counts()

hosp_yn
No         244784
Yes         18054
dtype: int64

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
    y, random_state=1)

In [11]:
## Model #1: Balanced Random Forest Classifier

In [12]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import train_test_split
rf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1) 
rf_model = rf_model.fit(X_train, y_train)

In [13]:
# Calculated the balanced accuracy score
predictions = rf_model.predict(X_test)
accuracy_score(y_test, predictions)

0.7758332065134683

In [14]:
# Display the confusion matrix
confusion_matrix(y_test, predictions)

array([[47734, 13467],
       [ 1263,  3246]])

In [15]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

         No       0.97      0.78      0.72      0.87      0.75      0.56     61201
        Yes       0.19      0.72      0.78      0.31      0.75      0.56      4509

avg / total       0.92      0.78      0.72      0.83      0.75      0.56     65710



In [16]:
# List the features sorted in descending order by feature importance
importances = rf_model.feature_importances_
importances
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.3599614727102093, 'age_group_65+ years'),
 (0.24434794819287758, 'age_group_18 to 49 years'),
 (0.10592835477922975, 'age_group_0 - 17 years'),
 (0.07299949692491514, 'age_group_50 to 64 years'),
 (0.020341360032781263, 'race_Black'),
 (0.014027709557940984, 'race_White'),
 (0.00913014012924775, 'sex_Male'),
 (0.008860427417916608, 'sex_Female'),
 (0.006380573870920595, 'ethnicity_Non-Hispanic/Latino'),
 (0.006364620268395401, 'ethnicity_Hispanic/Latino'),
 (0.006007246464040267, 'res_county_FAIRFAX'),
 (0.005198675128604116, 'res_county_PORTSMOUTH CITY'),
 (0.0046809234481248845, 'res_county_NORFOLK CITY'),
 (0.004638060043496732, 'res_county_PRINCE WILLIAM'),
 (0.00398372989471761, 'res_county_VIRGINIA BEACH CITY'),
 (0.003950953801250953, 'res_county_LOUDOUN'),
 (0.003915594084467642, 'res_county_MONTGOMERY'),
 (0.003649455910478335, 'race_Multiple/Other'),
 (0.0035710816764633617, 'res_county_CHESTERFIELD'),
 (0.0035591692507955545, 'res_county_ALEXANDRIA CITY'),
 (0.0034173266

In [17]:
# Model #2: Easy Ensemble Adaboost Classifer 

In [18]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
easy = EasyEnsembleClassifier(n_estimators = 100,random_state=1)
easy = easy.fit(X_train, y_train)

In [19]:
# Calculated the balanced accuracy score
y_pred = easy.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7518190146506658

In [20]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[47233, 13968],
       [ 1209,  3300]])

In [21]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

         No       0.98      0.77      0.73      0.86      0.75      0.57     61201
        Yes       0.19      0.73      0.77      0.30      0.75      0.56      4509

avg / total       0.92      0.77      0.73      0.82      0.75      0.57     65710



In [None]:
#Model #3: Logistic Regression

In [22]:
#Train the model and create predictions
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
   max_iter=200,
   random_state=1)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [23]:
# Calculated the accuracy score
y_pred = easy.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.7518190146506658

In [None]:
#Model #4: SVM 

In [24]:
#train the model and create predictions
from sklearn.svm import SVC
model = SVC(kernel='linear')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [25]:
#calculate accuracy score
accuracy_score(y_test, y_pred)

0.9313803074113529

In [26]:
#show confusion matrix
confusion_matrix(y_test, y_pred)

array([[61201,     0],
       [ 4509,     0]])

In [27]:
#print classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          No       0.93      1.00      0.96     61201
         Yes       0.00      0.00      0.00      4509

    accuracy                           0.93     65710
   macro avg       0.47      0.50      0.48     65710
weighted avg       0.87      0.93      0.90     65710



In [None]:
#Model #5: Decision Trees

In [29]:
from sklearn import tree
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
# Fitting the model.
model = model.fit(X_train, y_train)
# Making predictions using the testing data.
predictions = model.predict(X_test)

In [30]:
#confusion matrix
confusion_matrix(y_test, predictions)

array([[61185,    16],
       [ 4490,    19]])

In [31]:
#accuracy score
accuracy_score(y_test, predictions)

0.9314259625627759

In [32]:
#classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

          No       0.93      1.00      0.96     61201
         Yes       0.54      0.00      0.01      4509

    accuracy                           0.93     65710
   macro avg       0.74      0.50      0.49     65710
weighted avg       0.90      0.93      0.90     65710



In [None]:
# Model #6: Gradient Boosting Classifier

In [38]:
from sklearn.ensemble import GradientBoostingClassifier
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
   classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=learning_rate,
   max_features=5,
   max_depth=3,
   random_state=0)
   classifier.fit(X_train, y_train)
   print("Learning rate: ", learning_rate)
   print("Accuracy score (training): {0:.3f}".format(
       classifier.score(
           X_train,
           y_train)))
   print("Accuracy score (validation): {0:.3f}".format(
       classifier.score(
           X_test,
           y_test)))

Learning rate:  0.05
Accuracy score (training): 0.931
Accuracy score (validation): 0.931
Learning rate:  0.1
Accuracy score (training): 0.931
Accuracy score (validation): 0.931
Learning rate:  0.25
Accuracy score (training): 0.931
Accuracy score (validation): 0.931
Learning rate:  0.5
Accuracy score (training): 0.931
Accuracy score (validation): 0.931
Learning rate:  0.75
Accuracy score (training): 0.931
Accuracy score (validation): 0.931
Learning rate:  1
Accuracy score (training): 0.931
Accuracy score (validation): 0.931


In [40]:
#create model, train it, and create predictions
classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=0.5, max_features=5, max_depth=3, random_state=0)

classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

In [41]:
#confusion matrix
confusion_matrix(y_test, predictions)

array([[61112,    89],
       [ 4463,    46]])

In [42]:
#accuracy score
accuracy_score(y_test, predictions)

0.9307259169076244

In [43]:
#classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

          No       0.93      1.00      0.96     61201
         Yes       0.34      0.01      0.02      4509

    accuracy                           0.93     65710
   macro avg       0.64      0.50      0.49     65710
weighted avg       0.89      0.93      0.90     65710



In [None]:
#Model #7: Logistic Regression with naive random oversampling

In [45]:
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'hosp_yn': 1})

In [46]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [47]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[47438, 13763],
       [ 1224,  3285]])

In [48]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.7518304838991157

In [49]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

         No       0.97      0.78      0.73      0.86      0.75      0.57     61201
        Yes       0.19      0.73      0.78      0.30      0.75      0.56      4509

avg / total       0.92      0.77      0.73      0.83      0.75      0.57     65710



In [None]:
#Model #8: Logistic Regression with SMOTE Oversampling

In [50]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   X_train, y_train)

In [51]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [52]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7527250146145947

In [53]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[48199, 13002],
       [ 1272,  3237]])

In [54]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

         No       0.97      0.79      0.72      0.87      0.75      0.57     61201
        Yes       0.20      0.72      0.79      0.31      0.75      0.56      4509

avg / total       0.92      0.78      0.72      0.83      0.75      0.57     65710

