In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids, NearMiss, TomekLinks, EditedNearestNeighbours

In [None]:
df2019 = pd.read_csv('/content/drive/MyDrive/DM_Fall24_Project/Cleaned_BRFSS2019.csv')
df2021 = pd.read_csv('/content/drive/MyDrive/DM_Fall24_Project/Cleaned_BRFSS2021.csv')

In [None]:
combined_df = pd.concat([df2019, df2021])
combined_df.sample(5)

Unnamed: 0,Diabetes,HighBloodPressure,HighCholesterol,CholesterolCheck,BodyMassIndex,Smoker,HadStroke,HadHeartDiseaseorAtack,PhysicallyActive,ConsumesFruits,...,HaveHealthCoverage,HaveHealthFinancialIssues,GeneralHealthCondition,MentalHealthCondition,PhysicalHealthCondition,DifficultyInWalking,Gender,AgeBand,HighestLevelOfEducation,IncomeLevel
50531,0.0,1.0,1.0,7.0,24.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,2.0,30.0,0.0,1.0,0,9.0,6.0,3.0
3834,0.0,0.0,0.0,0.0,28.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,1,8.0,5.0,11.0
48224,2.0,1.0,1.0,0.0,29.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,3.0,2.0,0.0,0.0,1,8.0,5.0,8.0
155479,2.0,1.0,0.0,0.0,27.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,3.0,0.0,4.0,0.0,0,11.0,5.0,5.0
141644,2.0,0.0,0.0,0.0,41.0,1.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,1,11.0,5.0,6.0


In [None]:
combined_df.Diabetes.value_counts()

Unnamed: 0_level_0,count
Diabetes,Unnamed: 1_level_1
0.0,208107
2.0,34756
1.0,5726


In [None]:
X = combined_df.drop(columns=['Diabetes'])  # Features
y = combined_df['Diabetes']  # Target variable

In [None]:
# Resample dataset using RandomUnderSampler
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X, y)

In [None]:
X_rus.shape

(17178, 21)

In [None]:
y_rus.value_counts()

Unnamed: 0_level_0,count
Diabetes,Unnamed: 1_level_1
0.0,5726
1.0,5726
2.0,5726


In [None]:
# Splitting the data into training and test sets
X_train_rus, X_test_rus, y_train_rus, y_test_rus = train_test_split(X_rus, y_rus, test_size=0.2, random_state=42)

In [None]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_rus, y_train_rus)

In [None]:
y_pred_rus = rf_classifier.predict(X_test_rus)

In [None]:
print("Random Under Sampling Results:\n")
print("Class count:\n", y_rus.value_counts())
print("")
accuracy = accuracy_score(y_test_rus, y_pred_rus)
print("Accuracy:", accuracy)
print("")
print("Classification Report:\n", classification_report(y_test_rus, y_pred_rus))
print("")
print("Confusion Matrix:\n", confusion_matrix(y_test_rus, y_pred_rus))

Random Under Sampling Results:

Class count:
 Diabetes
0.0    5726
1.0    5726
2.0    5726
Name: count, dtype: int64

Accuracy: 0.4956344586728754

Classification Report:
               precision    recall  f1-score   support

         0.0       0.57      0.61      0.59      1127
         1.0       0.41      0.34      0.37      1189
         2.0       0.49      0.55      0.52      1120

    accuracy                           0.50      3436
   macro avg       0.49      0.50      0.49      3436
weighted avg       0.49      0.50      0.49      3436


Confusion Matrix:
 [[684 250 193]
 [332 402 455]
 [182 321 617]]


In [None]:
# Resample dataset using Cluster Centroids
cc = ClusterCentroids()
X_cc, y_cc = cc.fit_resample(X, y)

In [None]:
# Splitting the data into training and test sets
X_train_cc, X_test_cc, y_train_cc, y_test_cc = train_test_split(X_cc, y_cc, test_size=0.2, random_state=42)

In [None]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=11)
rf_classifier.fit(X_train_cc, y_train_cc)

In [None]:
y_pred_cc = rf_classifier.predict(X_test_cc)

In [None]:
print("Cluster Centroid Under Sampling Results:\n")
print("Class count:\n", y_cc.value_counts())
print("")
accuracy = accuracy_score(y_test_cc, y_pred_cc)
print("Accuracy:", accuracy)
print("")
print("Classification Report:\n", classification_report(y_test_cc, y_pred_cc))
print("")
print("Confusion Matrix:\n", confusion_matrix(y_test_cc, y_pred_cc))

Cluster Centroid Under Sampling Results:

Class count:
 Diabetes
0.0    5726
1.0    5726
2.0    5726
Name: count, dtype: int64

Accuracy: 0.9173457508731082

Classification Report:
               precision    recall  f1-score   support

         0.0       0.97      0.92      0.95      1184
         1.0       0.90      0.96      0.93      1178
         2.0       0.87      0.86      0.87      1074

    accuracy                           0.92      3436
   macro avg       0.92      0.92      0.92      3436
weighted avg       0.92      0.92      0.92      3436


Confusion Matrix:
 [[1092    1   91]
 [   0 1133   45]
 [  29  118  927]]


In [None]:
# Resample dataset using NearMiss
nm = NearMiss()
X_nm, y_nm = nm.fit_resample(X, y)

In [None]:
# Splitting the data into training and test sets
X_train_nm, X_test_nm, y_train_nm, y_test_nm = train_test_split(X_nm, y_nm, test_size=0.2, random_state=42)

In [None]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_nm, y_train_nm)

In [None]:
y_pred_nm = rf_classifier.predict(X_test_nm)

In [None]:
print("Near Miss Under Sampling Results:\n")
print("Class count:\n", y_nm.value_counts())
print("")
accuracy = accuracy_score(y_test_nm, y_pred_nm)
print("Accuracy:", accuracy)
print("")
print("Classification Report:\n", classification_report(y_test_nm, y_pred_nm))
print("")
print("Confusion Matrix:\n", confusion_matrix(y_test_nm, y_pred_nm))

Near Miss Under Sampling Results:

Class count:
 Diabetes
0.0    5726
1.0    5726
2.0    5726
Name: count, dtype: int64

Accuracy: 0.7494179278230501

Classification Report:
               precision    recall  f1-score   support

         0.0       0.75      0.81      0.78      1184
         1.0       0.86      0.70      0.77      1178
         2.0       0.66      0.73      0.70      1074

    accuracy                           0.75      3436
   macro avg       0.76      0.75      0.75      3436
weighted avg       0.76      0.75      0.75      3436


Confusion Matrix:
 [[962  52 170]
 [120 827 231]
 [208  80 786]]


In [None]:
# Tomek Links Under Sampling
tl = TomekLinks()

In [None]:
X_tl, y_tl = tl.fit_resample(X, y)

In [None]:
# Splitting the data into training and test sets
X_train_tl, X_test_tl, y_train_tl, y_test_tl = train_test_split(X_tl, y_tl, test_size=0.2, random_state=42)

In [None]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_tl, y_train_tl)

In [None]:
y_pred_tl = rf_classifier.predict(X_test_tl)

In [None]:
print("Tomek Links Under Sampling Results:\n")
print("Class count:\n", y_tl.value_counts())
print("")
accuracy = accuracy_score(y_test_tl, y_pred_tl)
print("Accuracy:", accuracy)
print("")
print("Classification Report:\n", classification_report(y_test_tl, y_pred_tl))
print("")
print("Confusion Matrix:\n", confusion_matrix(y_test_tl, y_pred_tl))

Tomek Links Under Sampling Results:

Class count:
 Diabetes
0.0    199173
2.0     26734
1.0      5726
Name: count, dtype: int64

Accuracy: 0.8616573488462451

Classification Report:
               precision    recall  f1-score   support

         0.0       0.88      0.98      0.93     39855
         1.0       0.02      0.00      0.00      1141
         2.0       0.50      0.19      0.28      5331

    accuracy                           0.86     46327
   macro avg       0.47      0.39      0.40     46327
weighted avg       0.81      0.86      0.83     46327


Confusion Matrix:
 [[38880    32   943]
 [ 1041     1    99]
 [ 4282    12  1037]]


In [None]:
# Edited Nearest Neighbors Under Sampling
enn = EditedNearestNeighbours()

In [None]:
X_enn, y_enn = enn.fit_resample(X, y)

In [None]:
# Splitting the data into training and test sets
X_train_enn, X_test_enn, y_train_enn, y_test_enn = train_test_split(X_enn, y_enn, test_size=0.2, random_state=42)

In [None]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_enn, y_train_enn)

In [None]:
y_pred_enn = rf_classifier.predict(X_test_enn)

In [None]:
print("Edited Nearest Neighbors Under Sampling Results:\n")
print("Class count:\n", y_enn.value_counts())
print("")
accuracy = accuracy_score(y_test_enn, y_pred_enn)
print("Accuracy:", accuracy)
print("")
print("Classification Report:\n", classification_report(y_test_enn, y_pred_enn))
print("")
print("Confusion Matrix:\n", confusion_matrix(y_test_enn, y_pred_enn))

Edited Nearest Neighbors Under Sampling Results:

Class count:
 Diabetes
0.0    147424
1.0      5726
2.0      1511
Name: count, dtype: int64

Accuracy: 0.9901076520221123

Classification Report:
               precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     29505
         1.0       1.00      0.77      0.87      1125
         2.0       0.86      0.92      0.89       303

    accuracy                           0.99     30933
   macro avg       0.95      0.90      0.92     30933
weighted avg       0.99      0.99      0.99     30933


Confusion Matrix:
 [[29478     3    24]
 [  232   870    23]
 [   23     1   279]]


Using Tomek Links and ENN, along with Cluster Centroids

In [None]:
# Tomek Links Under Sampling
tl = TomekLinks()
X_tl, y_tl = tl.fit_resample(X, y)
# Resample dataset using Cluster Centroids
cc = ClusterCentroids()
X_tlcc, y_tlcc = cc.fit_resample(X_tl, y_tl)

In [None]:
# Splitting the data into training and test sets
X_train_tlcc, X_test_tlcc, y_train_tlcc, y_test_tlcc = train_test_split(X_tlcc, y_tlcc, test_size=0.2, random_state=42)

In [None]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_tlcc, y_train_tlcc)

In [None]:
y_pred_tlcc = rf_classifier.predict(X_test_tlcc)

In [None]:
print("Tomek Links + Cluster Centroids Under Sampling Results:\n")
print("Class count:\n", y_tlcc.value_counts())
print("")
accuracy = accuracy_score(y_test_tlcc, y_pred_tlcc)
print("Accuracy:", accuracy)
print("")
print("Classification Report:\n", classification_report(y_test_tlcc, y_pred_tlcc))
print("")
print("Confusion Matrix:\n", confusion_matrix(y_test_tlcc, y_pred_tlcc))

Tomek Links + Cluster Centroids Under Sampling Results:

Class count:
 Diabetes
0.0    5726
1.0    5726
2.0    5726
Name: count, dtype: int64

Accuracy: 0.90046565774156

Classification Report:
               precision    recall  f1-score   support

         0.0       0.98      0.96      0.97      1127
         1.0       0.86      0.92      0.89      1189
         2.0       0.87      0.82      0.84      1120

    accuracy                           0.90      3436
   macro avg       0.90      0.90      0.90      3436
weighted avg       0.90      0.90      0.90      3436


Confusion Matrix:
 [[1081    2   44]
 [   0 1094   95]
 [  24  177  919]]


Using ENN and Cluster Centroids Under Sampling

In [None]:
# Edited Nearest Neighbors Under Sampling
enn = EditedNearestNeighbours()
X_enn, y_enn = enn.fit_resample(X, y)
# Resample dataset using Cluster Centroids
cc = ClusterCentroids()
X_enncc, y_enncc = cc.fit_resample(X_enn, y_enn)

In [None]:
# Splitting the data into training and test sets
X_train_enncc, X_test_enncc, y_train_enncc, y_test_enncc = train_test_split(X_enncc, y_enncc, test_size=0.2, random_state=42)

In [None]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_enncc, y_train_enncc)

In [None]:
y_pred_enncc = rf_classifier.predict(X_test_enncc)

In [None]:
print("Edited Nearest Neighbors + Cluster Centroids Under Sampling Results:\n")
print("Class count:\n", y_enncc.value_counts())
print("")
accuracy = accuracy_score(y_test_enncc, y_pred_enncc)
print("Accuracy:", accuracy)
print("")
print("Classification Report:\n", classification_report(y_test_enncc, y_pred_enncc))
print("")
print("Confusion Matrix:\n", confusion_matrix(y_test_enncc, y_pred_enncc))

Edited Nearest Neighbors + Cluster Centroids Under Sampling Results:

Class count:
 Diabetes
0.0    1511
1.0    1511
2.0    1511
Name: count, dtype: int64

Accuracy: 0.9470782800441014

Classification Report:
               precision    recall  f1-score   support

         0.0       0.99      0.98      0.99       325
         1.0       0.93      0.91      0.92       296
         2.0       0.92      0.94      0.93       286

    accuracy                           0.95       907
   macro avg       0.95      0.95      0.95       907
weighted avg       0.95      0.95      0.95       907


Confusion Matrix:
 [[319   5   1]
 [  3 270  23]
 [  0  16 270]]
