In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids, NearMiss, TomekLinks, EditedNearestNeighbours

In [None]:
# Folder holding the cleaned BRFSS extracts (the path looks like a Colab
# Google Drive mount — confirm). Kept in one constant so the notebook can be
# pointed at another location by editing a single line.
DATA_DIR = '/content/drive/MyDrive/DM_Fall24_Project'

# One cleaned CSV per survey year; both are read with pandas defaults.
df2019 = pd.read_csv(f'{DATA_DIR}/Cleaned_BRFSS2019.csv')
df2021 = pd.read_csv(f'{DATA_DIR}/Cleaned_BRFSS2021.csv')

In [None]:
# Stack the 2019 and 2021 frames into a single dataset.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# frame keeps the row labels it got from read_csv, so the same label would
# appear once per year and label-based lookups would become ambiguous.
combined_df = pd.concat([df2019, df2021], ignore_index=True)

# Preview five random rows; the seed makes the preview repeatable across runs.
combined_df.sample(5, random_state=11)

Unnamed: 0,Diabetes,HighBloodPressure,HighCholesterol,CholesterolCheck,BodyMassIndex,Smoker,HadStroke,HadHeartDiseaseorAtack,PhysicallyActive,ConsumesFruits,...,HaveHealthCoverage,HaveHealthFinancialIssues,GeneralHealthCondition,MentalHealthCondition,PhysicalHealthCondition,DifficultyInWalking,Gender,AgeBand,HighestLevelOfEducation,IncomeLevel
36458,0.0,0.0,0.0,0.0,39.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,0,2.0,4.0,4.0
73246,0.0,1.0,1.0,0.0,31.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1,4.0,5.0,8.0
9573,0.0,1.0,1.0,0.0,29.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,1,13.0,5.0,9.0
12142,0.0,0.0,1.0,0.0,39.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,4.0,0.0,30.0,0.0,1,8.0,4.0,5.0
118463,0.0,0.0,0.0,0.0,29.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,30.0,1.0,1,13.0,5.0,3.0


In [None]:
# Class distribution of the target column before any rows are removed.
combined_df['Diabetes'].value_counts()

Unnamed: 0_level_0,count
Diabetes,Unnamed: 1_level_1
0.0,208107
2.0,34756
1.0,5726


In [None]:
# Keep only rows whose Diabetes label is not 1, leaving a two-class target
# (labels 0 and 2). NOTE(review): label 1 is presumably "pre-diabetes" —
# confirm against the dataset codebook.
combined_df = combined_df.loc[combined_df['Diabetes'].ne(1)]

In [None]:
# Class distribution of the target after the label-1 rows were dropped.
combined_df['Diabetes'].value_counts()

Unnamed: 0_level_0,count
Diabetes,Unnamed: 1_level_1
0.0,208107
2.0,34756


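Resampling first and then splitting (the held-out test split is therefore drawn from the resampled data, not from the original class distribution)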

In [None]:
# Feature matrix: every column except the target.
X = combined_df.drop('Diabetes', axis=1)
# Target vector: the Diabetes label column.
y = combined_df.Diabetes

In [None]:
# Random under-sampling of the full dataset (seeded with 11 for repeatability).
# 'auto' is the sampler's default strategy, spelled out here for the reader.
rus = RandomUnderSampler(sampling_strategy='auto', random_state=11)
X_rus, y_rus = rus.fit_resample(X, y)

In [None]:
# (rows, columns) of the randomly under-sampled feature matrix.
X_rus.shape

(69512, 21)

In [None]:
# Per-class row counts after random under-sampling.
y_rus.value_counts()

Unnamed: 0_level_0,count
Diabetes,Unnamed: 1_level_1
0.0,34756
2.0,34756


In [None]:
# Hold out 20% of the under-sampled rows as a test split (seed 11).
(
    X_train_rus,
    X_test_rus,
    y_train_rus,
    y_test_rus,
) = train_test_split(X_rus, y_rus, random_state=11, test_size=0.2)

In [None]:
# 100-tree random forest (seed 11) trained on the under-sampled training split.
# fit() returns the estimator itself, so construction and fitting are chained.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=11,
).fit(X_train_rus, y_train_rus)
rf_classifier

In [None]:
# Predicted labels for the held-out random-under-sampling test split.
y_pred_rus = rf_classifier.predict(X=X_test_rus)

In [None]:
# Text report for the random under-sampling run: resampled class counts,
# then accuracy, per-class metrics and the confusion matrix on the test split.
# NOTE(review): the test split was taken from the resampled data, so these
# scores describe the resampled distribution rather than the original one.
print("Random Under Sampling Results:\n")
print("Class count:\n", y_rus.value_counts(), end="\n\n")
accuracy = accuracy_score(y_true=y_test_rus, y_pred=y_pred_rus)
print("Accuracy:", accuracy, end="\n\n")
print(
    "Classification Report:\n",
    classification_report(y_true=y_test_rus, y_pred=y_pred_rus),
    end="\n\n",
)
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test_rus, y_pred=y_pred_rus))

Random Under Sampling Results:

Class count:
 Diabetes
0.0    34756
2.0    34756
Name: count, dtype: int64

Accuracy: 0.7387614183989067

Classification Report:
               precision    recall  f1-score   support

         0.0       0.76      0.71      0.73      7053
         2.0       0.72      0.77      0.74      6850

    accuracy                           0.74     13903
   macro avg       0.74      0.74      0.74     13903
weighted avg       0.74      0.74      0.74     13903


Confusion Matrix:
 [[4974 2079]
 [1553 5297]]


In [None]:
# Under-sample with Cluster Centroids.
# The sampler is seeded with the same value (11) used by the other seeded
# steps in this notebook; left unseeded, its clustering step can pick a
# different starting point on each run, so the resampled data and every
# metric derived from it would not be repeatable.
cc = ClusterCentroids(random_state=11)
X_cc, y_cc = cc.fit_resample(X, y)

In [None]:
# (rows, columns) of the Cluster Centroids resampled feature matrix.
X_cc.shape

(69512, 21)

In [None]:
# Per-class row counts after Cluster Centroids under-sampling.
y_cc.value_counts()

Unnamed: 0_level_0,count
Diabetes,Unnamed: 1_level_1
0.0,34756
2.0,34756


In [None]:
# Hold out 20% of the Cluster Centroids output as a test split (seed 11).
(
    X_train_cc,
    X_test_cc,
    y_train_cc,
    y_test_cc,
) = train_test_split(X_cc, y_cc, random_state=11, test_size=0.2)

In [None]:
# 100-tree random forest (seed 11) trained on the Cluster Centroids training
# split. fit() returns the estimator, so construction and fitting are chained.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=11,
).fit(X_train_cc, y_train_cc)
rf_classifier

In [None]:
# Predicted labels for the held-out Cluster Centroids test split.
y_pred_cc = rf_classifier.predict(X=X_test_cc)

In [None]:
# Text report for the Cluster Centroids run: resampled class counts, then
# accuracy, per-class metrics and the confusion matrix on the test split.
# NOTE(review): the test split was taken from the resampled data, so these
# scores describe the resampled distribution rather than the original one.
print("Cluster Centroid Under Sampling Results:\n")
print("Class count:\n", y_cc.value_counts(), end="\n\n")
accuracy = accuracy_score(y_true=y_test_cc, y_pred=y_pred_cc)
print("Accuracy:", accuracy, end="\n\n")
print(
    "Classification Report:\n",
    classification_report(y_true=y_test_cc, y_pred=y_pred_cc),
    end="\n\n",
)
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test_cc, y_pred=y_pred_cc))

Cluster Centroid Under Sampling Results:

Class count:
 Diabetes
0.0    34756
2.0    34756
Name: count, dtype: int64

Accuracy: 0.9345465007552327

Classification Report:
               precision    recall  f1-score   support

         0.0       0.96      0.91      0.93      7053
         2.0       0.91      0.96      0.94      6850

    accuracy                           0.93     13903
   macro avg       0.94      0.93      0.93     13903
weighted avg       0.94      0.93      0.93     13903


Confusion Matrix:
 [[6419  634]
 [ 276 6574]]


In [None]:
# Under-sample with NearMiss. version=1 is the sampler's default variant,
# written out so the reader does not have to look it up.
nm = NearMiss(version=1)
X_nm, y_nm = nm.fit_resample(X, y)

In [None]:
# (rows, columns) of the NearMiss resampled feature matrix.
X_nm.shape

(69512, 21)

In [None]:
# Per-class row counts after NearMiss under-sampling.
y_nm.value_counts()

Unnamed: 0_level_0,count
Diabetes,Unnamed: 1_level_1
0.0,34756
2.0,34756


In [None]:
# Hold out 20% of the NearMiss output as a test split (seed 11).
(
    X_train_nm,
    X_test_nm,
    y_train_nm,
    y_test_nm,
) = train_test_split(X_nm, y_nm, random_state=11, test_size=0.2)

In [None]:
# 100-tree random forest (seed 11) trained on the NearMiss training split.
# fit() returns the estimator itself, so construction and fitting are chained.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=11,
).fit(X_train_nm, y_train_nm)
rf_classifier

In [None]:
# Predicted labels for the held-out NearMiss test split.
y_pred_nm = rf_classifier.predict(X=X_test_nm)

In [None]:
# Text report for the NearMiss run: resampled class counts, then accuracy,
# per-class metrics and the confusion matrix on the test split.
# NOTE(review): the test split was taken from the resampled data, so these
# scores describe the resampled distribution rather than the original one.
print("Near Miss Under Sampling Results:\n")
print("Class count:\n", y_nm.value_counts(), end="\n\n")
accuracy = accuracy_score(y_true=y_test_nm, y_pred=y_pred_nm)
print("Accuracy:", accuracy, end="\n\n")
print(
    "Classification Report:\n",
    classification_report(y_true=y_test_nm, y_pred=y_pred_nm),
    end="\n\n",
)
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test_nm, y_pred=y_pred_nm))

Near Miss Under Sampling Results:

Class count:
 Diabetes
0.0    34756
2.0    34756
Name: count, dtype: int64

Accuracy: 0.8402503056894195

Classification Report:
               precision    recall  f1-score   support

         0.0       0.82      0.87      0.85      7053
         2.0       0.86      0.81      0.83      6850

    accuracy                           0.84     13903
   macro avg       0.84      0.84      0.84     13903
weighted avg       0.84      0.84      0.84     13903


Confusion Matrix:
 [[6161  892]
 [1329 5521]]


In [None]:
# Tomek Links under-sampler, created with the library's default settings.
# It is applied to the full dataset in the next cell.
tl = TomekLinks()

In [None]:
# Apply Tomek Links to the full (pre-split) feature matrix and target.
X_tl, y_tl = tl.fit_resample(X, y)

In [None]:
# Hold out 20% of the Tomek-Links-cleaned data as a test split (seed 11).
# stratify=y_tl keeps the class ratio identical in the train and test splits;
# this data is still strongly imbalanced after cleaning, so a purely random
# split can shift the minority-class share between the two.
X_train_tl, X_test_tl, y_train_tl, y_test_tl = train_test_split(
    X_tl, y_tl, test_size=0.2, random_state=11, stratify=y_tl
)

In [None]:
# 100-tree random forest (seed 11) trained on the Tomek Links training split.
# fit() returns the estimator itself, so construction and fitting are chained.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=11,
).fit(X_train_tl, y_train_tl)
rf_classifier

In [None]:
# Predicted labels for the held-out Tomek Links test split.
y_pred_tl = rf_classifier.predict(X=X_test_tl)

In [None]:
# Text report for the Tomek Links run: resampled class counts, then accuracy,
# per-class metrics and the confusion matrix on the test split.
# NOTE(review): the test split was taken from the resampled data, so these
# scores describe the resampled distribution rather than the original one.
print("Tomek Links Under Sampling Results:\n")
print("Class count:\n", y_tl.value_counts(), end="\n\n")
accuracy = accuracy_score(y_true=y_test_tl, y_pred=y_pred_tl)
print("Accuracy:", accuracy, end="\n\n")
print(
    "Classification Report:\n",
    classification_report(y_true=y_test_tl, y_pred=y_pred_tl),
    end="\n\n",
)
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test_tl, y_pred=y_pred_tl))

Tomek Links Under Sampling Results:

Class count:
 Diabetes
0.0    200208
2.0     34756
Name: count, dtype: int64

Accuracy: 0.8576596514374482

Classification Report:
               precision    recall  f1-score   support

         0.0       0.88      0.97      0.92     40013
         2.0       0.55      0.24      0.33      6980

    accuracy                           0.86     46993
   macro avg       0.71      0.60      0.63     46993
weighted avg       0.83      0.86      0.83     46993


Confusion Matrix:
 [[38624  1389]
 [ 5300  1680]]


In [None]:
# Edited Nearest Neighbours under-sampler, created with the library's default
# settings. It is applied to the full dataset in the next cell.
enn = EditedNearestNeighbours()

In [None]:
# Apply Edited Nearest Neighbours to the full (pre-split) features and target.
X_enn, y_enn = enn.fit_resample(X, y)

In [None]:
# Hold out 20% of the ENN-cleaned data as a test split (seed 11).
# stratify=y_enn keeps the class ratio identical in the train and test splits;
# this data is still strongly imbalanced after cleaning, so a purely random
# split can shift the minority-class share between the two.
X_train_enn, X_test_enn, y_train_enn, y_test_enn = train_test_split(
    X_enn, y_enn, test_size=0.2, random_state=11, stratify=y_enn
)

In [None]:
# 100-tree random forest (seed 11) trained on the ENN training split.
# fit() returns the estimator itself, so construction and fitting are chained.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=11,
).fit(X_train_enn, y_train_enn)
rf_classifier

In [None]:
# Predicted labels for the held-out ENN test split.
y_pred_enn = rf_classifier.predict(X=X_test_enn)

In [None]:
# Text report for the Edited Nearest Neighbours run: resampled class counts,
# then accuracy, per-class metrics and the confusion matrix on the test split.
# NOTE(review): the test split was taken from the resampled data, so these
# scores describe the resampled distribution rather than the original one.
print("Edited Nearest Neighbors Under Sampling Results:\n")
print("Class count:\n", y_enn.value_counts(), end="\n\n")
accuracy = accuracy_score(y_true=y_test_enn, y_pred=y_pred_enn)
print("Accuracy:", accuracy, end="\n\n")
print(
    "Classification Report:\n",
    classification_report(y_true=y_test_enn, y_pred=y_pred_enn),
    end="\n\n",
)
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test_enn, y_pred=y_pred_enn))

Edited Nearest Neighbors Under Sampling Results:

Class count:
 Diabetes
0.0    154358
2.0     34756
Name: count, dtype: int64

Accuracy: 0.9656029400100468

Classification Report:
               precision    recall  f1-score   support

         0.0       0.96      1.00      0.98     30910
         2.0       0.99      0.82      0.90      6913

    accuracy                           0.97     37823
   macro avg       0.98      0.91      0.94     37823
weighted avg       0.97      0.97      0.96     37823


Confusion Matrix:
 [[30849    61]
 [ 1240  5673]]


Using Tomek Links or ENN as a first pass, followed by Cluster Centroids

In [None]:
# Stage 1: Tomek Links on the full dataset. This repeats the earlier
# Tomek Links resample so the cell can be run without it.
tl = TomekLinks()
X_tl, y_tl = tl.fit_resample(X, y)

# Stage 2: Cluster Centroids on the Tomek-Links-cleaned data.
# Seeded with the notebook's usual value (11); left unseeded, its clustering
# step can start from a different point on each run, so the resampled data
# and the metrics derived from it would not be repeatable.
cc = ClusterCentroids(random_state=11)
X_tlcc, y_tlcc = cc.fit_resample(X_tl, y_tl)

In [None]:
# Hold out 20% of the Tomek Links + Cluster Centroids output as a test split
# (seed 11).
(
    X_train_tlcc,
    X_test_tlcc,
    y_train_tlcc,
    y_test_tlcc,
) = train_test_split(X_tlcc, y_tlcc, random_state=11, test_size=0.2)

In [None]:
# 100-tree random forest (seed 11) trained on the Tomek Links + Cluster
# Centroids training split. fit() returns the estimator, so the calls chain.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=11,
).fit(X_train_tlcc, y_train_tlcc)
rf_classifier

In [None]:
# Predicted labels for the held-out Tomek Links + Cluster Centroids test split.
y_pred_tlcc = rf_classifier.predict(X=X_test_tlcc)

In [None]:
# Text report for the Tomek Links + Cluster Centroids run: resampled class
# counts, then accuracy, per-class metrics and the confusion matrix on the
# test split.
# NOTE(review): the test split was taken from the resampled data, so these
# scores describe the resampled distribution rather than the original one.
print("Tomek Links + Cluster Centroids Under Sampling Results:\n")
print("Class count:\n", y_tlcc.value_counts(), end="\n\n")
accuracy = accuracy_score(y_true=y_test_tlcc, y_pred=y_pred_tlcc)
print("Accuracy:", accuracy, end="\n\n")
print(
    "Classification Report:\n",
    classification_report(y_true=y_test_tlcc, y_pred=y_pred_tlcc),
    end="\n\n",
)
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test_tlcc, y_pred=y_pred_tlcc))

Tomek Links + Cluster Centroids Under Sampling Results:

Class count:
 Diabetes
0.0    34756
2.0    34756
Name: count, dtype: int64

Accuracy: 0.9344745738329857

Classification Report:
               precision    recall  f1-score   support

         0.0       0.95      0.91      0.93      6915
         2.0       0.92      0.96      0.94      6988

    accuracy                           0.93     13903
   macro avg       0.94      0.93      0.93     13903
weighted avg       0.94      0.93      0.93     13903


Confusion Matrix:
 [[6312  603]
 [ 308 6680]]


In [None]:
# Stage 1: Edited Nearest Neighbours on the full dataset. This repeats the
# earlier ENN resample so the cell can be run without it.
enn = EditedNearestNeighbours()
X_enn, y_enn = enn.fit_resample(X, y)

# Stage 2: Cluster Centroids on the ENN-cleaned data.
# Seeded with the notebook's usual value (11); left unseeded, its clustering
# step can start from a different point on each run, so the resampled data
# and the metrics derived from it would not be repeatable.
cc = ClusterCentroids(random_state=11)
X_enncc, y_enncc = cc.fit_resample(X_enn, y_enn)

In [None]:
# Hold out 20% of the ENN + Cluster Centroids output as a test split (seed 11).
(
    X_train_enncc,
    X_test_enncc,
    y_train_enncc,
    y_test_enncc,
) = train_test_split(X_enncc, y_enncc, random_state=11, test_size=0.2)

In [None]:
# 100-tree random forest (seed 11) trained on the ENN + Cluster Centroids
# training split. fit() returns the estimator, so the calls chain.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=11,
).fit(X_train_enncc, y_train_enncc)
rf_classifier

In [None]:
# Predicted labels for the held-out ENN + Cluster Centroids test split.
y_pred_enncc = rf_classifier.predict(X=X_test_enncc)

In [None]:
# Text report for the ENN + Cluster Centroids run: resampled class counts,
# then accuracy, per-class metrics and the confusion matrix on the test split.
# NOTE(review): the test split was taken from the resampled data, so these
# scores describe the resampled distribution rather than the original one.
print("Edited Nearest Neighbors + Cluster Centroids Under Sampling Results:\n")
print("Class count:\n", y_enncc.value_counts(), end="\n\n")
accuracy = accuracy_score(y_true=y_test_enncc, y_pred=y_pred_enncc)
print("Accuracy:", accuracy, end="\n\n")
print(
    "Classification Report:\n",
    classification_report(y_true=y_test_enncc, y_pred=y_pred_enncc),
    end="\n\n",
)
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test_enncc, y_pred=y_pred_enncc))

Edited Nearest Neighbors + Cluster Centroids Under Sampling Results:

Class count:
 Diabetes
0.0    34756
2.0    34756
Name: count, dtype: int64

Accuracy: 0.9246925124073941

Classification Report:
               precision    recall  f1-score   support

         0.0       0.94      0.91      0.92      7053
         2.0       0.91      0.94      0.92      6850

    accuracy                           0.92     13903
   macro avg       0.92      0.92      0.92     13903
weighted avg       0.93      0.92      0.92     13903


Confusion Matrix:
 [[6417  636]
 [ 411 6439]]
