In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids, NearMiss, TomekLinks, EditedNearestNeighbours

In [None]:
# Load the cleaned BRFSS extracts for 2019 and 2021.
# NOTE(review): the absolute path assumes a Colab session with Google Drive
# mounted at /content/drive — adjust DATA_DIR when running elsewhere.
DATA_DIR = '/content/drive/MyDrive/CSE572 Data Mining'
df2019 = pd.read_csv(f'{DATA_DIR}/Cleaned_BRFSS2019.csv')
df2021 = pd.read_csv(f'{DATA_DIR}/Cleaned_BRFSS2021.csv')

# Stack both years into a single frame. read_csv gives each file its own
# 0-based index, so a plain concat would repeat row labels; ignore_index=True
# assigns a fresh 0..n-1 index so every row of combined_df has a unique label.
combined_df = pd.concat([df2019, df2021], ignore_index=True)

Using **only the 2019** dataset for both training and testing

In [None]:
# Keep only the rows whose Diabetes label is not 1 (the class counts below
# show labels 0 and 2 remain).
# NOTE(review): presumably label 1 is the pre-diabetes class — confirm against the data dictionary.
only2019 = df2019.loc[df2019['Diabetes'] != 1]

In [None]:
# Class balance of the filtered 2019 data (displayed as the cell output).
only2019['Diabetes'].value_counts()

Unnamed: 0_level_0,count
Diabetes,Unnamed: 1_level_1
0.0,147245
2.0,24494


In [None]:
# Split the filtered 2019 frame into the label column and the predictors.
y = only2019['Diabetes']               # target variable
X = only2019.drop(columns='Diabetes')  # every remaining column is a feature

In [None]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible across runs.
# NOTE(review): the split is not stratified on y — consider whether the class
# ratio should be preserved explicitly given the imbalance shown above.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Random forest with 150 trees and a fixed seed, fitted on the 2019 training split.
rf_classifier = RandomForestClassifier(
    n_estimators=150,
    random_state=42,
)
rf_classifier.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out portion of the 2019 data.
y_pred = rf_classifier.predict(X=X_test)

In [None]:
# Score the held-out predictions up front, then print the summary.
accuracy = accuracy_score(y_test, y_pred)

print("Results after using ONLY 2019 data for both train and test:\n")

# Label distribution of the whole filtered 2019 data (before the split).
print("Class count:\n", y.value_counts())
print("")

print("Accuracy:", accuracy)
print("")

# Per-class precision / recall / F1 on the test split.
print("Classification Report:\n", classification_report(y_test, y_pred))
print("")

# Rows are true labels, columns are predicted labels.
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Results after using ONLY 2019 data for both train and test:

Class count:
 Diabetes
0.0    147245
2.0     24494
Name: count, dtype: int64

Accuracy: 0.8597589379294283

Classification Report:
               precision    recall  f1-score   support

         0.0       0.88      0.97      0.92     29502
         2.0       0.51      0.20      0.28      4846

    accuracy                           0.86     34348
   macro avg       0.69      0.58      0.60     34348
weighted avg       0.83      0.86      0.83     34348


Confusion Matrix:
 [[28578   924]
 [ 3893   953]]


Using **2019 data for training** and **2021 data for testing**

In [None]:
# Cross-year setup: 2019 is the training year, 2021 the test year.
# Rows labelled Diabetes == 1 are excluded from both years.
train2019 = df2019.loc[df2019['Diabetes'] != 1]
test2021 = df2021.loc[df2021['Diabetes'] != 1]

In [None]:
# Training labels/features come from the filtered 2019 frame.
y_train = train2019['Diabetes']               # target variable
X_train = train2019.drop(columns='Diabetes')  # features

# Test labels/features come from the filtered 2021 frame.
# NOTE(review): assumes both years share the same feature columns in the same
# order — confirm, since the model is fitted on one and applied to the other.
y_test = test2021['Diabetes']               # target variable
X_test = test2021.drop(columns='Diabetes')  # features

In [None]:
# Fresh 150-tree random forest (fixed seed), fitted on all filtered 2019 rows.
rf_classifier = RandomForestClassifier(
    n_estimators=150,
    random_state=42,
)
rf_classifier.fit(X_train, y_train)

In [None]:
# Apply the 2019-trained model to the 2021 rows.
y_pred = rf_classifier.predict(X=X_test)

In [None]:
# Score the 2021 predictions up front, then print the summary.
accuracy = accuracy_score(y_test, y_pred)

print("Results after using 2019 data for training and 2021 data for testing:\n")

# Label distribution of each year's filtered data.
print("Class count in training:\n", y_train.value_counts())
print("")
print("Class count in testing:\n", y_test.value_counts())
print("")

print("Accuracy:", accuracy)
print("")

# Per-class precision / recall / F1 on the 2021 rows.
print("Classification Report:\n", classification_report(y_test, y_pred))
print("")

# Rows are true labels, columns are predicted labels.
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Results after using 2019 data for training and 2021 data for testing:

Class count in training:
 Diabetes
0.0    147245
2.0     24494
Name: count, dtype: int64

Class count in testing:
 Diabetes
0.0    60862
2.0    10262
Name: count, dtype: int64

Accuracy: 0.8567572127551881

Classification Report:
               precision    recall  f1-score   support

         0.0       0.88      0.97      0.92     60862
         2.0       0.51      0.19      0.27     10262

    accuracy                           0.86     71124
   macro avg       0.69      0.58      0.60     71124
weighted avg       0.82      0.86      0.83     71124


Confusion Matrix:
 [[59004  1858]
 [ 8330  1932]]


Using **combined_df** (the 2019 and 2021 data concatenated) and splitting it into train and test sets

In [None]:
# Drop the rows labelled Diabetes == 1 from the combined frame.
# This rebinds combined_df to the filtered result; running the cell again is
# harmless because the remaining rows already satisfy the filter.
combined_df = combined_df.loc[combined_df['Diabetes'] != 1]

In [None]:
# Class balance of the filtered combined data (displayed as the cell output).
combined_df['Diabetes'].value_counts()

Unnamed: 0_level_0,count
Diabetes,Unnamed: 1_level_1
0.0,208107
2.0,34756


In [None]:
# Split the filtered combined frame into the label column and the predictors.
y = combined_df['Diabetes']               # target variable
X = combined_df.drop(columns='Diabetes')  # every remaining column is a feature

In [None]:
# Hold out 20% of the combined rows for testing; random_state pins the shuffle
# so the split is reproducible across runs.
# NOTE(review): the split is not stratified on y — consider whether the class
# ratio should be preserved explicitly given the imbalance shown above.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fresh 150-tree random forest (fixed seed), fitted on the combined training split.
rf_classifier = RandomForestClassifier(
    n_estimators=150,
    random_state=42,
)
rf_classifier.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out portion of the combined data.
y_pred = rf_classifier.predict(X=X_test)

In [None]:
# Score the held-out predictions up front, then print the summary.
accuracy = accuracy_score(y_test, y_pred)

print("Results after using combined_df data for both train and test:\n")

# Label distribution of the whole filtered combined data (before the split).
print("Class count:\n", y.value_counts())
print("")

print("Accuracy:", accuracy)
print("")

# Per-class precision / recall / F1 on the test split.
print("Classification Report:\n", classification_report(y_test, y_pred))
print("")

# Rows are true labels, columns are predicted labels.
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Results after using combined_df data for both train and test:

Class count:
 Diabetes
0.0    208107
2.0     34756
Name: count, dtype: int64

Accuracy: 0.8558870154200894

Classification Report:
               precision    recall  f1-score   support

         0.0       0.88      0.97      0.92     41496
         2.0       0.51      0.19      0.28      7077

    accuracy                           0.86     48573
   macro avg       0.69      0.58      0.60     48573
weighted avg       0.82      0.86      0.83     48573


Confusion Matrix:
 [[40245  1251]
 [ 5749  1328]]
