In [20]:
import torch
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, GridSearchCV

## Loading dataset

In [21]:
df = pd.read_csv('./beophi.csv', index_col = 0)

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 773 entries, 0 to 772
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               773 non-null    int64  
 1   Glucose                   772 non-null    float64
 2   BloodPressure             773 non-null    int64  
 3   SkinThickness             773 non-null    int64  
 4   Insulin                   773 non-null    int64  
 5   BMI                       772 non-null    float64
 6   DiabetesPedigreeFunction  773 non-null    float64
 7   Age                       773 non-null    int64  
 8   Outcome                   773 non-null    int64  
dtypes: float64(3), int64(6)
memory usage: 60.4 KB


## Data Cleaning

In [23]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,773.0,772.0,773.0,773.0,773.0,772.0,773.0,773.0,773.0
mean,3.824062,121.813472,69.952135,20.552393,79.380336,37.513342,0.469982,33.191462,0.351876
std,3.369336,40.868603,30.392494,15.927667,114.990798,155.004046,0.331145,11.744123,0.480568
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,19.0,0.0
25%,1.0,99.0,64.0,0.0,0.0,27.275,0.24,24.0,0.0
50%,3.0,117.0,72.0,23.0,25.0,32.0,0.37,29.0,0.0
75%,6.0,141.0,80.0,32.0,126.0,36.6,0.624,41.0,1.0
max,17.0,830.0,722.0,99.0,846.0,4333.1,2.42,81.0,2.0


Ta có thể thấy được giá trị max của 3 cột BloodPressure, Insulin và BMI lớn bất thường so với các giá trị khác.

Cột Outcome cũng chỉ nên có giá trị là 0 hoặc 1 nhưng lại xuất hiện giá trị 2.

In [None]:
df.isnull().sum()

In [24]:
df = df[df["BMI"] < 4333.1]
df[df["BMI"] >= 4333.1]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome


In [25]:
df = df[df["BloodPressure"] < 722]
df[df["BloodPressure"] >= 722]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome


In [26]:
df = df[df["Glucose"] < 830]
df[df["Glucose"] >= 830]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome


In [27]:
df = df[df["Outcome"] < 2]
df[df["Outcome"] >= 2]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome


In [28]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

Khi sử dụng hàm isnull() để kiểm tra các giá trị NULL đều sẽ cho kết quả = 0.
Tuy nhiên, khi sử dụng hàm info() thì ta thấy được cột BMI và cột Glucose có giá trị NULL.

Để giải quyết vấn đề này ta sẽ thử qua các phương pháp

- Drop record bị khuyết
- Fill bằng giá trị 0
- Fill bằng giá trị trung bình toàn cục
- Fill bằng "KNN Imputer"

In [29]:
df_mean = df.copy(deep=True)
df_zero = df.copy(deep=True)
df_imputed = df.copy(deep=True)
df_drop = df.copy(deep=True)

In [30]:
df_zero['Glucose'] = df_mean['Glucose'].fillna(0)
df_mean['BMI'] = df_zero['BMI'].fillna(0)

df_zero.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [31]:
df_mean['Glucose'] = df_mean['Glucose'].fillna(df_mean['Glucose'].mean())
df_mean['BMI'] = df_mean['BMI'].fillna(df_mean['BMI'].mean())

df_mean.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [32]:
imputer = KNNImputer(n_neighbors=5, weights="uniform")
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
df_imputed.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [33]:
df_drop = df_drop.dropna()
df_drop.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

## Data Splitting

In [34]:
y_zero = df_zero["Outcome"]  
x_zero = df_zero.drop(columns=["Outcome"], axis=1) 

In [35]:
y_mean = df_mean["Outcome"]  
x_mean = df_mean.drop(columns=["Outcome"], axis=1)

In [36]:
y_drop = df_drop["Outcome"]  
x_drop = df_drop.drop(columns=["Outcome"], axis=1)

In [37]:
y_imputed = df_imputed["Outcome"]  
x_imputed = df_imputed.drop(columns=["Outcome"], axis=1)

In [38]:
x_train_zero, x_test_zero, y_train_zero, y_test_zero = train_test_split(x_zero, y_zero, test_size=0.2 ,random_state=221)
x_train_drop, x_test_drop, y_train_drop, y_test_drop = train_test_split(x_drop, y_drop, test_size=0.2 ,random_state=221)
x_train_mean, x_test_mean, y_train_mean, y_test_mean = train_test_split(x_mean, y_mean, test_size=0.2 ,random_state=221)
x_train_imputed, x_test_imputed, y_train_imputed, y_test_imputed = train_test_split(x_imputed, y_imputed, test_size=0.2 ,random_state=221)

In [None]:
x_test_mean.columns

In [None]:
x_test_imputed.head()

In [None]:
y_test_imputed.head()

## GridSearchCV

In [None]:
rf_clf = RandomForestClassifier(random_state=221)
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [2, 3, 4]
}
grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=10, scoring='accuracy', n_jobs=-1)
grid_search.fit(x_train_imputed, y_train_imputed)

In [None]:
best_params = grid_search.best_params_

## Training and Evaluating

In [None]:
random_forest_drop = grid_search.best_estimator_

random_forest_drop.fit(x_train_drop, y_train_drop)
y_pred = random_forest_drop.predict(x_test_drop)

print(classification_report(y_test_drop, y_pred))

NameError: name 'grid_search' is not defined

In [40]:
random_forest_zero = grid_search.best_estimator_

random_forest_zero.fit(x_train_zero, y_train_zero)
y_pred = random_forest_zero.predict(x_test_zero)

print(classification_report(y_test_zero, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.86      0.78        93
           1       0.70      0.49      0.58        61

    accuracy                           0.71       154
   macro avg       0.71      0.68      0.68       154
weighted avg       0.71      0.71      0.70       154



In [41]:
random_forest_mean = grid_search.best_estimator_

random_forest_mean.fit(x_train_mean, y_train_mean)
y_pred = random_forest_mean.predict(x_test_mean)

print(classification_report(y_test_mean, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.86      0.78        93
           1       0.70      0.49      0.58        61

    accuracy                           0.71       154
   macro avg       0.71      0.68      0.68       154
weighted avg       0.71      0.71      0.70       154



In [42]:
random_forest_imputed = grid_search.best_estimator_

random_forest_imputed.fit(x_train_imputed, y_train_imputed)
y_pred = random_forest_imputed.predict(x_test_imputed)

print(classification_report(y_test_imputed, y_pred))

              precision    recall  f1-score   support

         0.0       0.72      0.86      0.78        93
         1.0       0.70      0.49      0.58        61

    accuracy                           0.71       154
   macro avg       0.71      0.68      0.68       154
weighted avg       0.71      0.71      0.70       154



## Feature selection

In [None]:
corr_matrix = df.corr()
corr_matrix = corr_matrix["Outcome"].to_frame()
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='Blues', fmt='.3f', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()

## Single feature

In [44]:
y_imputed = df_imputed["Outcome"]  
x_imputed = df_imputed.drop(columns=["Outcome"], axis=1)[["Glucose"]] 

In [None]:
x_imputed.head()

In [45]:
x_train_imputed, x_test_imputed, y_train_imputed, y_test_imputed = train_test_split(x_imputed, y_imputed, test_size=0.2 ,random_state=221)

In [46]:
random_forest_imputed = grid_search.best_estimator_

random_forest_imputed.fit(x_train_imputed, y_train_imputed)
y_pred = random_forest_imputed.predict(x_test_zero)

print(classification_report(y_test_imputed, y_pred))

              precision    recall  f1-score   support

         0.0       0.70      0.84      0.76        93
         1.0       0.64      0.44      0.52        61

    accuracy                           0.68       154
   macro avg       0.67      0.64      0.64       154
weighted avg       0.68      0.68      0.67       154



## Multi-selective features

In [47]:
y_imputed = df_imputed["Outcome"]  
x_imputed = df_imputed.drop(columns=["Outcome"], axis=1)[["Pregnancies", "Glucose", "BMI", "Age"]] 

In [None]:
x_imputed.head()

In [48]:

x_train_imputed, x_test_imputed, y_train_imputed, y_test_imputed = train_test_split(x_imputed, y_imputed, test_size=0.2 ,random_state=221)

In [49]:
random_forest_imputed = RandomForestClassifier(criterion='gini', bootstrap=True, n_estimators=100, random_state=211)
random_forest_imputed.fit(x_train_imputed, y_train_imputed)
y_pred = random_forest_imputed.predict(x_test_imputed)

print(classification_report(y_test_imputed, y_pred))

              precision    recall  f1-score   support

         0.0       0.74      0.83      0.78        93
         1.0       0.68      0.56      0.61        61

    accuracy                           0.72       154
   macro avg       0.71      0.69      0.70       154
weighted avg       0.72      0.72      0.71       154



In [None]:
df = pd.read_csv('./beophi.csv', index_col = 0)

In [None]:
imputer = KNNImputer(n_neighbors=5, weights="uniform")
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

In [None]:
y = df_imputed["Outcome"]  
x = df_imputed.drop(columns=["Outcome"], axis=1)[["Pregnancies", "Glucose", "BMI", "Age"]] 

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2 ,random_state=221)

In [None]:
rf = RandomForestClassifier(criterion='gini', bootstrap=True, n_estimators=100, random_state=211)


In [None]:
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)

print(classification_report(y_test, y_pred))