# 1. Basic Setup

In [482]:
# Imports
import pandas as pd
import numpy as np

from sklearn import metrics

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

In [483]:
# Dataset importing
dataset = pd.read_csv('https://github.com/PedroG022/diabetes-classification/raw/main/dataset/diabetes_prediction_dataset.csv')

In [484]:
# Utilities
def column_names(dataframe):
  return [key for key in dataframe.columns]

# 2. Dataset viewing

In [485]:
# Basic info
print(dataset.info())
print(dataset.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


None

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [486]:
# Imbalanced dataset
dataset['diabetes'].value_counts()

0    91500
1     8500
Name: diabetes, dtype: int64

In [487]:
# Smoking history values
dataset['smoking_history'].values

array(['never', 'No Info', 'never', ..., 'former', 'never', 'current'],
      dtype=object)

# 3. Pre-processing

In [488]:
# Gender normalization
gender_encoder = LabelEncoder()
gender_label = gender_encoder.fit_transform(dataset['gender'])

dataset['gender'] = gender_label
dataset.gender.value_counts()

# Smoking history normalization
smoking_encoder = LabelEncoder()
smoking_label = smoking_encoder.fit_transform(dataset['smoking_history'])

dataset['smoking_history'] = smoking_label
dataset.smoking_history.value_counts()

0    35816
4    35095
3     9352
1     9286
5     6447
2     4004
Name: smoking_history, dtype: int64

In [489]:
# Normalization
for column in column_names(dataset):
  X = np.array(dataset[column]).reshape(-1,1)

  scaler = MinMaxScaler()
  scaler.fit(X)

  dataset[column] = scaler.transform(X).reshape(1,-1)[0]

# 4. Undersampling

In [490]:
# Didn't get good results with this

# undersample = RandomUnderSampler()
# X_res, y_res = undersample.fit_resample(X_train,y_train)

# 5. Training

In [491]:
X = dataset.drop('diabetes', axis=1)
y = dataset.diabetes

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
model = RandomForestClassifier()
model.fit(X_train, y_train)

# 6. Testing

In [492]:
y_pred = model.predict(X_test)

print(pd.crosstab(y_test,y_pred, rownames=['Real'], colnames=['Predito'], margins=True))
print(metrics.classification_report(y_test,y_pred))

Predito    0.0   1.0    All
Real                       
0.0      18224    66  18290
1.0        547  1163   1710
All      18771  1229  20000
              precision    recall  f1-score   support

         0.0       0.97      1.00      0.98     18290
         1.0       0.95      0.68      0.79      1710

    accuracy                           0.97     20000
   macro avg       0.96      0.84      0.89     20000
weighted avg       0.97      0.97      0.97     20000
