## Setting up to Work

The first part of the process: importing the libraries and dependencies.

In [3]:
import pandas as pd
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

Loading the dataset after downloading it from Kaggle: https://www.kaggle.com/datasets/iammustafatz/diabetes-prediction-dataset.

In [8]:
# Read the downloaded CSV (path is relative to the notebook's working directory) into a DataFrame.
df = pd.read_csv("diabetes_prediction_dataset.csv")


In [9]:
# Column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [10]:
df.describe()  # -> 8.5% of the cases are positive (mean of `diabetes` is 0.085).

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [117]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [118]:
# Missing-value check: report the number of NAs found in each column.
na_counts = df.isna().sum()
for col_name, n_missing in na_counts.items():
    print(f"{col_name} - {n_missing} NAs")

gender - 0 NAs
age - 0 NAs
hypertension - 0 NAs
heart_disease - 0 NAs
smoking_history - 0 NAs
bmi - 0 NAs
HbA1c_level - 0 NAs
blood_glucose_level - 0 NAs
diabetes - 0 NAs


In [119]:
# Target: the `diabetes` column.
Y = df['diabetes']
# Features: every column except the target.
X = df.drop(columns=[Y.name])

In [120]:

# One-hot encode the two categorical columns; every other column is passed
# through unchanged (remainder='passthrough').
categorical_encoder = OneHotEncoder(categories='auto', sparse_output=False)
CT = ColumnTransformer(
    transformers=[
        ('onehot', categorical_encoder, ['gender', 'smoking_history']),
        # Disabled alternative: ordinal-encode `smoking_history` instead.
        #('ordinal', OrdinalEncoder(categories=[['never','No Info', 'not current', 'former', 'current', 'ever']]), ['smoking_history'])
    ],
    remainder='passthrough',
)

# Fit the transformer on the feature frame and transform it.
df_transformed = CT.fit_transform(X)

In [121]:
# Mutual Information: build the numeric feature frame used for the MI scores.
y = df['diabetes']
X_features = df.drop(columns='diabetes')

# One-hot encode both categorical columns ('gender' and 'smoking_history').
# Note: this re-fits the shared `CT` on these two columns only.
categorical_cols = ['gender', 'smoking_history']
X_encoded = CT.fit_transform(X_features[categorical_cols])
# Names of the columns produced by the encoder.
encoded_cols = CT.get_feature_names_out(categorical_cols)

# Wrap the encoded values in a DataFrame that keeps the original row index.
X_encoded_df = pd.DataFrame(X_encoded, columns=encoded_cols, index=X_features.index)

# Swap the original categorical columns for their encoded counterparts.
X_features = pd.concat(
    [X_features.drop(columns=categorical_cols), X_encoded_df],
    axis=1,
)

In [122]:
# Preview the encoded feature frame.
X_features.head()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,onehot__gender_Female,onehot__gender_Male,onehot__gender_Other,onehot__smoking_history_No Info,onehot__smoking_history_current,onehot__smoking_history_ever,onehot__smoking_history_former,onehot__smoking_history_never,onehot__smoking_history_not current
0,80.0,0,1,25.19,6.6,140,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,54.0,0,0,27.32,6.6,80,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,28.0,0,0,27.32,5.7,158,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,36.0,0,0,23.45,5.0,155,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,76.0,1,1,20.14,4.8,155,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


### Cálculo de Mutual Information

In [None]:
# Compute the mutual information (MI) between each feature and the target.
#
# With its default `discrete_features='auto'`, `mutual_info_classif` treats
# every column of a dense input as continuous. The 0/1 flag columns and the
# one-hot columns are categorical, so they are flagged as discrete explicitly.
BINARY_FLAG_COLS = ['hypertension', 'heart_disease']
discrete_mask = (
    X_features.columns.str.startswith('onehot__')
    | X_features.columns.isin(BINARY_FLAG_COLS)
)

# The estimator adds random noise to continuous features; a fixed seed keeps
# the scores identical across "Restart & Run All".
MI_RANDOM_STATE = 42
mi_scores = mutual_info_classif(
    X_features,
    y,
    discrete_features=discrete_mask,
    random_state=MI_RANDOM_STATE,
)

# Show the features ranked from most to least informative.
mi_df = pd.DataFrame({'Feature': X_features.columns, 'MI Score': mi_scores})
mi_df.sort_values(by='MI Score', ascending=False)


### Cálculo de Correlação
O cálculo de correlação implica a existência de colinearidade entre a diabetes e algumas variáveis.

In [128]:
# Correlation: put the target next to the features, then rank every column
# by its correlation with `diabetes`.
df_trans = pd.concat((X_features, y), axis=1)
target_corr = df_trans.corr()['diabetes']
target_corr.sort_values(ascending=False)


diabetes                               1.000000
blood_glucose_level                    0.419558
HbA1c_level                            0.400660
age                                    0.258008
bmi                                    0.214357
hypertension                           0.197823
heart_disease                          0.171727
onehot__smoking_history_former         0.097917
onehot__gender_Male                    0.037666
onehot__smoking_history_never          0.027267
onehot__smoking_history_ever           0.024080
onehot__smoking_history_not current    0.020734
onehot__smoking_history_current        0.019606
onehot__gender_Other                  -0.004090
onehot__gender_Female                 -0.037553
onehot__smoking_history_No Info       -0.118939
Name: diabetes, dtype: float64