# Program Pre-processing vs No pre-processing

In [11]:
import pandas as pd
import autogluon
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split

In [12]:
npp_data = pd.read_csv("diabetes_dataset.csv")
pp_data = pd.read_csv("diabetes_dataset_DONE.csv")

In [13]:
from sklearn.feature_selection import mutual_info_classif

X = npp_data.drop("diagnosed_diabetes", axis=1)
y = npp_data["diagnosed_diabetes"]

npp_data.corr(numeric_only=True)["diagnosed_diabetes"].sort_values(ascending=False)

diagnosed_diabetes                    1.000000
hba1c                                 0.679397
glucose_postprandial                  0.629832
glucose_fasting                       0.510919
diabetes_risk_score                   0.277300
family_history_diabetes               0.197926
age                                   0.137713
bmi                                   0.097057
systolic_bp                           0.095481
waist_to_hip_ratio                    0.078918
ldl_cholesterol                       0.067475
cholesterol_total                     0.058173
insulin_level                         0.057715
triglycerides                         0.056230
diastolic_bp                          0.035619
cardiovascular_history                0.029793
hypertension_history                  0.027524
heart_rate                            0.022785
screen_time_hours_per_day             0.018127
alcohol_consumption_per_week          0.000760
sleep_hours_per_day                  -0.000399
diet_score   

In [14]:
# cols_to_drop = [
#     "family_history_diabetes",
#     "bmi",
#     "cholesterol_total",
#     "glucose_fasting",
#     "glucose_postprandial",
#     "diabetes_stage"
# ]

# npp_data = npp_data.drop(cols_to_drop, axis=1)

In [15]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)
X = npp_data.drop("diagnosed_diabetes", axis=1)
y = npp_data["diagnosed_diabetes"]

num_cols = X.select_dtypes(include=["number"]).columns
X_num = X[num_cols]

X_train, X_test, y_train, y_test = train_test_split(
    X_num, y, test_size=0.3, random_state=42, shuffle=True
)

pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train)
X_test_pca  = pca.transform(X_test)

### 1. No pre-processing autogluon

In [16]:
train, test = train_test_split(
    npp_data,
    test_size=0.3,
    random_state=42,
    shuffle=True
)

train_data = TabularDataset(train)

predictor = TabularPredictor(label='diagnosed_diabetes', path='modelePPvsNPP/NPP').fit(train_data, presets="medium", save_space=True, keep_only_best=True)

test_data = TabularDataset(test)

predictions = predictor.predict(test_data)
print(predictions)

leaderboard = predictor.leaderboard()
print(leaderboard)

print(predictor.evaluate(train_data))



print("----------hiper-----------")

model_name = predictor.model_best
model_info = predictor.info()
model_info['model_info'][model_name]['hyperparameters']

Preset alias specified: 'medium' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.11.5
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 22.6.0: Wed Jul  5 22:17:35 PDT 2023; root:xnu-8796.141.3~6/RELEASE_ARM64_T8112
CPU Count:          8
Memory Avail:       5.97 GB / 16.00 GB (37.3%)
Disk Space Avail:   785.63 GB / 926.35 GB (84.8%)
Presets specified: ['medium']
Using hyperparameters preset: hyperparameters='default'
Beginning AutoGluon training ...
AutoGluon will save models to "/Users/ntix/Desktop/Programowanie/SUML/suml_15_grupa5/modelePPvsNPP/NPP"
Train Data Rows:    70000
Train Data Columns: 30
Label Column:       diagnosed_diabetes
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during Predictor in

75721    0
80184    1
19864    0
76699    0
92991    1
        ..
42648    0
86306    0
45466    1
63724    0
34122    1
Name: diagnosed_diabetes, Length: 30000, dtype: int64
                 model  score_val eval_metric  pred_time_val  fit_time  \
0       ExtraTreesGini        1.0    accuracy       0.027197  1.006780   
1  WeightedEnsemble_L2        1.0    accuracy       0.027498  1.044042   

   pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  \
0                0.027197           1.006780            1       True   
1                0.000301           0.037262            2       True   

   fit_order  
0          1  
1          2  
{'accuracy': 1.0, 'balanced_accuracy': 1.0, 'mcc': 1.0, 'roc_auc': 1.0, 'f1': 1.0, 'precision': 1.0, 'recall': 1.0}
----------hiper-----------


{'use_orig_features': False,
 'valid_stacker': True,
 'max_base_models': 0,
 'max_base_models_per_type': 'auto',
 'save_bag_folds': True,
 'stratify': 'auto',
 'bin': 'auto',
 'n_bins': None}

### 2. Pre-processing autogluon

In [17]:
train, test = train_test_split(
    pp_data,
    test_size=0.3,
    random_state=42,
    shuffle=True
)

train_data = TabularDataset(train)

predictor = TabularPredictor(label='diagnosed_diabetes', path='modelePPvsNPP/PP').fit(train_data, presets="medium", save_space=True, keep_only_best=True)

test_data = TabularDataset(test)

predictions = predictor.predict(test_data)
print(predictions)

leaderboard = predictor.leaderboard()
print(leaderboard)

print(predictor.evaluate(train_data))



print("----------hiper-----------")

model_name = predictor.model_best
model_info = predictor.info()
model_info['model_info'][model_name]['hyperparameters']

Preset alias specified: 'medium' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.11.5
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 22.6.0: Wed Jul  5 22:17:35 PDT 2023; root:xnu-8796.141.3~6/RELEASE_ARM64_T8112
CPU Count:          8
Memory Avail:       5.71 GB / 16.00 GB (35.7%)
Disk Space Avail:   785.57 GB / 926.35 GB (84.8%)
Presets specified: ['medium']
Using hyperparameters preset: hyperparameters='default'
Beginning AutoGluon training ...
AutoGluon will save models to "/Users/ntix/Desktop/Programowanie/SUML/suml_15_grupa5/modelePPvsNPP/PP"
Train Data Rows:    58080
Train Data Columns: 16
Label Column:       diagnosed_diabetes
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1.0, 0.0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during Predictor

26892    1.0
79545    0.0
14554    0.0
51734    1.0
21371    1.0
        ... 
43079    0.0
26629    0.0
40625    0.0
29478    1.0
17817    1.0
Name: diagnosed_diabetes, Length: 24892, dtype: float64
                 model  score_val eval_metric  pred_time_val  fit_time  \
0             LightGBM      0.908    accuracy       0.000649  0.421013   
1  WeightedEnsemble_L2      0.908    accuracy       0.000945  0.459064   

   pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  \
0                0.000649           0.421013            1       True   
1                0.000296           0.038051            2       True   

   fit_order  
0          1  
1          2  
{'accuracy': 0.9115702479338843, 'balanced_accuracy': 0.9257947742338053, 'mcc': 0.8355979703444808, 'roc_auc': 0.9455088035049731, 'f1': 0.9200373657169547, 'precision': 0.9993911719939117, 'recall': 0.8523582864560796}
----------hiper-----------


{'use_orig_features': False,
 'valid_stacker': True,
 'max_base_models': 0,
 'max_base_models_per_type': 'auto',
 'save_bag_folds': True,
 'stratify': 'auto',
 'bin': 'auto',
 'n_bins': None}