In [1]:
# Diabetes Analysis Notebook
import sys
import os
import pandas as pd
from sklearn.metrics import roc_auc_score

# Put the DiaPredict folder itself (resolved against the current working
# directory) at the end of the import search path.
# NOTE(review): the `from DiaPredict.<module> import ...` lines below resolve
# from the directory that *contains* DiaPredict, not from this entry; this
# entry presumably lets modules inside the package import their siblings by
# bare name -- confirm against the package sources.
diapredict_dir = os.path.abspath('DiaPredict')
sys.path.append(diapredict_dir)

# Now you can import your classes
from DiaPredict.data_loader import DataLoader
from DiaPredict.preprocessor import RemoveNaNPreprocessor, FillNaNPreprocessor
from DiaPredict.feature_extraction import FeatureTransformer1, FeatureTransformer2
from DiaPredict.model import Model

# In this notebook, we import the necessary classes, load and preprocess the diabetes dataset, create features, train a model, and make predictions. We also compute the ROC AUC score.

In [2]:
# Step 1: Load the data
# The loader is pointed at the sample CSV; load_data() hands back two frames
# that are unpacked as the train and test splits.
data_loader = DataLoader(file_path='sample_diabetes_mellitus_data.csv')
train_df, test_df = data_loader.load_data()

# Report (rows, columns) for each split, one line per split
print(
    f'Train shape: {train_df.shape}',
    f'Test shape: {test_df.shape}',
    sep='\n',
)


Train shape: (8000, 53)
Test shape: (2000, 53)


In [3]:
# Step 2: Preprocess the data
# Each preprocessor is applied to the train split first and the test split
# second; the removal step runs to completion before the fill step is created.
remove_nan_preprocessor = RemoveNaNPreprocessor()
train_df, test_df = (
    remove_nan_preprocessor.process(train_df),
    remove_nan_preprocessor.process(test_df),
)

fill_nan_preprocessor = FillNaNPreprocessor()
train_df, test_df = (
    fill_nan_preprocessor.process(train_df),
    fill_nan_preprocessor.process(test_df),
)

# Report the shapes of both splits after the two preprocessing passes
print(
    f'Processed Train shape: {train_df.shape}',
    f'Processed Test shape: {test_df.shape}',
    sep='\n',
)


Processed Train shape: (7494, 53)
Processed Test shape: (1874, 53)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




In [4]:
# Step 3: Feature transformation
# Both transformers are run over the train split and then the test split,
# with the first transformer finishing before the second one is created.
feature_transformer1 = FeatureTransformer1()
train_df, test_df = (
    feature_transformer1.transform(train_df),
    feature_transformer1.transform(test_df),
)

feature_transformer2 = FeatureTransformer2()
train_df, test_df = (
    feature_transformer2.transform(train_df),
    feature_transformer2.transform(test_df),
)

# Preview the leading rows of the transformed train split (cell output)
train_df.head()


Unnamed: 0.1,Unnamed: 0,encounter_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,...,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus,age_squared,height_normalized
9254,9254,185898,137,88.0,28.713767,0,Caucasian,F,157.48,Emergency Department,...,0,0,0,0,1,0,0,1,7744.0,-1.182014
1561,1561,179718,83,69.0,,0,Caucasian,F,152.4,Floor,...,0,0,0,0,0,0,0,1,4761.0,-1.662378
1670,1670,204365,118,61.0,,0,Caucasian,M,188.0,,...,0,0,0,0,0,0,0,0,3721.0,1.703953
6087,6087,167361,118,36.0,33.861606,1,African American,F,165.1,Operating Room,...,0,0,0,0,0,0,0,0,1296.0,-0.461468
6669,6669,213013,83,49.0,22.693954,1,Caucasian,F,162.6,Operating Room,...,0,0,0,0,0,0,0,0,2401.0,-0.697868


In [5]:
# Step 4: Define feature and target columns
# The model is fed the raw age plus the two engineered columns that appear in
# the transformed frame; the label column is the diabetes flag.
feature_columns = [
    'age',
    'height_normalized',
    'age_squared',
]
target_column = 'diabetes_mellitus'

# Step 5: Initialize and train the model
# NOTE(review): 'C' is presumably a regularisation setting forwarded to the
# underlying estimator -- confirm in DiaPredict.model.
model = Model(
    feature_columns=feature_columns,
    target_column=target_column,
    hyperparameters={'C': 1.0},
)

model.train(train_df)


In [6]:
# Step 6: Make predictions
# The model's output for the test split is stored alongside the true labels
# in a new 'predictions' column of the same frame.
test_df['predictions'] = model.predict(test_df)

# Preview the true label next to the model output for the leading test rows
test_df.loc[:, ['diabetes_mellitus', 'predictions']].head()


Unnamed: 0,diabetes_mellitus,predictions
6252,1,0.266752
1731,0,0.276096
4742,0,0.275921
4521,0,0.050136
6340,1,0.324199


In [7]:
# Step 7: Calculate ROC AUC score
# Compare the true labels of the test split against the stored model output.
roc_auc = roc_auc_score(
    y_true=test_df[target_column],
    y_score=test_df['predictions'],
)
print(f'ROC AUC Score: {roc_auc}')


ROC AUC Score: 0.5859212036631791
