In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/pima-indians-diabetes-database/diabetes.csv
/kaggle/input/diabetes/description.pdf
/kaggle/input/diabetes/diabetic_data.csv


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, f1_score
import warnings
warnings.filterwarnings('ignore')

In [3]:
# DATASET 1: FOR READMISSION IN HOSPITALS 

print("Uploading diabetes dataset")
diabetes_one = pd.read_csv('/kaggle/input/diabetes/diabetic_data.csv')
print("Dataset loaded")

Uploading diabetes dataset
step 1: Dataset loaded completed
Shape right after loading: (101766, 50)


In [6]:
print("Extracting Features from dataset:")
for i, col in enumerate(diabetes_one.columns, 1):
    print(f"{i} {col}")

calc_missing= (diabetes_one.isnull().sum() / len(diabetes_one) * 100)
drop_cols= calc_missing[calc_missing > 95].index
diabetes_one= diabetes_one.drop(columns=drop_cols)


diabetes_one['target'] = (diabetes_one['readmitted'] == '<30').astype(int)
print(f"  High Risk (readmitted <30 days): {(diabetes_one['target']==1).sum():,}")
print(f"  Low Risk: {(diabetes_one['target']==0).sum():,}")


Extracting Features from dataset:
1 encounter_id
2 patient_nbr
3 race
4 gender
5 age
6 weight
7 admission_type_id
8 discharge_disposition_id
9 admission_source_id
10 time_in_hospital
11 payer_code
12 medical_specialty
13 num_lab_procedures
14 num_procedures
15 num_medications
16 number_outpatient
17 number_emergency
18 number_inpatient
19 diag_1
20 diag_2
21 diag_3
22 number_diagnoses
23 max_glu_serum
24 A1Cresult
25 metformin
26 repaglinide
27 nateglinide
28 chlorpropamide
29 glimepiride
30 acetohexamide
31 glipizide
32 glyburide
33 tolbutamide
34 pioglitazone
35 rosiglitazone
36 acarbose
37 miglitol
38 troglitazone
39 tolazamide
40 examide
41 citoglipton
42 insulin
43 glyburide-metformin
44 glipizide-metformin
45 glimepiride-pioglitazone
46 metformin-rosiglitazone
47 metformin-pioglitazone
48 change
49 diabetesMed
50 readmitted
51 target
  High Risk (readmitted <30 days): 11,357
  Low Risk: 90,409


In [7]:
print("Feature Engineering")

if 'num_lab_procedures' in diabetes_one.columns:
    diabetes_one['had_lab_procedures']= (diabetes_one['num_lab_procedures']>0).astype(int)
if 'num_medications' in diabetes_one.columns:
    diabetes_one['high_medication_count'] = (diabetes_one['num_medications']> diabetes_one['num_medications'].median()).astype(int)
if 'number_diagnoses' in diabetes_one.columns:
    diabetes_one['multiple_diagnoses']=(diabetes_one['number_diagnoses']>5).astype(int)
if 'admission_type_id' in diabetes_one.columns:
    diabetes_one['is_emergency'] =(diabetes_one['admission_type_id']==1).astype(int)
if 'number_inpatient' in diabetes_one.columns:
    diabetes_one['had_inpatient_visits'] = (diabetes_one['number_inpatient']> 0).astype(int)


Feature Engineering


In [13]:
diabetes_one = diabetes_one.drop(columns=['encounter_id', 'patient_nbr', 'readmitted'], errors='ignore')

categorical_cols =diabetes_one.select_dtypes(include=['object']).columns.tolist()
for col in categorical_cols:
    lab_enc=LabelEncoder()
    diabetes_one[col] = lab_enc.fit_transform(diabetes_one[col].astype(str))

In [16]:
print("Training Random Forest Model")

X_diabetes_one=diabetes_one.drop('target', axis=1)
y_diabetes_one=diabetes_one['target']

X_train_h,X_test_h,y_train_h,y_test_h= train_test_split(
    X_diabetes_one,y_diabetes_one,test_size=0.2,random_state=42,stratify=y_diabetes_one
)

print(f"Training set: {len(X_train_h):,}")
print(f"Testing set: {len(X_test_h):,}")

Training Random Forest Model
Training set: 81,412
Testing set: 20,354


In [17]:
param_grid_h= {
    'n_estimators': [100, 200],
    'max_depth': [20, None],
    'min_samples_split': [2, 5]
}

rf_hospital= RandomForestClassifier(random_state=42, n_jobs=-1)
grid_hospital =GridSearchCV(rf_hospital, param_grid_h, cv=3, scoring='roc_auc', n_jobs=-1, verbose=0)
grid_hospital.fit(X_train_h, y_train_h)

best_rf_hospital =grid_hospital.best_estimator_
print(f"Best parameters: {grid_hospital.best_params_}")

y_pred_h =best_rf_hospital.predict(X_test_h)
y_proba_h =best_rf_hospital.predict_proba(X_test_h)[:, 1]

print("PROJECT 1 RESULTS: Hospital Readmission Prediction")
print(f"Accuracy:  {accuracy_score(y_test_h, y_pred_h):.4f} ({accuracy_score(y_test_h, y_pred_h)*100:.2f}%)")
print(f"ROC-AUC:   {roc_auc_score(y_test_h, y_proba_h):.4f}")
print(f"F1-Score:  {f1_score(y_test_h, y_pred_h):.4f}")
print("Classification Report:")
print(classification_report(y_test_h, y_pred_h, target_names=['Low Risk', 'High Risk']))

Best parameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 200}
PROJECT 1 RESULTS: Hospital Readmission Prediction
Accuracy:  0.8891 (88.91%)
ROC-AUC:   0.6642
F1-Score:  0.0157
Classification Report:
              precision    recall  f1-score   support

    Low Risk       0.89      1.00      0.94     18083
   High Risk       0.78      0.01      0.02      2271

    accuracy                           0.89     20354
   macro avg       0.84      0.50      0.48     20354
weighted avg       0.88      0.89      0.84     20354



In [18]:
# DATASET 2 : FOR DIABETES SEVERITY MODEL
diabetes_two = pd.read_csv("/kaggle/input/pima-indians-diabetes-database/diabetes.csv")
print("Dataset loaded")

Dataset loaded


In [20]:
print("Extracting Features from dataset:")
for i, col in enumerate(diabetes_two.columns, 1):
    print(f"  {i} {col}")

zero_counts = (diabetes_two == 0).sum()
for col in diabetes_two.columns[:-1]:  
    if zero_counts[col] > 0:
        print(f"  {col}: {zero_counts[col]} zeros ({zero_counts[col]/len(diabetes_two)*100:.1f}%)")

Extracting Features from dataset:
  1 Pregnancies
  2 Glucose
  3 BloodPressure
  4 SkinThickness
  5 Insulin
  6 BMI
  7 DiabetesPedigreeFunction
  8 Age
  9 Outcome
  Pregnancies: 111 zeros (14.5%)
  Glucose: 5 zeros (0.7%)
  BloodPressure: 35 zeros (4.6%)
  SkinThickness: 227 zeros (29.6%)
  Insulin: 374 zeros (48.7%)
  BMI: 11 zeros (1.4%)


In [23]:
def classify_severity(row):
    if row['Outcome'] == 1:
        return 2  # Diabetic
    elif row['Glucose'] >= 140 or row['BMI'] >= 30:
        return 1  # Prediabetes
    else:
        return 0  # Healthy

diabetes_two['Severity'] = diabetes_two.apply(classify_severity, axis=1)

print(diabetes_two['Severity'].value_counts())

Severity
1    278
2    268
0    222
Name: count, dtype: int64


In [26]:
print("Feature Engineering")
print("Max Age:", diabetes_two['Age'].max())
print("Max BMI:",diabetes_two['BMI'].max())
print("Max Glucose", diabetes_two['Glucose'].max())

diabetes_two['AgeGroup'] =pd.cut(diabetes_two['Age'], bins=[0,25,45,90], labels=[0,1,2], include_lowest=True)
diabetes_two['AgeGroup'] = diabetes_two['AgeGroup'].astype(int)

diabetes_two['BMI_Category'] = pd.cut(diabetes_two['BMI'], bins=[0,25,30,70], labels=[0,1,2], include_lowest=True)
diabetes_two['BMI_Category'] = diabetes_two['BMI_Category'].astype(int)

diabetes_two['Glucose_Category'] = pd.cut(diabetes_two['Glucose'], bins=[0,100,140,200], labels=[0,1,2], include_lowest=True)
diabetes_two['Glucose_Category'] = diabetes_two['Glucose_Category'].astype(int)



Feature Engineering
Max Age: 81
Max BMI: 67.1
Max Glucose 199


In [28]:
X_diabetes_two=diabetes_two.drop(['Outcome', 'Severity'], axis=1)
y_diabetes_two=diabetes_two['Severity']

X_train_two,X_test_two,y_train_two,y_test_two = train_test_split(X_diabetes_two,y_diabetes_two,test_size=0.2,random_state=42,stratify=y_diabetes_two)

print(f"Training set: {len(X_train_two)}")
print(f"Testing set: {len(X_test_two)}")

Training set: 614
Testing set: 154


In [29]:
print("Training Random Forest Model")

param_grid_diabetes_two = {
    'n_estimators': [100, 200],
    'max_depth': [10, 15, None],
    'min_samples_split': [2, 5]
}

rf_diabetes_two= RandomForestClassifier(random_state=42,n_jobs=-1)
grid_diabetes_two = GridSearchCV(rf_diabetes_two,param_grid_diabetes_two,cv=3,scoring='accuracy',n_jobs=-1, verbose=0)

grid_diabetes_two.fit(X_train_two, y_train_two)

best_rf_diabetes_two = grid_diabetes_two.best_estimator_
print(f"Best parameters: {grid_diabetes_two.best_params_}")

y_pred_two = best_rf_diabetes_two.predict(X_test_two)

print("RESULT : Diabetes Severity Classification")
print(f"Accuracy:  {accuracy_score(y_test_two, y_pred_two):.4f} ({accuracy_score(y_test_two, y_pred_two)*100:.2f}%)")
print(f"F1-Score:  {f1_score(y_test_two, y_pred_two, average='weighted'):.4f}")

print("\nClassification Report:")
print(classification_report(y_test_two, y_pred_two, target_names=['Healthy', 'Prediabetes', 'Diabetic']))


Training Random Forest Model
Best parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200}
RESULT : Diabetes Severity Classification
Accuracy:  0.6883 (68.83%)
F1-Score:  0.6758

Classification Report:
              precision    recall  f1-score   support

     Healthy       0.81      0.95      0.88        44
 Prediabetes       0.66      0.71      0.68        56
    Diabetic       0.59      0.44      0.51        54

    accuracy                           0.69       154
   macro avg       0.68      0.70      0.69       154
weighted avg       0.67      0.69      0.68       154

