<a href="https://colab.research.google.com/github/RubyaRashedIIT/Attendance-system-/blob/main/diabetic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Data Handling
import pandas as pd
import numpy as np

# Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Machine Learning Models
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

# Model Evaluation
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Load the dataset
data = pd.read_csv('/content/drive/MyDrive/diabetic_data.csv')

# Display first few rows
print(data.head())

# Check column names
print(data.columns)

   encounter_id  patient_nbr             race  gender      age weight  \
0       2278392      8222157        Caucasian  Female   [0-10)      ?   
1        149190     55629189        Caucasian  Female  [10-20)      ?   
2         64410     86047875  AfricanAmerican  Female  [20-30)      ?   
3        500364     82442376        Caucasian    Male  [30-40)      ?   
4         16680     42519267        Caucasian    Male  [40-50)      ?   

   admission_type_id  discharge_disposition_id  admission_source_id  \
0                  6                        25                    1   
1                  1                         1                    7   
2                  1                         1                    7   
3                  1                         1                    7   
4                  1                         1                    7   

   time_in_hospital  ... citoglipton insulin  glyburide-metformin  \
0                 1  ...          No      No                   No

In [4]:
# Replace '?' with NaN for easier processing
data.replace('?', np.nan, inplace=True)

# Fill missing categorical values with the most frequent value
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].fillna(data[col].mode()[0])

# Fill missing numerical values with the median
for col in data.select_dtypes(include=['int64', 'float64']).columns:
    data[col] = data[col].fillna(data[col].median())



In [5]:
# Create a target variable for complications (1 = Complication, 0 = No Complication)
complication_codes = {
    'retinopathy': ['362.01', '362.02', '362.03', '362.04', '362.05', '362.06', '362.07'],
    'neuropathy': ['357.2', '250.6'],
    'kidney_disease': ['585', '250.4', '580', '581', '582', '583', '584', '586', '587', '588', '589']
}

def has_complication(row):
    if row['diag_1'] in complication_codes['retinopathy'] or \
       row['diag_2'] in complication_codes['retinopathy'] or \
       row['diag_3'] in complication_codes['retinopathy']:
        return 'retinopathy'

    if row['diag_1'] in complication_codes['neuropathy'] or \
       row['diag_2'] in complication_codes['neuropathy'] or \
       row['diag_3'] in complication_codes['neuropathy']:
        return 'neuropathy'

    if row['diag_1'] in complication_codes['kidney_disease'] or \
       row['diag_2'] in complication_codes['kidney_disease'] or \
       row['diag_3'] in complication_codes['kidney_disease']:
        return 'kidney_disease'

    return 'no_complication'


In [6]:
# Apply function to classify patients
data['complication'] = data.apply(has_complication, axis=1)


In [7]:
# Convert to binary classification (1 = has complication, 0 = no complication)
data['complication'] = data['complication'].apply(lambda x: 1 if x != 'no_complication' else 0)



In [8]:
# Select categorical and numerical columns
categorical_cols = ['race', 'gender', 'age', 'payer_code', 'medical_specialty']
numerical_cols = ['time_in_hospital', 'num_lab_procedures', 'num_medications', 'number_diagnoses']




In [9]:
# One-Hot Encoding for categorical variables
encoder = OneHotEncoder(handle_unknown='ignore')


In [10]:
# Standard Scaling for numerical variables
scaler = StandardScaler()


In [11]:
# Column Transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, numerical_cols),
        ('cat', encoder, categorical_cols)
    ])



In [12]:
# Apply transformation
X = data[categorical_cols + numerical_cols]
y = data['complication']


In [13]:
# Split the dataset (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Fit and transform training data, transform test data
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)


In [14]:
# Define models
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', use_label_encoder=False),
    "SVM": SVC(kernel='linear', probability=True, random_state=42)
}



In [15]:
import warnings
import time
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Suppress warnings from XGBoost and scikit-learn
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Define models
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_jobs=-1),
    "SVM": SVC(kernel='linear', probability=True, random_state=42)
}

# Train and evaluate models
for name, model in models.items():
    print(f"\nTraining {name}...")
    start_time = time.time()

    model.fit(X_train, y_train)  # Train the model
    y_pred = model.predict(X_test)  # Make predictions
    y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, zero_division=1)

    print(f"\n{name} Model Performance:")
    print("Accuracy:", accuracy)
    print("Classification Report:\n", report)

    # AUC-ROC calculation (only if probabilities are available)
    if y_pred_proba is not None:
        auc_roc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
        print("AUC-ROC Score:", auc_roc)

    end_time = time.time()
    print(f"Training time: {end_time - start_time:.2f} seconds")



Training Random Forest...

Random Forest Model Performance:
Accuracy: 0.8710818512331728
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.98      0.93     17982
           1       0.23      0.05      0.08      2372

    accuracy                           0.87     20354
   macro avg       0.56      0.51      0.50     20354
weighted avg       0.81      0.87      0.83     20354

AUC-ROC Score: 0.6219948517938961
Training time: 120.67 seconds

Training XGBoost...

XGBoost Model Performance:
Accuracy: 0.8832170580721234
Classification Report:
               precision    recall  f1-score   support

           0       0.89      1.00      0.94     17982
           1       0.47      0.02      0.04      2372

    accuracy                           0.88     20354
   macro avg       0.68      0.51      0.49     20354
weighted avg       0.84      0.88      0.83     20354

AUC-ROC Score: 0.6815166182671335
Training time: 0.94 seconds

Tra

In [None]:
# Hyperparameter tuning for XGBoost
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

grid_search = GridSearchCV(XGBClassifier(eval_metric='logloss', use_label_encoder=False), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)


In [None]:
# Best Model
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
y_prob_best = best_model.predict_proba(X_test)[:, 1]

print("\nBest Tuned XGBoost Model Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_best))
print("AUC-ROC Score:", roc_auc_score(y_test, y_prob_best, multi_class='ovr'))



Best Tuned XGBoost Model Performance:
Accuracy: 0.8831679276800629
AUC-ROC Score: 0.6896396232282497
