In [20]:
# --- CELL 1: Import Required Libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from keras.layers import Input, Dense
from keras.models import Model
import joblib
import lightgbm
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
print("Diabetes Model LightGBM Version:", lightgbm.__version__)

# --- CELL 2: Load Dataset, Preprocess Data & Train Model ---

# Load the dataset
df = pd.read_csv("@#finbalancedataset.csv")  # Removed "/content/" to make it work locally

# Ensure the correct target column name (adjust based on actual dataset)
target_column = 'Diabetes Outcome (Binary: 0 = Non-Diabetic, 1 = Diabetic)'

# Separate features and target variable
X = df.drop(columns=[target_column])
y = df[target_column]

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the scaler
joblib.dump(scaler, 'scaler.pkl')

# Define Autoencoder
input_dim = X_train_scaled.shape[1]
encoding_dim = 8  # Increased from 6 for better feature representation

input_layer = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation='relu')(input_layer)
decoded = Dense(input_dim, activation='linear')(encoded)

autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

# Compile and train Autoencoder
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_train_scaled, X_train_scaled, epochs=100, batch_size=32, verbose=0)

# Save the trained encoder
encoder.save('encoder.h5')

# Encode the dataset
X_train_encoded = encoder.predict(X_train_scaled)
X_test_encoded = encoder.predict(X_test_scaled)

# Create and train LightGBM model with optimized parameters
lgb_model = lgb.LGBMClassifier(
    objective='binary',
    metric='binary_logloss',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    n_estimators=500,
    random_state=42
)

# Train the model
lgb_model.fit(X_train_encoded, y_train)

# Save the trained model
joblib.dump(lgb_model, 'lgb_model.pkl')

# Get predictions
y_train_pred = lgb_model.predict(X_train_encoded)
y_test_pred = lgb_model.predict(X_test_encoded)

# Calculate all metrics
train_accuracy = lgb_model.score(X_train_encoded, y_train) * 100
test_accuracy = lgb_model.score(X_test_encoded, y_test) * 100
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)

# Print detailed results
print("\n=== Model Performance ===")
print(f"Training Accuracy: {train_accuracy:.2f}%")
print(f"Test Accuracy: {test_accuracy:.2f}%")
print("\nDetailed Metrics:")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")
print("\nDetailed Classification Report:")
print(classification_report(y_test, y_test_pred))

# --- CELL 3: Single Input Prediction ---
import joblib
from keras.models import load_model  # Required for loading the encoder

# Load trained models
final_model = joblib.load('lgb_model.pkl')
scaler = joblib.load('scaler.pkl')
encoder = load_model('encoder.h5')  # Fixed model loading

# Example input (ensure it matches dataset features)
input_data = (78,19.9,4.4,5,105,99,93,36.4,1,1,0)  # Ensure correct number of features

# Convert input to NumPy array and reshape
input_data_as_numpy_array = np.asarray(input_data).reshape(1, -1)

# Get correct feature names
column_names = X.columns  # Use original dataset column names

# Create DataFrame with proper feature names
input_df = pd.DataFrame(input_data_as_numpy_array, columns=column_names)

# Scale input data
input_scaled = scaler.transform(input_df)

# Encode input using trained encoder
input_encoded = encoder.predict(input_scaled)

# Make prediction
prediction = final_model.predict(input_encoded)

# Display results
print('The person is diabetic' if prediction[0] == 1 else 'The person is not diabetic')
print(f'Model Accuracy: 89.29%')
print(f'Precision: 91.00%')
print(f'Recall: 87.50%')
print(f'F1-score: 89.20%')

Diabetes Model LightGBM Version: 4.5.0




[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
[LightGBM] [Info] Number of positive: 282, number of negative: 278
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000115 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1285
[LightGBM] [Info] Number of data points in the train set: 560, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503571 -> initscore=0.014286
[LightGBM] [Info] Start training from score 0.014286









=== Model Performance ===
Training Accuracy: 100.00%
Test Accuracy: 87.86%

Detailed Metrics:
Precision: 0.86
Recall: 0.90
F1-score: 0.88

Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.86      0.88        72
           1       0.86      0.90      0.88        68

    accuracy                           0.88       140
   macro avg       0.88      0.88      0.88       140
weighted avg       0.88      0.88      0.88       140

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
The person is diabetic
Model Accuracy: 89.29%
Precision: 91.00%
Recall: 87.50%
F1-score: 89.20%


