In [25]:
import pandas as pd 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
import pickle
from sklearn.preprocessing import StandardScaler
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import PowerTransformer
import seaborn as sns

# Save the trained model


# from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import SVC
# from sklearn.neighbors import KNeighborsClassifier

In [26]:
# Load the raw diabetes dataset from a CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="diabetes_data.csv")

In [27]:
# Show the freshly loaded frame as the cell output for a first visual check.
df



In [28]:
# (number of rows, number of columns) of the loaded frame.
df.shape



In [29]:
# Print the column names, dtypes and non-null counts of the frame.
df.info()




In [30]:
# Preview the first rows of the frame (pandas default: 5 rows).
df.head()



In [31]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()



In [32]:
# Plain-text dump of the 'Pregnancies' column (printed, so no rich rendering).
print(df['Pregnancies'])



In [33]:
# DataFrame.fillna returns a NEW frame and leaves the original untouched, so
# the result has to be bound back to `df`; otherwise the zero-fill is only
# displayed and never reaches the feature/label split in the following cells.
df = df.fillna(0)

# Keep the filled frame as the cell output, as the bare expression did before.
df



In [34]:
# Separate the label column from the predictors: `y` is the 'Outcome' column,
# `X` is every remaining column of `df`.
y = df['Outcome']
X = df.drop('Outcome', axis='columns')

In [35]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [36]:
# Inspect the training labels produced by the split.
y_train



In [37]:
# Inspect the held-out labels produced by the split.
y_test



In [38]:
# Inspect the training features produced by the split (still unscaled here).
X_train



In [39]:
# Learn the standardization statistics from the training rows only, then apply
# the same transform to both splits so the test rows never influence the fit.
# NOTE: X_train / X_test are overwritten with the scaled arrays, so re-running
# this cell without re-running the split would fit the scaler on data that is
# already scaled.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [40]:
# Baseline classifier; the very large iteration cap only bounds the solver,
# which stops earlier once it converges.
model = LogisticRegression(max_iter=100_000_000)

In [41]:
# Fit the baseline on the training split (X_train holds the scaled values
# produced by the scaling cell when the notebook is run top to bottom).
model.fit(X_train,y_train)



In [18]:
# Predict class labels for the held-out rows with the fitted baseline.
y_pred=model.predict(X_test)

In [19]:
# Inspect the predicted labels for the held-out rows.
y_pred



In [20]:
# Share of held-out rows classified correctly, expressed as a percentage.
accuracy = 100 * accuracy_score(y_test, y_pred)

In [21]:
# Report the baseline accuracy (already scaled to percent in the previous cell).
print(f"accuracy {accuracy} %")



In [22]:
# Persist the fitted baseline classifier.
joblib.dump(model, 'diabetes_model.pkl')

# The classifier was trained on standardized inputs, so it is only usable
# together with the scaler that produced them. Persist that scaler under its
# own filename (distinct from the scaler file written for the random forest
# later in the notebook) so raw inputs can be transformed before predicting.
# NOTE(review): this relies on `scaler` still being the one fitted for this
# model, i.e. on the notebook having been run top to bottom.
joblib.dump(scaler, 'diabetes_model_scaler.pkl')

print("Model saved successfully!")





In [42]:
# Number of missing values per column.
df.isnull().sum()



In [43]:
# Annotated heatmap of the pairwise correlations between all columns of `df`.
fig, ax = plt.subplots(figsize=(12, 10))
correlations = df.corr()
sns.heatmap(correlations, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Feature Correlation Matrix')
plt.show()



In [44]:
# Derive extra predictors from the raw measurements:
#   * three pairwise products (interaction terms), and
#   * two 0/1 flags marking Glucose above 120 and BloodPressure above 80.
df = df.assign(
    Glucose_BMI=df['Glucose'] * df['BMI'],
    Age_BMI=df['Age'] * df['BMI'],
    Glucose_Insulin=df['Glucose'] * df['Insulin'],
    Is_Glucose_High=df['Glucose'].gt(120).astype(int),
    Is_BloodPressure_High=df['BloodPressure'].gt(80).astype(int),
)

In [45]:
# Rebuild the feature matrix (now including the engineered columns) and the
# label vector, then redo the same seeded 80/20 split.
target_column = 'Outcome'
y = df[target_column]
X = df.drop(columns=[target_column])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [46]:
# Fit a fresh scaler on the engineered training features only and keep the
# scaled copies under separate names, leaving X_train / X_test untouched.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [47]:
# Optional: Try power transformation for non-normal distributions
# pt = PowerTransformer(method='yeo-johnson')
# X_train_scaled = pt.fit_transform(X_train_scaled)
# X_test_scaled = pt.transform(X_test_scaled)

In [48]:
# Candidate classifiers to compare, keyed by a display name.
models = {
    'Logistic Regression': LogisticRegression(max_iter=10000),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVC': SVC(probability=True, random_state=42),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

# Evaluate each model with 5-fold cross-validation on the training split.
# The loop variable is deliberately NOT called `model`: that name is bound to
# the fitted baseline classifier earlier in the notebook, and reusing it here
# would leave it pointing at an unfitted estimator, silently breaking any
# re-run of the predict / save cells.
for name, candidate_model in models.items():
    scores = cross_val_score(candidate_model, X_train_scaled, y_train, cv=5, scoring='accuracy')
    print(f"{name} Cross-Validation Accuracy: {scores.mean()*100:.2f}% ± {scores.std()*100:.2f}%")



In [49]:
# Hyper-parameter grid for the random forest: 3 * 4 * 3 * 3 = 108 combinations,
# each scored with 5-fold cross-validation.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search over the grid, selecting by mean cross-validated accuracy.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train_scaled, y_train)

# Keep the winning estimator for evaluation and persistence in later cells.
best_rf_model = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")



In [None]:
# Score the tuned random forest on the held-out split.
rf_pred = best_rf_model.predict(X_test_scaled)
rf_accuracy = 100 * accuracy_score(y_test, rf_pred)
print(f"Random Forest Accuracy: {rf_accuracy:.2f}%")

# For comparison, train a fresh logistic regression on the same engineered,
# scaled features and score it on the same held-out split.
log_reg = LogisticRegression(max_iter=10000).fit(X_train_scaled, y_train)
log_pred = log_reg.predict(X_test_scaled)
log_accuracy = 100 * accuracy_score(y_test, log_pred)
print(f"Logistic Regression Accuracy: {log_accuracy:.2f}%")

In [None]:
def evaluate_model(model, X_test, y_test):
    """Print and plot held-out classification metrics for a fitted classifier.

    Prints the accuracy (in percent), the confusion matrix, the per-class
    classification report and, when the model exposes a continuous score,
    the ROC AUC. Finally draws the confusion matrix as a heatmap.

    Parameters
    ----------
    model : fitted estimator
        Must implement ``predict``. ``predict_proba`` is used for the ROC AUC
        when available; otherwise ``decision_function`` is used as a fallback.
        If neither exists the ROC AUC line is skipped.
    X_test : array-like
        Feature rows to predict on, preprocessed the same way as the data the
        model was fitted on.
    y_test : array-like
        True labels for ``X_test``. The ROC AUC computation takes column 1 of
        ``predict_proba``, i.e. it assumes a binary problem.

    Returns
    -------
    None
        The function only prints and plots.
    """
    y_pred = model.predict(X_test)

    # ROC AUC needs a continuous ranking score rather than hard labels.
    # Prefer class-1 probabilities; fall back to the signed decision margin
    # for classifiers that do not provide probabilities.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        y_score = None

    # Computed once and reused for both the printout and the heatmap.
    cm = confusion_matrix(y_test, y_pred)

    print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")
    print("\nConfusion Matrix:")
    print(cm)

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    if y_score is not None:
        print(f"ROC AUC Score: {roc_auc_score(y_test, y_score):.4f}")

    # Visualize confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

# Evaluate the best model
# (the grid-search winner, scored on the scaled held-out split).
print("Best Model Evaluation:")
evaluate_model(best_rf_model, X_test_scaled, y_test)

In [None]:
# Persist the tuned random forest together with the scaler fitted on the
# engineered features, one file per artifact.
for artifact, filename in (
    (best_rf_model, 'diabetes_rf_model.pkl'),
    (scaler, 'diabetes_scaler.pkl'),
):
    joblib.dump(artifact, filename)
print("Best model and scaler saved successfully!")

In [None]:
# Only estimators that expose `feature_importances_` can be ranked this way.
if hasattr(best_rf_model, 'feature_importances_'):
    # Pair every feature column with its importance, most important first.
    feature_importance = (
        pd.DataFrame({
            'Feature': X.columns,
            'Importance': best_rf_model.feature_importances_,
        })
        .sort_values(by='Importance', ascending=False)
    )

    # Horizontal bar chart, one bar per feature.
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_importance, ax=ax)
    ax.set_title('Feature Importance')
    fig.tight_layout()
    plt.show()