In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
import os

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Seed NumPy's global random number generator so repeated runs draw the same values
SEED = 42
np.random.seed(SEED)

NameError: name 'np' is not defined

# Logistic Regression

## **Diabetes Prediction Model:**

In [None]:
# Read the feature-engineered diabetes CSV (path relative to the working directory)
diabetes_csv_path = 'data/feature_engineering/diabetes_feature_engineering.csv'
diabetes_df = pd.read_csv(diabetes_csv_path)

In [None]:
# Display dataset info: column summary first, then a preview of the first rows

print("Diabetes Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
diabetes_df.info()

print("\nFirst 5 rows:")
print(diabetes_df.head())

In [None]:
# Split the frame into the label column (y) and every remaining column (X)
target_col = 'diabetes'
y = diabetes_df[target_col]
X = diabetes_df.drop(columns=[target_col])

In [None]:
# Column groups consumed by the preprocessing step:
#   num_cols -> passed to the scaler, cat_cols -> passed to the one-hot encoder.
num_cols = [
    'age',
    'hypertension',
    'heart_disease',
    'bmi',
    'HbA1c_level',
    'blood_glucose_level',
]
cat_cols = [
    'gender',
    'smoking_history',
    'age_group',
    'bmi_category',
    'glucose_tolerance',
]


In [None]:
# Column-wise preprocessing: standardise the numerical columns and one-hot
# encode the categorical ones. handle_unknown='ignore' keeps transform from
# raising on categories that were not present when the encoder was fitted.
numeric_step = ('num', StandardScaler(), num_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [None]:
# Assemble the model pipeline (construction only; fitting happens in a later cell):
# preprocessing followed by a logistic regression classifier.
logreg_clf = LogisticRegression(max_iter=1000)
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', logreg_clf),
]
diabetes_model = Pipeline(steps=pipeline_steps)


In [None]:
# Split data: hold out 20% of the rows for testing, with a fixed random_state
# so the split is repeatable. stratify=y keeps the class proportions of the
# target the same in the train and test sets, so the evaluation metrics are
# not skewed by an unlucky draw of the positive class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Train model: fit the full pipeline (preprocessor + classifier) on the training split
diabetes_model.fit(X_train, y_train)

In [None]:
# Evaluate model: predict labels for the held-out test features.
# y_pred is compared against y_test by the metric and confusion-matrix cells.
y_pred = diabetes_model.predict(X_test)

In [None]:
# Print the headline test-set metrics, followed by the per-class report
print("\nDiabetes Model Evaluation:")
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Save model: persist the fitted pipeline to disk with joblib.
model_path = 'models/diabetes_logreg.pkl'
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first (no-op when it is already there).
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(diabetes_model, model_path)
print(f"\nDiabetes model saved to {model_path}")

In [None]:
# Render the test-set confusion matrix as an annotated heatmap
# (integer cell counts; x axis = predicted label, y axis = actual label).
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
ax.set_title('Diabetes Prediction Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()