In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split ,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score
from sklearn.feature_selection import RFE

In [None]:
# Load the dataset
data = pd.read_csv('diabetes_dataset.csv')

# Display the first few rows of the dataset
print(data.head())

# Display the last few rows of the dataset
print(data.tail())

# Display the dataset summary (columns, dtypes, non-null counts).
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
data.info()

   Age  Pregnancies    BMI  Glucose  BloodPressure  HbA1c    LDL   HDL  \
0   69            5  28.39    130.1           77.0    5.4  130.4  44.0   
1   32            1  26.49    116.5           72.0    4.5   87.4  54.2   
2   89           13  25.34    101.0           82.0    4.9  112.5  56.8   
3   78           13  29.91    146.0          104.0    5.7   50.7  39.1   
4   38            8  24.56    103.2           74.0    4.7  102.5  29.1   

   Triglycerides  WaistCircumference  HipCircumference   WHR  FamilyHistory  \
0           50.0                90.5             107.9  0.84              0   
1          129.9               113.3              81.4  1.39              0   
2          177.6                84.7             107.2  0.79              0   
3          117.0               108.9             110.0  0.99              0   
4          145.9                84.1              92.8  0.91              0   

   DietType  Hypertension  MedicationUse  Outcome  
0         0             0   

In [None]:
# Report how many missing values each column contains
print(data.isnull().sum())

# Separate the label column from the predictor columns
target = data['Outcome']
features = data.drop(columns='Outcome')

# Standardize every predictor column with StandardScaler.
# NOTE(review): the scaler is fit on all rows here, including rows that may later
# be held out for testing; consider fitting it on the training rows only.
scaler = StandardScaler()
scaled_features = scaler.fit(features).transform(features)


Age                   0
Pregnancies           0
BMI                   0
Glucose               0
BloodPressure         0
HbA1c                 0
LDL                   0
HDL                   0
Triglycerides         0
WaistCircumference    0
HipCircumference      0
WHR                   0
FamilyHistory         0
DietType              0
Hypertension          0
MedicationUse         0
Outcome               0
dtype: int64


Split the dataset into training and testing sets (80-20 split).

In [None]:
# Split the dataset into training and testing sets (80-20 split).
# The raw (unscaled) features are split first so that the held-out rows never
# influence the mean/std learned by the scaler (avoids train/test leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=900
)

# Fit the scaler on the training rows only, then reuse that fitted transform
# for the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


Train a Logistic Regression model.

In [None]:
# Fit a logistic regression classifier (library defaults) on the training split;
# fit() returns the fitted estimator, so construction and training are chained.
logistic_model = LogisticRegression().fit(X_train, y_train)

# Predict class labels for the held-out test split
y_pred = logistic_model.predict(X_test)


Evaluate model performance using Accuracy, Precision, Recall, and F1-score.

In [None]:
# Score the baseline predictions with each metric, in a fixed order
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Print one "<name>: <value>" line per metric
for label, value in (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1),
):
    print(f'{label}: {value}')



Accuracy: 0.9926624737945493
Precision: 1.0
Recall: 0.9795918367346939
F1 Score: 0.9896907216494846


Perform feature selection to determine which variables contribute the most.

In [None]:
# Recursive feature elimination: drop one feature per step until 8 remain
selector = RFE(estimator=logistic_model, n_features_to_select=8, step=1).fit(X_train, y_train)

# Boolean mask of retained features, mapped back onto the original column names
features_names = features.columns
selected_features = selector.support_
print("Selected features are:", features_names[selected_features])


Selected features are: Index(['Age', 'BMI', 'Glucose', 'HbA1c', 'WaistCircumference',
       'HipCircumference', 'WHR', 'FamilyHistory'],
      dtype='object')


Optimize the model using hyperparameter tuning.

In [None]:
# Search space: inverse regularization strength C and the penalty type
param_grid = {
    'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
}

# 4-fold cross-validated grid search, ranking candidates by F1 score
grid = GridSearchCV(
    estimator=LogisticRegression(solver='liblinear'),
    param_grid=param_grid,
    scoring='f1',
    cv=4,
)
grid.fit(X_train, y_train)

# Report the winning combination
print("Best Hyperparameters:", grid.best_params_)

# Use the best estimator found by the search to predict on the test split
best_model = grid.best_estimator_
y_pred_optimized = best_model.predict(X_test)

# Score the tuned model with the same four metrics as the baseline
accuracy_opt, precision_opt, recall_opt, f1_opt = (
    metric(y_test, y_pred_optimized)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

for label, value in (
    ('Accuracy', accuracy_opt),
    ('Precision', precision_opt),
    ('Recall', recall_opt),
    ('F1 Score', f1_opt),
):
    print(f'Optimized {label}: {value}')




Best Hyperparameters: {'C': 1, 'penalty': 'l1'}
Optimized Accuracy: 0.9984276729559748
Optimized Precision: 0.9985401459854014
Optimized Recall: 0.9970845481049563
Optimized F1 Score: 0.9978118161925602


Add-on Task: Feature Engineering


In [None]:


# Load a fresh copy of the CSV for the feature-engineering step
file_path = "diabetes_dataset.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# New feature: bucket each individual's BMI into one of four weight categories
def categorize_bmi(bmi):
    """Return the weight category label for a BMI value.

    Categories and their half-open ranges:
        bmi < 18.5        -> "Underweight"
        18.5 <= bmi < 25  -> "Normal"
        25 <= bmi < 30    -> "Overweight"
        otherwise         -> "Obese"

    Note: every comparison against NaN is false, so a NaN input falls
    through to "Obese".
    """
    # (exclusive upper bound, label), checked in ascending order
    bands = (
        (18.5, "Underweight"),
        (25, "Normal"),
        (30, "Overweight"),
    )
    for upper_bound, label in bands:
        if bmi < upper_bound:
            return label
    return "Obese"


# New feature: cholesterol risk level derived from the LDL and HDL values
def cholesterol_risk(ldl, hdl):
    """Return a cholesterol risk label from LDL and HDL readings.

    "High Risk"     when ldl >= 160 or hdl < 40
    "Moderate Risk" when ldl >= 100 or hdl < 50 (and not high risk)
    "Low Risk"      otherwise

    Note: comparisons against NaN are false, so NaN readings never trigger
    a tier on their own.
    """
    # (LDL floor, HDL ceiling, label), most severe tier first
    tiers = (
        (160, 40, "High Risk"),
        (100, 50, "Moderate Risk"),
    )
    for ldl_floor, hdl_ceiling, label in tiers:
        if ldl >= ldl_floor or hdl < hdl_ceiling:
            return label
    return "Low Risk"

# New feature: categorize a BloodPressure reading as Low, Normal, or High
# (this helper classifies a single value; it does not compute an average)
def categorize_bp(bp):
    """Return 'Low' for bp < 120, 'Normal' for 120 <= bp < 140, else 'High'.

    Note: comparisons against NaN are false, so a NaN input yields 'High'.
    NOTE(review): the sample rows show BloodPressure values of roughly 70-105,
    which would mostly land in 'Low' - confirm these cut-offs match the
    column's meaning and units.
    """
    return 'Low' if bp < 120 else ('Normal' if bp < 140 else 'High')


# Generate a new binary feature: "High WHR" (1 if WHR > 0.85 for females, > 0.90 for males, else 0).
def high_whr(whr, sex=None):
    """Flag a waist-to-hip ratio as high (1) or not (0).

    Parameters
    ----------
    whr : float
        Waist-to-hip ratio.
    sex : str or None, optional
        "male" / "m" (case-insensitive, surrounding whitespace ignored)
        selects the 0.90 cut-off. Any other value, including the default
        None, uses the 0.85 cut-off, so existing one-argument calls behave
        exactly as before.

    Returns
    -------
    int
        1 if ``whr`` is strictly greater than the applicable cut-off, else 0.
        A NaN ``whr`` yields 0 because the comparison is false.
    """
    is_male = isinstance(sex, str) and sex.strip().lower() in ("m", "male")
    threshold = 0.90 if is_male else 0.85
    return 1 if whr > threshold else 0


# Derive the engineered columns (added in this order: BMI, cholesterol, BP, WHR)
df["BMI_Category"] = df["BMI"].map(categorize_bmi)
df["Cholesterol_Risk"] = df[["LDL", "HDL"]].apply(
    lambda lipids: cholesterol_risk(lipids["LDL"], lipids["HDL"]),
    axis=1,
)
df["BP_Category"] = df["BloodPressure"].map(categorize_bp)
df["High_WHR"] = df["WHR"].map(high_whr)

# Write the augmented dataset to CSV without the index column
output_file = "Feature_Engineering_Updated_Diabetes_dataset.csv"
df.to_csv(output_file, index=False)

print(f"model updated saved as {output_file}")

model updated saved as Feature_Engineering_Updated_Diabetes_dataset.csv
