In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Read the diabetes dataset from a CSV file in the current working directory
df = pd.read_csv(filepath_or_buffer="diabetes.csv")

In [None]:
# Preview the first five rows to sanity-check the load
df.head(n=5)

In [None]:
# Print the frame summary: one line per column with its dtype and non-null count
df.info()

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# Column labels of the loaded frame
df.columns

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per numeric column
df.describe()

In [None]:
# Number of missing (NaN) entries in each column
df.isna().sum()

In [None]:
# Baseline model: logistic regression on the columns currently in `df`,
# evaluated before any engineered features are added.
y = df['Outcome']
X = df.drop(columns='Outcome')

# 5-fold cross-validated accuracy; the mean over folds is the baseline figure
model = LogisticRegression(solver='liblinear', max_iter=500)
baseline_scores = cross_val_score(estimator=model, X=X, y=y, scoring='accuracy', cv=5)
baseline_accuracy = baseline_scores.mean()
print("Baseline CV Accuracy:", baseline_accuracy)


In [None]:
#Create 10 New Synthetic Features
# Work on a copy of the frame loaded earlier in the notebook. The previous
# version copied an undefined name (`data`), which raises NameError on a
# fresh kernel (Restart & Run All).
# NOTE: this rebinds `df`, so run the baseline cell before this one; every
# feature below is derived from the raw columns only, so re-running this
# cell produces the same result.
df = df.copy()

# 1. Glucose_BMI_Ratio (the +1 offset keeps the denominator positive for BMI >= 0)
df['Glucose_BMI_Ratio'] = df['Glucose'] / (df['BMI'] + 1)

# 2. High_Glucose (flag: 1 when Glucose > 140, else 0)
df['High_Glucose'] = (df['Glucose'] > 140).astype(int)

# 3. Low_BP (flag: 1 when BloodPressure < 60, else 0)
df['Low_BP'] = (df['BloodPressure'] < 60).astype(int)

# 4. High_Insulin (flag: 1 when Insulin > 100, else 0)
df['High_Insulin'] = (df['Insulin'] > 100).astype(int)

# 5. BMI_Age_Interaction
df['BMI_Age_Interaction'] = df['BMI'] * df['Age']

# 6. Preg_Age_Interaction
df['Preg_Age_Interaction'] = df['Pregnancies'] * df['Age']

# 7. Age_Group (binned into integer codes 0..4)
# include_lowest=True puts a value equal to the first edge (20) into bin 0;
# without it pd.cut's right-closed bins turn that value into NaN.
df['Age_Group'] = pd.cut(
    df['Age'], bins=[20, 30, 40, 50, 60, 100], labels=False, include_lowest=True
)

# 8. BMI_Group (binned into integer codes 0..3)
# include_lowest=True keeps BMI == 0 in bin 0 instead of NaN, so no rows are
# lost to binning and the engineered model is scored on the same rows as the
# baseline. Presumably BMI contains 0 placeholders (the downstream cell
# drops NaN rows) -- verify against the data.
df['BMI_Group'] = pd.cut(
    df['BMI'], bins=[0, 18.5, 25, 30, 100], labels=False, include_lowest=True
)

# 9. Log_Insulin (reduce skew; log1p is defined at Insulin == 0)
df['Log_Insulin'] = np.log1p(df['Insulin'])

# 10. Glucose_Squared
df['Glucose_Sq'] = df['Glucose'] ** 2


In [None]:
# Fit the same kind of model on the feature-engineered frame
y = df['Outcome']
X_new = df.drop(columns='Outcome')

# Discard any row that has a missing feature value, then pick the labels
# that belong to the surviving rows by index label
X_new_cleaned = X_new.dropna(axis=0, how='any')
y_cleaned = y.loc[X_new_cleaned.index]

# 5-fold cross-validated accuracy, averaged over the folds
model_new = LogisticRegression(solver='liblinear', max_iter=1000)
new_scores = cross_val_score(
    estimator=model_new, X=X_new_cleaned, y=y_cleaned, scoring='accuracy', cv=5
)
new_accuracy = new_scores.mean()
print("New CV Accuracy:", new_accuracy)

In [None]:
# Bar chart comparing mean CV accuracy before and after feature engineering
import matplotlib.pyplot as plt

labels = ['Baseline', 'Feature Engineered']
accuracies = [baseline_accuracy, new_accuracy]

# Explicit figure/axes interface instead of the implicit pyplot state machine
fig, ax = plt.subplots()
ax.bar(labels, accuracies)
ax.set_ylabel('CV Accuracy')
ax.set_title('Model Performance Before vs After Feature Engineering')
plt.show()


In [None]:
# Optional: univariate feature selection, for use if the enlarged feature set overfits
from sklearn.feature_selection import SelectKBest, f_classif

# Score every feature with f_classif and keep the 10 highest-scoring ones
selector = SelectKBest(f_classif, k=10)
selector.fit(X_new_cleaned, y_cleaned)
X_selected = selector.transform(X_new_cleaned)
