In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
from collections import Counter

# STEP 1: Read the raw dataset from CSV into a DataFrame
csv_path = "/content/Smote.csv"  # Replace with your file path
df = pd.read_csv(csv_path)


In [5]:
# Preview the first ten rows to sanity-check what was loaded
df.head(n=10)

Unnamed: 0,Age,BloodPressure,Cholesterol,Diabetes
0,58,87,230,0
1,71,67,248,0
2,48,95,162,0
3,34,67,188,0
4,62,117,164,0
5,27,119,178,0
6,40,109,178,0
7,58,87,224,0
8,77,100,181,0
9,38,123,151,0


In [7]:
SEED = 42  # single seed shared by the split and SMOTE so reruns are reproducible


def _fit_and_report(label, X_fit, y_fit, X_eval, y_eval):
    """Fit a fresh LogisticRegression and print its metrics on the evaluation set.

    Parameters
    ----------
    label : str
        Text inserted into the printed headings (e.g. "BEFORE" / "AFTER").
    X_fit, y_fit
        Features and labels the model is trained on.
    X_eval, y_eval
        Held-out features and labels used for the confusion matrix,
        classification report and accuracy.

    Returns
    -------
    tuple
        ``(model, y_pred)``: the fitted estimator and its predictions on ``X_eval``.
    """
    model = LogisticRegression()
    model.fit(X_fit, y_fit)
    y_pred = model.predict(X_eval)

    print(f"\nPerformance {label} SMOTE:")
    print("Confusion Matrix:\n", confusion_matrix(y_eval, y_pred))
    print("Classification Report:\n", classification_report(y_eval, y_pred))
    print(f"Accuracy {label} SMOTE:", round(accuracy_score(y_eval, y_pred) * 100, 2), "%")
    return model, y_pred


# Step 2: Split features and target.
# The CSV carries a leftover row-index column ("Unnamed: 0", visible in the
# df.head() preview). It is a row identifier, not a measurement, so it must not
# be used as a feature: the classifier would train on row numbers and SMOTE would
# interpolate synthetic ones. errors='ignore' keeps this cell working if the
# file is later re-exported without that column.
X = df.drop(columns=['Diabetes', 'Unnamed: 0'], errors='ignore')
y = df['Diabetes']

# Step 3: Check imbalance
print("Before SMOTE class distribution:", Counter(y))

# Step 4: Train-test split (stratified so the test set keeps the class ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=SEED
)

# Step 5: Train Logistic Regression BEFORE SMOTE
model_before, y_pred_before = _fit_and_report("BEFORE", X_train, y_train, X_test, y_test)

# Step 6: Apply SMOTE to the training split only; the test split stays
# untouched so both models are scored on the same real samples.
smote = SMOTE(random_state=SEED)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("\nAfter SMOTE class distribution:", Counter(y_train_smote))

# Step 7: Train Logistic Regression AFTER SMOTE
model_after, y_pred_after = _fit_and_report(
    "AFTER", X_train_smote, y_train_smote, X_test, y_test
)


Before SMOTE class distribution: Counter({0: 900, 1: 100})

Performance BEFORE SMOTE:
Confusion Matrix:
 [[270   0]
 [ 10  20]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       270
           1       1.00      0.67      0.80        30

    accuracy                           0.97       300
   macro avg       0.98      0.83      0.89       300
weighted avg       0.97      0.97      0.96       300

Accuracy BEFORE SMOTE: 96.67 %

After SMOTE class distribution: Counter({0: 630, 1: 630})

Performance AFTER SMOTE:
Confusion Matrix:
 [[236  34]
 [  6  24]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.87      0.92       270
           1       0.41      0.80      0.55        30

    accuracy                           0.87       300
   macro avg       0.69      0.84      0.73       300
weighted avg       0.92      0.87      0.88       300

Accuracy AFT