<a href="https://colab.research.google.com/github/SeanMuInCa/learn_python/blob/master/Logistic_Regression_(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import PCA

In [None]:
# Step 1: Load the dataset
data = pd.read_csv("diabetes.csv")

# Explore dataset: first rows, column schema/dtypes, then summary statistics
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
data.info()
print(data.describe())

# Check for missing values (per-column count of NaN entries)
# NOTE(review): the preview rows contain zeros in SkinThickness/Insulin; if
# those zeros stand in for "not measured", isnull() will not count them — confirm.
print(data.isnull().sum())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768

In [None]:
# Step 2: Split data into features and target
X = data.drop(columns=["Outcome"])
y = data["Outcome"]

# Split dataset into training and testing sets BEFORE scaling, so the held-out
# rows never contribute to the scaler's mean/variance (avoids data leakage).
# The row assignment depends only on the sample count and random_state, so the
# train/test membership is the same as when splitting the scaled array.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize features: learn the statistics on the training rows only and
# apply that same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the train-fitted scaler; kept so any cell
# that still refers to X_scaled continues to work.
X_scaled = scaler.transform(X)

In [None]:
# Step 3: Feature Selection using Logistic Regression with L1 regularization
# The L1 penalty can drive coefficients exactly to zero; features whose
# coefficient stays non-zero are treated as "selected".
# random_state is pinned because the liblinear solver uses a random number
# generator internally; this matches the seed used for the train/test split.
logreg_l1 = LogisticRegression(penalty='l1', solver='liblinear', C=0.1, random_state=42)
logreg_l1.fit(X_train, y_train)

# Identify important features: absolute coefficient per input column
feature_importance = np.abs(logreg_l1.coef_).flatten()
# Compute the boolean mask once and reuse it for names and both matrices
selected_mask = feature_importance > 0
selected_features = X.columns[selected_mask]
print("Selected Features:", selected_features)

# Filter dataset with selected features (column subset of the scaled arrays)
X_train_selected = X_train[:, selected_mask]
X_test_selected = X_test[:, selected_mask]

Selected Features: Index(['Pregnancies', 'Glucose', 'BloodPressure', 'Insulin', 'BMI',
       'DiabetesPedigreeFunction', 'Age'],
      dtype='object')


In [None]:
# Step 4: Train models with selected features
# fit() returns the estimator itself, so construction and training are chained.
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train_selected, y_train)
logreg = LogisticRegression().fit(X_train_selected, y_train)

# Score both classifiers on the held-out rows (same column subset as training)
knn_pred = knn.predict(X_test_selected)
logreg_pred = logreg.predict(X_test_selected)

# Heading and report are emitted on consecutive lines, one pair per model
print(
    "KNN Performance with Selected Features:",
    classification_report(y_test, knn_pred),
    sep="\n",
)
print(
    "Logistic Regression Performance with Selected Features:",
    classification_report(y_test, logreg_pred),
    sep="\n",
)

KNN Performance with Selected Features:
              precision    recall  f1-score   support

           0       0.78      0.82      0.80        99
           1       0.64      0.58      0.61        55

    accuracy                           0.73       154
   macro avg       0.71      0.70      0.70       154
weighted avg       0.73      0.73      0.73       154

Logistic Regression Performance with Selected Features:
              precision    recall  f1-score   support

           0       0.81      0.80      0.81        99
           1       0.65      0.67      0.66        55

    accuracy                           0.75       154
   macro avg       0.73      0.74      0.73       154
weighted avg       0.76      0.75      0.75       154



In [None]:
# Step 5: Train models with all features for comparison
# The existing `knn` and `logreg` objects are refit here, so after this cell
# they hold the all-feature models rather than the selected-feature ones.
knn_pred_all = knn.fit(X_train, y_train).predict(X_test)
logreg_pred_all = logreg.fit(X_train, y_train).predict(X_test)

# Heading and report are emitted on consecutive lines, one pair per model
print(
    "KNN Performance with All Features:",
    classification_report(y_test, knn_pred_all),
    sep="\n",
)
print(
    "Logistic Regression Performance with All Features:",
    classification_report(y_test, logreg_pred_all),
    sep="\n",
)

KNN Performance with All Features:
              precision    recall  f1-score   support

           0       0.74      0.79      0.76        99
           1       0.56      0.49      0.52        55

    accuracy                           0.68       154
   macro avg       0.65      0.64      0.64       154
weighted avg       0.67      0.68      0.68       154

Logistic Regression Performance with All Features:
              precision    recall  f1-score   support

           0       0.81      0.80      0.81        99
           1       0.65      0.67      0.66        55

    accuracy                           0.75       154
   macro avg       0.73      0.74      0.73       154
weighted avg       0.76      0.75      0.75       154



In [None]:
# Reduce dimensionality with PCA, letting it choose how many components are
# needed to explain 88% of the variance (a float n_components between 0 and 1
# is interpreted as an explained-variance target, not a component count).
pca = PCA(n_components=0.88)
# Fit the projection on the training rows only, then reuse it for the test rows
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

print(f"Number of components selected: {pca.n_components_}")


Number of components selected: 6


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter tuning for KNN: 5-fold cross-validated search over the
# neighbourhood size, scored by accuracy on the PCA-projected training data.
param_grid_knn = {'n_neighbors': [5, 7, 9, 11]}
grid_search_knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid_knn,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,  # use every available CPU core
    verbose=2,
)
grid_search_knn.fit(X_train_pca, y_train)

# Best parameters for KNN
print("Best parameters for KNN:", grid_search_knn.best_params_)

# Hyperparameter tuning for Logistic Regression: same search protocol, over
# the inverse regularization strength C.
param_grid_logreg = {'C': [0.1, 1, 10, 100]}
grid_search_logreg = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=param_grid_logreg,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,  # use every available CPU core
    verbose=2,
)
grid_search_logreg.fit(X_train_pca, y_train)

# Best parameters for Logistic Regression
print("Best parameters for Logistic Regression:", grid_search_logreg.best_params_)


Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best parameters for KNN: {'n_neighbors': 9}
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best parameters for Logistic Regression: {'C': 0.1}


In [None]:
# Train KNN with the best parameters found by the grid search
knn_best = KNeighborsClassifier(
    n_neighbors=grid_search_knn.best_params_['n_neighbors']
).fit(X_train_pca, y_train)

# Train Logistic Regression with the best parameters found by the grid search
logreg_best = LogisticRegression(
    C=grid_search_logreg.best_params_['C'],
    max_iter=1000,
).fit(X_train_pca, y_train)

# Predict on the PCA-projected test rows
knn_pred_pca = knn_best.predict(X_test_pca)
logreg_pred_pca = logreg_best.predict(X_test_pca)

# Heading and report are emitted on consecutive lines, one pair per model
print(
    "KNN Performance with PCA (Optimized):",
    classification_report(y_test, knn_pred_pca),
    sep="\n",
)
print(
    "Logistic Regression Performance with PCA (Optimized):",
    classification_report(y_test, logreg_pred_pca),
    sep="\n",
)


KNN Performance with PCA (Optimized):
              precision    recall  f1-score   support

           0       0.78      0.83      0.80        99
           1       0.65      0.58      0.62        55

    accuracy                           0.74       154
   macro avg       0.72      0.71      0.71       154
weighted avg       0.74      0.74      0.74       154

Logistic Regression Performance with PCA (Optimized):
              precision    recall  f1-score   support

           0       0.81      0.85      0.83        99
           1       0.70      0.64      0.67        55

    accuracy                           0.77       154
   macro avg       0.75      0.74      0.75       154
weighted avg       0.77      0.77      0.77       154

