# 1. Setup and Installation

In [1]:
# ! pip install numpy pandas scikit-learn matplotlib seaborn

# 2. Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.datasets import load_iris

# 3. Loading the Data

In [3]:
data = pd.read_csv("dataset/diabetes.csv")
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
data['Outcome'].value_counts(normalize=True)

Outcome
0    0.651042
1    0.348958
Name: proportion, dtype: float64

In [6]:
data.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [9]:
X = data.drop(columns=['Outcome'])
y = data['Outcome']

In [10]:
# Check the shape of the data
print("Feature matrix shape:", X.shape)
print("Target vector shape:", y.shape)

Feature matrix shape: (768, 8)
Target vector shape: (768,)


# 4. Splitting the Data

In [11]:
# Split the data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Check the shape of the train/test split
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (537, 8)
Testing set shape: (231, 8)


# 5. Feature Scaling

In [12]:
# Standardizing the features
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 6. Train the Logistic Regression Model

In [13]:
# Creating a logistic regression classifier
log_reg = LogisticRegression()

In [14]:
# Fit the model on the training data
log_reg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
# Predict on test set
y_pred = log_reg.predict(X_test)

In [16]:
# check accuracy
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.7403


# 7. Evaluation of the Model

In [18]:
# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report (precision, recall, f1-score)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

Confusion Matrix:
[[129  21]
 [ 39  42]]

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.86      0.81       150
           1       0.67      0.52      0.58        81

    accuracy                           0.74       231
   macro avg       0.72      0.69      0.70       231
weighted avg       0.73      0.74      0.73       231

Accuracy: 0.7403


# 8. HyperPramatere Tunning

## Grid Search

In [19]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid for tuning
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
    'max_iter': [100, 200, 300]
}

In [20]:
# Create Logistic Regression model
model = LogisticRegression()

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Parameters: {'C': 10, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Best Score: 0.7858082381446868


**Summary of Key Hyperparameters to Tune:**

1. `C`: Regularization strength.
2. `penalty`: Regularization type (L1, L2, elasticnet).
3. `solver`: The optimization algorithm.
4. `max_iter`: Maximum iterations for convergence.
5. `class_weight`: Useful for imbalanced datasets.
6. `multi_class`: For multi-class classification problems.

In [22]:
# Create Logistic Regression model
model = LogisticRegression(**grid_search.best_params_)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report (precision, recall, f1-score)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

Confusion Matrix:
[[130  20]
 [ 39  42]]

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.87      0.82       150
           1       0.68      0.52      0.59        81

    accuracy                           0.74       231
   macro avg       0.72      0.69      0.70       231
weighted avg       0.74      0.74      0.74       231

Accuracy: 0.7446


## Random Search

In [23]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from scipy.stats import uniform

In [24]:
# Define the parameter distribution for tuning
param_dist = {
    'C': uniform(0.001, 100),  # Uniform distribution for C
    'penalty': ['l1', 'l2'],   # Penalty type (L1 or L2)
    'solver': ['liblinear', 'lbfgs', 'saga'],  # Solvers
    'max_iter': [100, 200, 300, 500],  # Number of iterations
    'class_weight': [None, 'balanced'],  # Class weight (balanced for imbalanced datasets)
}

In [25]:
# Create Logistic Regression model
model = LogisticRegression()

# Set up RandomizedSearchCV (with 5-fold cross-validation)
random_search = RandomizedSearchCV(estimator=model,
                                   param_distributions=param_dist,
                                   n_iter=50,    # Number of random combinations to try
                                   cv=5,         # 5-fold cross-validation
                                   random_state=42,
                                   n_jobs=-1)    # Use all available cores for parallel computation

# Fit the RandomizedSearchCV
random_search.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [26]:
# Print the best parameters and best score
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

Best Parameters: {'C': 37.455011884736244, 'class_weight': None, 'max_iter': 300, 'penalty': 'l1', 'solver': 'liblinear'}
Best Score: 0.7858082381446868


In [27]:
# Create Logistic Regression model
model = LogisticRegression(**random_search.best_params_)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report (precision, recall, f1-score)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

Confusion Matrix:
[[130  20]
 [ 39  42]]

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.87      0.82       150
           1       0.68      0.52      0.59        81

    accuracy                           0.74       231
   macro avg       0.72      0.69      0.70       231
weighted avg       0.74      0.74      0.74       231

Accuracy: 0.7446
