In [1]:
# Import necessary libraries
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

In [2]:
data = pd.read_csv('../../Input/dataset.csv')

In [3]:
# Data Preprocessing
# Split the Data into features and Target
X = data.drop('Outcome', axis=1)
y = data.Outcome

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

In [4]:
# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Perform SMOTE for class imbalance
smote = SMOTE(sampling_strategy='auto', random_state=7)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

In [5]:
# Hyperparameter tuning using GridSearchCV
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10]
}

In [6]:
# Create a logistic regression model
model = LogisticRegression()

# Fit the model on the training data
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_search.fit(X_train_resampled, y_train_resampled)
best_model = grid_search.best_estimator_

25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Anaconda\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Anaconda\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Anaconda\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.7512065         nan 0.75312152]


In [8]:
# Make predictions on the test data
y_pred = model.predict(X_test)

In [9]:
# Calculate and print the accuracy of the model
accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
print("Accuracy:", accuracy, "%")

Accuracy: 79.5 %


In [10]:
# Display the confusion matrix and classification report
confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(confusion)

report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

Confusion Matrix:
[[240  33]
 [ 49  78]]
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.88      0.85       273
           1       0.70      0.61      0.66       127

    accuracy                           0.80       400
   macro avg       0.77      0.75      0.75       400
weighted avg       0.79      0.80      0.79       400



In [11]:
# Make predictions with the best model
y_pred = best_model.predict(X_test_scaled)

In [12]:
# Evaluate the model
accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
print("Accuracy:", accuracy, "%")

Accuracy: 77.0 %


In [13]:
confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(confusion)

report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

Confusion Matrix:
[[207  66]
 [ 26 101]]
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.76      0.82       273
           1       0.60      0.80      0.69       127

    accuracy                           0.77       400
   macro avg       0.75      0.78      0.75       400
weighted avg       0.80      0.77      0.78       400



In [14]:
# Predict the condition for individual person
# 1,115,70,30,96,34.6,0.529,32,1
individual = pd.DataFrame({
    'Pregnancies': [1],
    'Glucose': [115],
    'BloodPressure': [70],
    'SkinThickness': [30],
    'Insulin': [96],
    'BMI': [34.6],
    'DiabetesPedigreeFunction': [0.529],
    'Age': [32]
})

# Handle missing values and scaling
# individual = preprocess_individual_data(individual)

individual_report = best_model.predict(individual)
if individual_report == 0:
    print('congrats... You\'re allowed to eat sweets!')
else:
    print('Time to take your diet into consideration, Champ!')

Time to take your diet into consideration, Champ!




### Conclusion: Fine tuned model performs well with 81.82% accuracy over base model with 78.57% accuracy