In [1]:
import pandas as pd


In [3]:
# Load the dataset from 'diabetes.csv' (path is relative to the notebook's working directory).
data = pd.read_csv('diabetes.csv')

In [4]:
# Preview the first rows (head() defaults to five) to sanity-check the load.
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [7]:
# List the column labels of the loaded frame.
print('coloumn names:',data.columns)

coloumn names: Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


In [10]:
# Count NaN entries per column. This only detects real NaNs; zero values used as
# placeholders are not counted here.
print('Check for null values:',data.isnull().sum())

Check for null values: Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [12]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
print('data set statistics:',data.describe())

data set statistics:        Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min      0.000

In [13]:
import numpy as np


In [15]:
# Columns in which a stored 0 is treated as "not recorded" rather than a real reading.
cols_with_zero = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]

# Swap each 0 for NaN, then fill the gaps with the mean of the remaining values
# in that column (mean() skips NaN, so the placeholders do not drag it down).
# NOTE(review): the means are taken over every row of `data`, i.e. before the
# train/test split performed later in the notebook, so held-out rows influence
# the fill values. Consider imputing after the split using training rows only.
for name in cols_with_zero:
    observed = data[name].replace(0, np.nan)
    data[name] = observed.fillna(observed.mean())

In [16]:
from sklearn.model_selection import train_test_split

# Target is the 'Outcome' column; every other column is used as a predictor.
y = data['Outcome']
X = data.drop('Outcome', axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Training Set Size:", X_train.shape)
print("Testing Set Size:", X_test.shape)


Training Set Size: (614, 8)
Testing Set Size: (154, 8)


In [17]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier trained on the unscaled training features.
# max_iter is raised to 1000 to give the solver more iterations.
# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("Model training completed successfully!")


Model training completed successfully!


In [18]:
from sklearn.metrics import accuracy_score, classification_report

# Score the baseline model on the held-out rows.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Per-class precision / recall / F1, built once and then printed.
report = classification_report(y_true=y_test, y_pred=y_pred)

print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", report)


Model Accuracy: 75.32%
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.83      0.81        99
           1       0.67      0.62      0.64        55

    accuracy                           0.75       154
   macro avg       0.73      0.72      0.73       154
weighted avg       0.75      0.75      0.75       154



In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Hyperparameter search for a class-weighted logistic regression (5-fold CV).
#
# The scaler is a pipeline step so that, inside cross-validation, each fold is
# standardized with statistics from its own training portion only. Fitting the
# scaler on all of X_train before the search lets every validation fold leak
# into the preprocessing and biases the CV scores.
search_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    # random_state pins the data shuffling done by the 'saga' and 'liblinear'
    # solvers so that re-running the cell reproduces the same model.
    ('clf', LogisticRegression(max_iter=5000, class_weight='balanced', random_state=42)),
])

# Candidate values, keyed by plain LogisticRegression argument names.
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'saga']
}
# GridSearchCV addresses pipeline-step parameters as '<step>__<param>'.
pipeline_grid = {f'clf__{name}': values for name, values in param_grid.items()}

grid = GridSearchCV(search_pipeline, pipeline_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(X_train, y_train)

# Report the winners under their plain argument names (step prefix removed).
best_params = {name.split('__', 1)[1]: value for name, value in grid.best_params_.items()}
print("Best Parameters:", best_params)

# With the default refit=True, GridSearchCV has already refit the winning
# pipeline on all of X_train, so its fitted steps are reused instead of training
# a second model. `scaler` / `best_model` keep their previous meaning: a
# StandardScaler fit on X_train and a LogisticRegression fit on the scaled
# training features.
scaler = grid.best_estimator_.named_steps['scaler']
best_model = grid.best_estimator_.named_steps['clf']
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# NOTE(review): class_weight='balanced' re-weights the classes, which can lower
# plain accuracy while improving minority-class recall; consider reporting
# per-class metrics alongside this number before calling the model "optimized".
y_pred_best = best_model.predict(X_test_scaled)
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f"Optimized Model Accuracy: {accuracy_best * 100:.2f}%")


Best Parameters: {'C': 0.1, 'solver': 'saga'}
Optimized Model Accuracy: 70.78%
