In [1]:
import pandas as pd
import numpy as np

url = r"C:\Users\shrey\Downloads\archive (1).zip"
df = pd.read_csv(url)
print(df.head())
print(df.info())
print(df.describe())


   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768

In [2]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [3]:
df['Outcome'].value_counts()


Outcome
0    500
1    268
Name: count, dtype: int64

In [4]:
df['Outcome'].value_counts(normalize = "Ture")

Outcome
0    0.651042
1    0.348958
Name: proportion, dtype: float64

In [5]:
column_with_zero_issue = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

for col in column_with_zero_issue :
    print(col, (df[col] == 0).sum)

Glucose <bound method Series.sum of 0      False
1      False
2      False
3      False
4      False
       ...  
763    False
764    False
765    False
766    False
767    False
Name: Glucose, Length: 768, dtype: bool>
BloodPressure <bound method Series.sum of 0      False
1      False
2      False
3      False
4      False
       ...  
763    False
764    False
765    False
766    False
767    False
Name: BloodPressure, Length: 768, dtype: bool>
SkinThickness <bound method Series.sum of 0      False
1      False
2       True
3      False
4      False
       ...  
763    False
764    False
765    False
766     True
767    False
Name: SkinThickness, Length: 768, dtype: bool>
Insulin <bound method Series.sum of 0       True
1       True
2       True
3      False
4      False
       ...  
763    False
764     True
765    False
766     True
767     True
Name: Insulin, Length: 768, dtype: bool>
BMI <bound method Series.sum of 0      False
1      False
2      False
3      False
4      False

In [6]:
for col in column_with_zero_issue:
    median = df[col].median()
    df[col] = df[col].replace(0, median)

In [7]:
for col in column_with_zero_issue :
    print(col, (df[col] == 0).sum)

Glucose <bound method Series.sum of 0      False
1      False
2      False
3      False
4      False
       ...  
763    False
764    False
765    False
766    False
767    False
Name: Glucose, Length: 768, dtype: bool>
BloodPressure <bound method Series.sum of 0      False
1      False
2      False
3      False
4      False
       ...  
763    False
764    False
765    False
766    False
767    False
Name: BloodPressure, Length: 768, dtype: bool>
SkinThickness <bound method Series.sum of 0      False
1      False
2      False
3      False
4      False
       ...  
763    False
764    False
765    False
766    False
767    False
Name: SkinThickness, Length: 768, dtype: bool>
Insulin <bound method Series.sum of 0      False
1      False
2      False
3      False
4      False
       ...  
763    False
764    False
765    False
766    False
767    False
Name: Insulin, Length: 768, dtype: bool>
BMI <bound method Series.sum of 0      False
1      False
2      False
3      False
4      False

In [8]:
from sklearn.model_selection import train_test_split 
x = df.drop('Outcome', axis=1)
y = df['Outcome']

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42, test_size = 0.2, stratify = y)

In [9]:
print("Train distribution:")
print(y_train.value_counts(normalize=True))
print("\nTest distribution:")
print(y_test.value_counts(normalize=True))

Train distribution:
Outcome
0    0.651466
1    0.348534
Name: proportion, dtype: float64

Test distribution:
Outcome
0    0.649351
1    0.350649
Name: proportion, dtype: float64


In [10]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
print("accuracy_score", accuracy_score(y_test, y_pred))
print("precision_score", precision_score(y_test, y_pred))
print("recall_score", recall_score(y_test, y_pred))
print("confusion_matrix", confusion_matrix(y_test, y_pred))

accuracy_score 0.7012987012987013
precision_score 0.5833333333333334
recall_score 0.5185185185185185
confusion_matrix [[80 20]
 [26 28]]


In [11]:
model_balanced = LogisticRegression(max_iter=1000, class_weight='balanced')
model_balanced.fit(x_train, y_train)
y_pred_balanced = model_balanced.predict(x_test)
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
print("accuracy_score", accuracy_score(y_test, y_pred_balanced))
print("precision_score", precision_score(y_test, y_pred_balanced))
print("recall_score", recall_score(y_test, y_pred_balanced))
print("confusion_matrix", confusion_matrix(y_test, y_pred_balanced))

accuracy_score 0.7272727272727273
precision_score 0.59375
recall_score 0.7037037037037037
confusion_matrix [[74 26]
 [16 38]]


In [12]:
feature_importance = pd.DataFrame({
    "Feature": x.columns,
    "Coefficient": model_balanced.coef_[0],})

feature_importance = feature_importance.sort_values(
by = "Coefficient",
ascending=False,)

print(feature_importance)


                    Feature  Coefficient
6  DiabetesPedigreeFunction     0.835362
0               Pregnancies     0.113436
5                       BMI     0.109419
1                   Glucose     0.041248
7                       Age     0.014912
3             SkinThickness     0.000324
4                   Insulin    -0.001347
2             BloodPressure    -0.002460


In [13]:
print(model_balanced.coef_)

[[ 1.13435728e-01  4.12479713e-02 -2.45950234e-03  3.23501578e-04
  -1.34735023e-03  1.09419134e-01  8.35361873e-01  1.49122175e-02]]


In [14]:
x.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [15]:
import joblib

joblib.dump(model_balanced, "diabetes_model.pkl")


['diabetes_model.pkl']

In [16]:
import os
print(os.getcwd())


C:\Users\shrey


In [18]:
model_balanced.coef_[0]

array([ 1.13435728e-01,  4.12479713e-02, -2.45950234e-03,  3.23501578e-04,
       -1.34735023e-03,  1.09419134e-01,  8.35361873e-01,  1.49122175e-02])