# Disease Classification
Build a model to predict whether a person has diabetes based on health metrics such as
BMI, glucose level, and age. Handle missing values and imbalanced classes (logistic regression).

Logistic Regression

In [4]:
import pandas as pd

# Read the raw dataset from 'diabetes.csv' (path is relative to the working directory).
data1 = pd.read_csv(filepath_or_buffer='diabetes.csv')

# Display the loaded frame as the cell output.
data1

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [5]:
# Count NaN entries per column (isna is the pandas alias of isnull).
# NOTE(review): the preview rows show 0 for Insulin / SkinThickness; presumably zeros
# encode missing measurements, which this check does not count - confirm.
data1.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [6]:
# Feature columns used for the quartile computation (the target "Outcome" is left out).
feature_names = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
feature_frame = data1[feature_names]

# First and third quartile of every feature column.
Q1 = feature_frame.quantile(0.25)
Q3 = feature_frame.quantile(0.75)

# Interquartile range per feature; shown as the cell output.
IQR = Q3 - Q1

IQR

Pregnancies                   5.0000
Glucose                      41.2500
BloodPressure                18.0000
SkinThickness                32.0000
Insulin                     127.2500
BMI                           9.3000
DiabetesPedigreeFunction      0.3825
Age                          17.0000
dtype: float64

In [7]:
# Same feature columns that Q1 / Q3 / IQR were computed on.
candidate_features = data1[
    [
        'Pregnancies',
        'Glucose',
        'BloodPressure',
        'SkinThickness',
        'Insulin',
        'BMI',
        'DiabetesPedigreeFunction',
        'Age',
    ]
]

# Fences at 1.5 * IQR below the first and above the third quartile.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Boolean frame: True where a value lies outside the fences of its column.
outliers = (candidate_features < lower_fence) | (candidate_features > upper_fence)

outliers

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...
763,False,False,False,False,False,False,False,False
764,False,False,False,False,False,False,False,False
765,False,False,False,False,False,False,False,False
766,False,False,False,False,False,False,False,False


In [8]:
# Keep only the rows in which no feature column was flagged as an outlier.
# `outliers.any(axis=1)` is True for a row with at least one flagged value,
# so its negation selects the rows to retain. The original index labels are kept.
data2 = data1[~outliers.any(axis=1)]

# Display the filtered frame (the previous `data2k` was an undefined name and
# raised NameError on a fresh run).
data2

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
5,5,116,74,0,0,25.6,0.201,30,0
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [9]:
from sklearn.model_selection import train_test_split


# Target vector: the "Outcome" column of the outlier-filtered frame.
y = data2["Outcome"]

# Feature matrix: every remaining column.
x = data2.drop(columns="Outcome")

x, y

(     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
 0              6      148             72             35        0  33.6   
 1              1       85             66             29        0  26.6   
 2              8      183             64              0        0  23.3   
 3              1       89             66             23       94  28.1   
 5              5      116             74              0        0  25.6   
 ..           ...      ...            ...            ...      ...   ...   
 763           10      101             76             48      180  32.9   
 764            2      122             70             27        0  36.8   
 765            5      121             72             23      112  26.2   
 766            1      126             60              0        0  30.1   
 767            1       93             70             31        0  30.4   
 
      DiabetesPedigreeFunction  Age  
 0                       0.627   50  
 1                    

In [10]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified on y; with imbalanced classes consider
# passing stratify=y - confirm before changing, as it alters the split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

x_train, x_test, y_train, y_test

(     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
 722            1      149             68             29      127  29.3   
 213            0      140             65             26      130  42.6   
 343            5      122             86              0        0  34.7   
 306           10      161             68             23      132  25.5   
 304            3      150             76              0        0  21.0   
 ..           ...      ...            ...            ...      ...   ...   
 91             4      123             80             15      176  32.0   
 132            3      170             64             37      225  34.5   
 328            2      102             86             36      120  45.5   
 527            3      116             74             15      105  26.3   
 128            1      117             88             24      145  34.5   
 
      DiabetesPedigreeFunction  Age  
 722                     0.349   42  
 213                  

In [11]:
from sklearn.linear_model import LogisticRegression


# Logistic regression with a raised iteration cap (max_iter=1000).
# NOTE(review): no class weighting is configured; the task statement asks for
# imbalanced-class handling, so class_weight="balanced" may be wanted - confirm.
model_output = LogisticRegression(max_iter=1000)

# Fit on the training split and keep the value returned by fit() under `result`.
result = model_output.fit(X=x_train, y=y_train)

result

In [12]:
# Predictions on the TRAINING features (not the held-out test set).
y_pred = result.predict(X=x_train)

In [13]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 comparing the training labels with y_pred.
report = classification_report(y_true=y_train, y_pred=y_pred)

print(f'C_report: {report}')

C_report:               precision    recall  f1-score   support

           0       0.81      0.90      0.85       347
           1       0.72      0.54      0.62       164

    accuracy                           0.79       511
   macro avg       0.77      0.72      0.74       511
weighted avg       0.78      0.79      0.78       511



In [14]:
# Score of the fitted model on the training split.
result.score(X=x_train, y=y_train)

0.786692759295499

In [15]:
# Score of the fitted model on the held-out test split.
result.score(X=x_test, y=y_test)

0.8125