# logistic regression model for heart_statlog_cleveland_hungary dataset

In [146]:
## The dataset used here is already about 85% cleaned, so no further preprocessing is required.

In [66]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 

In [111]:
# Load the heart-disease CSV (the path is relative to the notebook's working directory)
# and show the resulting DataFrame as the cell output.
df = pd.read_csv("heart_statlog_cleveland_hungary_final.csv")
df

Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,target
0,40,1,2,140,289,0,0,172,0,0.0,1,0
1,49,0,3,160,180,0,0,156,0,1.0,2,1
2,37,1,2,130,283,0,1,98,0,0.0,1,0
3,48,0,4,138,214,0,0,108,1,1.5,2,1
4,54,1,3,150,195,0,0,122,0,0.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1185,45,1,1,110,264,0,0,132,0,1.2,2,1
1186,68,1,4,144,193,1,0,141,0,3.4,2,1
1187,57,1,4,130,131,0,0,115,1,1.2,2,1
1188,57,0,2,130,236,0,2,174,0,0.0,2,1


In [90]:
# Shape of the data as a (number of rows, number of columns) tuple
df.shape

(1190, 12)

In [91]:
# Basic summary statistics (count, mean, std, min, quartiles, max) for each column.
# NOTE(review): the recorded output shows a minimum of 0 for resting_bp_s and cholesterol;
# these may be placeholders for missing measurements - confirm before relying on them.
df.describe()

Unnamed: 0,age,sex,chest_pain_type,resting_bp_s,cholesterol,fasting_blood_sugar,resting_ecg,max_heart_rate,exercise_angina,oldpeak,st_slope,target
count,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0
mean,53.720168,0.763866,3.232773,132.153782,210.363866,0.213445,0.698319,139.732773,0.387395,0.922773,1.62437,0.528571
std,9.358203,0.424884,0.93548,18.368823,101.420489,0.409912,0.870359,25.517636,0.48736,1.086337,0.610459,0.499393
min,28.0,0.0,1.0,0.0,0.0,0.0,0.0,60.0,0.0,-2.6,0.0,0.0
25%,47.0,1.0,3.0,120.0,188.0,0.0,0.0,121.0,0.0,0.0,1.0,0.0
50%,54.0,1.0,4.0,130.0,229.0,0.0,0.0,140.5,0.0,0.6,2.0,1.0
75%,60.0,1.0,4.0,140.0,269.75,0.0,2.0,160.0,1.0,1.6,2.0,1.0
max,77.0,1.0,4.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,3.0,1.0


In [92]:
# Cell-by-cell missing-value mask: each entry is True where the value is null/NaN and
# False otherwise. This shows the full boolean frame rather than a per-column summary.
df.isnull()

Unnamed: 0,age,sex,chest_pain_type,resting_bp_s,cholesterol,fasting_blood_sugar,resting_ecg,max_heart_rate,exercise_angina,oldpeak,st_slope,target
0,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
1185,False,False,False,False,False,False,False,False,False,False,False,False
1186,False,False,False,False,False,False,False,False,False,False,False,False
1187,False,False,False,False,False,False,False,False,False,False,False,False
1188,False,False,False,False,False,False,False,False,False,False,False,False


In [93]:
# Number of null/NaN values in each column (0 means the column has no missing entries)
df.isnull().sum()

age                    0
sex                    0
chest_pain_type        0
resting_bp_s           0
cholesterol            0
fasting_blood_sugar    0
resting_ecg            0
max_heart_rate         0
exercise_angina        0
oldpeak                0
st_slope               0
target                 0
dtype: int64

In [72]:
# Preview the first five rows of the heart dataset
df.head()

Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,target
0,40,1,2,140,289,0,0,172,0,0.0,1,0
1,49,0,3,160,180,0,0,156,0,1.0,2,1
2,37,1,2,130,283,0,1,98,0,0.0,1,0
3,48,0,4,138,214,0,0,108,1,1.5,2,1
4,54,1,3,150,195,0,0,122,0,0.0,1,0


In [95]:
from sklearn.model_selection import train_test_split  # It is used to divide/split the data in train and test  
from sklearn.preprocessing import StandardScaler  # It is used to bring the datapoints in the same scale (data scaling)
from sklearn.linear_model import LogisticRegression   # It is used to build the logistic regression model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix  # it cosists of model evaluation parameters

In [96]:
# Normalise the column labels: trim surrounding whitespace, lower-case them and
# replace every space with "_" (e.g. "ST slope" becomes "st_slope").
df.columns = [label.strip().lower().replace(" ", "_") for label in df.columns]

In [97]:
# y: the label column the model learns to predict.
# X: every remaining column, used as the input features for training.

y = df["target"]
X = df.drop(columns=["target"])


In [98]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# stratify=y keeps the class ratio of the target the same in both parts;
# random_state pins the shuffle so the split is repeatable.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [77]:
# Bring all features onto the same scale. The scaler is fitted on the training
# rows only and then applied unchanged to both the training and the test split.

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [99]:
# Build the scikit-learn logistic-regression classifier (at most 1000 solver
# iterations) and fit it on the scaled training data; the fitted estimator is
# shown as the cell output.

model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)
model

In [100]:
# Predict a class label for every row of the scaled test set and print the label array
y_pred = model.predict(X_test_scaled)
print(y_pred)

[1 0 1 1 0 1 0 0 0 1 1 1 0 1 1 0 1 0 1 1 1 0 0 0 1 1 1 0 0 1 0 1 0 1 1 0 0
 0 0 0 1 0 1 0 1 0 0 0 1 1 1 0 1 0 0 0 1 0 0 1 1 0 1 1 0 0 1 1 0 0 0 1 1 1
 0 0 1 0 0 1 1 1 0 1 1 0 0 1 1 0 0 0 1 1 1 1 1 1 0 1 1 0 0 1 1 1 1 0 0 1 1
 1 0 0 1 1 0 0 1 1 0 1 1 0 1 1 1 0 1 1 0 1 0 1 1 1 1 1 0 1 1 1 1 0 1 0 1 1
 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 1 1 1 0 1 1 1 1 1 0 1 1 1 0 0 0 1 1 1 1
 0 0 0 1 1 1 1 1 0 1 0 1 1 0 1 0 0 0 0 0 1 1 0 1 1 0 0 0 0 1 1 1 0 0 1 0 0
 0 1 1 0 1 0 1 0 0 1 0 0 0 1 1 1]


In [102]:
# Accuracy of the heart model on the test split: the fraction of test rows whose
# predicted label equals the true label (about 0.84, i.e. ~84%, in the recorded output)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.8403361344537815


In [104]:
# Learned weights (coefficients) of the heart model, one per feature column of X.
# The model was fitted on standardised features, so the weights are on that scale.

model.coef_

array([[ 0.21727569,  0.62130291,  0.69304451,  0.13114537, -0.25181841,
         0.38303257,  0.00717206, -0.22052714,  0.5086149 ,  0.40022507,
         0.78121404]])

In [105]:
# Intercept (bias term) of the fitted heart model's linear equation

model.intercept_

array([0.22567718])

# logistic regression model for Diabetes dataset

In [106]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 

In [109]:
# Load the diabetes CSV (the path is relative to the notebook's working directory)
# and show the resulting DataFrame as the cell output.
df1 = pd.read_csv("diabetes.csv")
df1

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [115]:
# Shape of the diabetes data as a (number of rows, number of columns) tuple
df1.shape

(768, 9)

In [116]:
# Basic summary statistics (count, mean, std, min, quartiles, max) for each column.
# NOTE(review): the recorded output shows a minimum of 0 for Glucose, BloodPressure,
# SkinThickness, Insulin and BMI; these may be placeholders for missing measurements -
# confirm before relying on them.
df1.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [117]:
# Cell-by-cell missing-value mask: each entry is True where the value is null/NaN and
# False otherwise. This shows the full boolean frame rather than a per-column summary.
df1.isnull()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
763,False,False,False,False,False,False,False,False,False
764,False,False,False,False,False,False,False,False,False
765,False,False,False,False,False,False,False,False,False
766,False,False,False,False,False,False,False,False,False


In [118]:
# Number of null/NaN values in each column (0 means the column has no missing entries)
df1.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [119]:
from sklearn.model_selection import train_test_split  # It is used to divide/split the data in train and test  
from sklearn.preprocessing import StandardScaler  # It is used to bring the datapoints in the same scale (data scaling)
from sklearn.linear_model import LogisticRegression   # It is used to build the logistic regression model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix  # it cosists of model evaluation parameters

In [123]:
# y_1: the label column ("Outcome") the diabetes model learns to predict.
# X_1: every remaining column, used as the input features for training.

y_1 = df1["Outcome"]
X_1 = df1.drop(columns=["Outcome"])

In [125]:
# Hold out 20% of the diabetes rows for testing and train on the remaining 80%.
# The split is stratified on y_1 (this dataset's own Outcome labels) so both parts
# keep the same class ratio. Stratifying on `y` would use the labels of the heart
# dataset, which belong to different rows and have a different length.

X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    X_1, y_1, test_size=0.2, random_state=42, stratify=y_1
)


In [127]:
# Bring all diabetes features onto the same scale. The scaler is fitted on the
# training rows only and then applied unchanged to both splits.

scaler_1 = StandardScaler().fit(X_train_1)
X_train_scaled_1 = scaler_1.transform(X_train_1)
X_test_scaled_1 = scaler_1.transform(X_test_1)

In [137]:
# Build the scikit-learn logistic-regression classifier (at most 1000 solver
# iterations) and fit it on the scaled diabetes training data; the fitted
# estimator is shown as the cell output.

model_1 = LogisticRegression(max_iter=1000).fit(X_train_scaled_1, y_train_1)
model_1

In [138]:
# Predict a class label for every row of the scaled diabetes test set and print the label array
y_pred_1 = model_1.predict(X_test_scaled_1)
print(y_pred_1)

[1 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 1 0 0 1 0 1 1 0 1 0 0 0 0 0 0 1 1 0 0
 0 1 1 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1 0 1 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1 1 0 0 0 0 0 1 0 1 0 1 0 0
 1 0 0 0 0 0 0 1 0 1 0 0 1 0 1 1 1 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1
 1 0 0 0 1 0]


In [139]:
# Accuracy of the diabetes model on its test split: the fraction of test rows whose
# predicted label equals the true label (about 0.71, i.e. ~71%, in the recorded output)
print("Accuracy:", accuracy_score(y_test_1, y_pred_1))

Accuracy: 0.7142857142857143


In [140]:
# Learned weights (coefficients) of the diabetes model, one per feature column of X_1.
# They are read from model_1; `model` is the classifier fitted on the heart dataset.

model_1.coef_

array([[ 0.37317821,  1.14415127, -0.19763683,  0.06653497, -0.12730823,
         0.71389341,  0.25552675,  0.18417899]])

In [141]:
# Intercept (bias term) of the fitted diabetes model's linear equation.
# It is read from model_1; `model` is the classifier fitted on the heart dataset.

model_1.intercept_

array([-0.87496049])