In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as scp
import time
import os

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

## Holdout Method:

In [4]:
# if testing size is 30%, use test_size= 0.3
def holdout_method(x,y,test_size):
    # total data sets:
    n = len(y)
    
    # training size= total data sets (1- testing size):
    train_index = int(n*(1-test_size))
   
    # training data set is from index 0 to train_index
    x_for_training = x[:train_index,:]
    # testing data set is from index (train_index) to len(y) i.e. n
    x_for_testing = x[train_index:,:]
    
    # similarly for the y:
    y_for_training = y[:train_index]
    y_for_testing = y[train_index:]
    
    return x_for_training, y_for_training, x_for_testing , y_for_testing

## K fold

In [5]:
def i_of_k_fold(x, i, k):
    n = len(x)
    l = list(range(n))
    train_index = l[0:n*(i-1)//k] + l[(n*i)//k:n]
    test_index = l[n*(i-1)//k : (n*i)//k ]
    return train_index, test_index

In [7]:
def k_fold(x_train,y_train,k):
    learning_rate_range = [10**i for i in range(-5,3)]
    mean_error_coresponding_to_Learning_Rate = dict()
    for lr in learning_rate_range:
        each_fold_error = [ ]
        for i in range(1, k+1):
            train_index , test_index = i_of_k_fold(x_train, i, k)
            w, w0= gradient_decent(x_train[train_index], y_train[train_index], lr, iteration = 30)
            each_fold_error.append(mean_error(x_train[test_index], y_train[test_index], w, w0))

    mean_error_coresponding_to_Learning_Rate[lr] = sum(each_fold_error)/len(each_fold_error)
    return mean_error_coresponding_to_Learning_Rate, each_fold_error

In [8]:
df15= pd.read_csv('heart.csv')
df15.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [9]:
df15['target'].value_counts()

1    165
0    138
Name: target, dtype: int64

In [10]:
df15.shape

(303, 14)

In [11]:
X = df15.drop(columns='target', axis=1)
Y = df15['target']

In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=101)

In [14]:
# X_train, X_test, Y_train, Y_test = holdout_method(X, Y, 0.3)

In [16]:
print(X.shape, X_train.shape, X_test.shape)

(303, 13) (242, 13) (61, 13)


In [17]:
model = LogisticRegression(solver='liblinear', max_iter=100)

In [18]:
# training the LogisticRegression model with Training data
model.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [20]:
# print(model.get_params())

In [21]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [22]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.8677685950413223


In [23]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [21]:
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.819672131147541


In [25]:
X_test_prediction = model.predict(X_test)
test_data_precision = precision_score(X_test_prediction, Y_test)

In [26]:
print('Precision on Test data : ', test_data_precision)

Precision on Test data :  0.8787878787878788
