In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for root, _, names in os.walk('/kaggle/input'):
    # Print the full path of every file found beneath the input directory.
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Data

In [2]:
# Load the Framingham CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv('../input/heart-disease-prediction-using-logistic-regression/framingham.csv')

In [3]:
# Display the freshly loaded frame for a first look.
data

# Assessing Missing Data

In [4]:
# Summarise column dtypes and non-null counts.
data.info()

In [5]:
# Count the missing values in each column.
data.isnull().sum()

# Dealing with missing data

In [6]:
# List the column names before deciding how to handle missing values.
data.columns

In [7]:
# Column subset for the mean-imputation experiment that is commented out in a
# later cell. NOTE(review): no active code in the visible notebook reads this
# list -- confirm whether it can be removed.
columns_data= ['age', 'education', 'cigsPerDay', 'totChol', 'sysBP',
               'diaBP', 'BMI', 'heartRate', 'glucose','TenYearCHD']

In [8]:
#data.dropna(subset = ['male', 'currentSmoker', 'BPMeds','cigsPerDay',
#       'prevalentStroke', 'prevalentHyp', 'diabetes','TenYearCHD'], inplace=True)

In [9]:
# Drop every row that has at least one missing value (modifies `data` in place).
data.dropna(inplace = True)

In [10]:
#data[columns_data].fillna(data[columns_data].mean(), inplace = True)

In [11]:
# Display the frame after the rows with missing values were dropped.
data

**Adding a Female Indicator Column**

In [12]:
# Inspect the raw values stored in the `male` column.
data['male'].values

In [13]:
# Build the complementary indicator: swap 1 <-> 0 in `male` and store the
# result positionally (as a plain array) in a new `female` column.
data['female'] = data['male'].replace({1: 0, 0: 1}).to_numpy()

In [14]:
# Confirm that the `female` column was appended.
data.columns



**Re-ordering Columns**

In [15]:
# Re-select the columns in the desired order: the two sex indicators first,
# then the remaining predictors, with the TenYearCHD target last.
data = data[['male','female', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD']]
# Display the re-ordered frame.
data

In [16]:
# Replace the index (left with gaps by the earlier row drop) with a fresh
# 0..n-1 range, discarding the old index values.
data.reset_index(drop = True, inplace = True)

# Re-assessment

In [17]:
# Display the cleaned, re-indexed frame.
data

In [18]:
# Re-check dtypes and non-null counts after cleaning.
data.info()

In [19]:
# Re-count missing values per column after cleaning.
data.isnull().sum()

In [20]:
# Show the frame's contents as a NumPy array.
data.values

# Feature Scaling

In [21]:
# from sklearn.preprocessing import StandardScaler

# scaled_val = StandardScaler().fit_transform(data.values)
# data = pd.DataFrame(scaled_val , index = data.index , columns = data.columns)

from sklearn.preprocessing import StandardScaler

# NOTE(review): the scaler is fitted on the full dataset here, before the
# train/test split performed later in the notebook, so the rows that end up
# in the test set influence the statistics used for scaling.
scaler = StandardScaler()

# Columns to standardise. The remaining columns (male, female, currentSmoker,
# BPMeds, prevalentStroke, prevalentHyp, diabetes and the TenYearCHD target)
# are not listed and therefore stay unscaled.
scal_col = [ 'age', 'education', 'cigsPerDay',
             'totChol', 'sysBP',
             'diaBP', 'BMI', 'heartRate', 'glucose']

# Overwrite the listed columns in `data` with their standardised values.
data[scal_col] = scaler.fit_transform(data[scal_col])

In [22]:
# Display the frame after scaling.
data

# Data separation for training/testing

In [23]:
# Pull the target column out as a NumPy array, then remove it from `data` so
# only predictor columns remain.
# NOTE(review): this cell is not idempotent -- running it a second time fails
# because 'TenYearCHD' has already been dropped in place.
Y = data['TenYearCHD'].to_numpy()
data.drop( columns = 'TenYearCHD' , inplace = True)
# Display the predictors-only frame.
data



In [24]:
# Feature matrix as a NumPy array (the target column was removed from `data` above).
X = data.to_numpy()

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test , y_train , y_test = train_test_split(X,Y, test_size = 0.2 , random_state = 40)

In [26]:
# Show the shapes of the training features and the training labels.
display(X_train.shape, y_train.shape)


In [27]:
# Number of columns left in `data` after the target was dropped.
# NOTE(review): no later cell shown here reads this global (`cost` and `grad`
# define their own local `n`) -- confirm whether it is still needed.
n = len(data.columns)

# Defining Classification Functions

In [28]:
# Turn both label vectors into (m, 1) column vectors.
m_train, m_test = len(y_train), len(y_test)
y_train = y_train.reshape(m_train, 1)
y_test = y_test.reshape(m_test, 1)

# Prepend a column of ones to each feature matrix so that the first weight
# multiplies a constant (the bias term).
X_train = np.hstack((np.ones((m_train, 1)), X_train))
X_test = np.hstack((np.ones((m_test, 1)), X_test))

In [29]:
def sigmoid_fun(X, theta):
    """Return the logistic function applied element-wise to ``X . theta``.

    Parameters
    ----------
    X : array_like
        Left operand of the dot product (one row per sample when 2-D).
    theta : array_like
        Right operand of the dot product (the weights).

    Returns
    -------
    Same shape as ``np.dot(X, theta)``, each entry equal to ``1 / (1 + exp(-z))``.
    """
    linear_term = np.dot(X, theta)
    denominator = 1 + np.exp(-linear_term)
    return 1 / denominator

def cost(theta,X,y,lamb):
    """L2-regularised logistic-regression cost (mean binary cross-entropy).

    Computes

        J = -(1/m) * sum( y*log(h) + (1-y)*log(1-h) )
            + (lamb / (2*m)) * sum(theta[1:]**2)

    where ``h = sigmoid(X . theta)``.

    Parameters
    ----------
    theta : ndarray
        Weights, either 1-D of length n or shaped (n, 1). The first entry is
        excluded from the regularisation penalty.
    X : ndarray, shape (m, n)
        Feature matrix, one row per sample.
    y : ndarray
        Binary labels, 1-D of length m or shaped (m, 1).
    lamb : float
        Regularisation strength; 0 disables the penalty.

    Returns
    -------
    ndarray, shape (1, 1)
        The cost value.
    """
    m = len(y)
    n = len(theta)
    y = y.reshape(m, 1)
    theta = theta.reshape(n, 1)

    z = np.dot(X, theta)

    # log(h) and log(1 - h) are evaluated through logaddexp rather than by
    # taking the log of the sigmoid, so a saturated sigmoid (h == 0 or h == 1
    # in floating point) does not produce log(0) = -inf.
    log_h = -np.logaddexp(0, -z)            # log(sigmoid(z))
    log_one_minus_h = -np.logaddexp(0, z)   # log(1 - sigmoid(z))

    # The second term must use log(1 - h); the previous 1 - log(h) was not
    # the cross-entropy.
    j = -np.dot(y.T, log_h) - np.dot((1 - y).T, log_one_minus_h)

    J = j / m
    # Penalise every weight except the first one (the bias).
    J = J + (lamb * np.sum(theta[1:, :] ** 2)) / (2 * m)

    return J

def grad(theta, X, y, lamb):
    """Gradient of the regularised logistic-regression cost w.r.t. ``theta``.

    Parameters
    ----------
    theta : ndarray
        Weights, 1-D of length n or shaped (n, 1).
    X : ndarray, shape (m, n)
        Feature matrix, one row per sample.
    y : ndarray
        Labels, 1-D of length m or shaped (m, 1).
    lamb : float
        Regularisation strength.

    Returns
    -------
    ndarray, shape (n, 1)
        ``X.T . (sigmoid(X . theta) - y) / m`` with ``(lamb / m) * theta``
        added to every row except the first.
    """
    num_samples = len(y)
    num_weights = len(theta)
    targets = y.reshape(num_samples, 1)
    weights = theta.reshape(num_weights, 1)

    residual = sigmoid_fun(X, weights) - targets
    gradient = np.dot(X.T, residual) / num_samples
    # The first weight (row 0) receives no regularisation term.
    gradient[1:, :] += (lamb / num_samples) * weights[1:, :]
    return gradient

# Running Optimizer for final weights


In [30]:
import scipy.optimize as op
# Regularisation strength handed to `cost` / `grad`; 0 switches the penalty off.
lamb = 0

# Start the search from all-zero weights, one per column of X_train.
theta = np.zeros((X_train.shape[1], 1))

# Minimise `cost` with the truncated-Newton solver, using `grad` as the
# analytic gradient.
result = op.fmin_tnc(func=cost, x0=theta, fprime=grad, args=(X_train, y_train, lamb))

# The first element of the returned tuple is the solution vector; keep it as a column.
finaltheta = result[0].reshape((-1, 1))

In [31]:
# Sigmoid outputs for the test rows using the fitted weights.
y_pred = sigmoid_fun(X_test , finaltheta)
#y_pred

In [32]:
# Number of test predictions.
len(y_pred)

In [33]:
# Convert the sigmoid outputs to hard labels in place: entries >= 0.5 become
# 1, everything else becomes 0 (the array keeps its shape and dtype).
y_pred[:] = y_pred >= 0.5
        

#y_pred
    

# Results

In [34]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true test labels (first argument) against predicted labels.
confusion_matrix( y_test , y_pred)

In [35]:
#y_test
from sklearn.metrics import classification_report

# Print the text classification report for the test-set predictions.
print(classification_report(y_test , y_pred))