In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt

In [2]:
# Load dataset
data = pd.read_csv('heart.csv')

In [3]:
# Check for missing values and outliers
print(data.isnull().sum())
print(data.describe())

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64
              Age   RestingBP  Cholesterol   FastingBS       MaxHR  \
count  918.000000  918.000000   918.000000  918.000000  918.000000   
mean    53.510893  132.396514   198.799564    0.233115  136.809368   
std      9.432617   18.514154   109.384145    0.423046   25.460334   
min     28.000000    0.000000     0.000000    0.000000   60.000000   
25%     47.000000  120.000000   173.250000    0.000000  120.000000   
50%     54.000000  130.000000   223.000000    0.000000  138.000000   
75%     60.000000  140.000000   267.000000    0.000000  156.000000   
max     77.000000  200.000000   603.000000    1.000000  202.000000   

          Oldpeak  HeartDisease  
count  918.000000    918.000000  
mean     0.887364      0.553377  
std      1.066

In [4]:
# Convert categorical variables to numerical values
le = LabelEncoder()
data['Sex'] = le.fit_transform(data['Sex'])
data['ExerciseAngina'] = le.fit_transform(data['ExerciseAngina'])
cp_type = pd.get_dummies(data['ChestPainType'], prefix='ChestPainType')
data = pd.concat([data, cp_type], axis=1)
rest_ecg = pd.get_dummies(data['RestingECG'], prefix='RestingECG')
data = pd.concat([data, rest_ecg], axis=1)
st_slope = pd.get_dummies(data['ST_Slope'], prefix='ST_Slope')
data = pd.concat([data, st_slope], axis=1)
data.drop(['ChestPainType', 'RestingECG', 'ST_Slope'], axis=1, inplace=True)

In [5]:
# Split into training and testing data
X = data.drop('HeartDisease', axis=1)
y = data['HeartDisease']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Scale the numerical variables
sc = StandardScaler()
num_vars = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
X_train[num_vars] = sc.fit_transform(X_train[num_vars])
X_test[num_vars] = sc.transform(X_test[num_vars])

In [7]:
# Build logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression()

In [8]:
# Predict on test data and evaluate performance
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
print('Accuracy:', accuracy)
print('Confusion Matrix:', confusion_mat)

Accuracy: 0.8532608695652174
Confusion Matrix: [[67 10]
 [17 90]]


In [9]:
# Tuning hyperparameters using GridSearchCV
from sklearn.model_selection import GridSearchCV

parameters = {'C': np.logspace(-4, 4, 20), 'penalty': ['l1', 'l2']}
grid_search = GridSearchCV(LogisticRegression(), parameters, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
print('Best parameters:', grid_search.best_params_)


Best parameters: {'C': 4.281332398719396, 'penalty': 'l2'}


100 fits failed out of a total of 200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "D:\anaconda 3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\anaconda 3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "D:\anaconda 3\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.81742615        nan 0.84329513        nan 0.

In [20]:
# Deploy the model to predict heart disease risk in new patients
new_patient = pd.DataFrame({
    'Age': [60],
    'Sex': ['M'],
    'ChestPainType': ['ATA'],
    'RestingBP': [140],
    'Cholesterol': [300],
    'FastingBS': [1],
    'RestingECG': ['Normal'],
    'MaxHR': [120],
    'ExerciseAngina': ['N'],
    'Oldpeak': [2.0],
    'ST_Slope': ['Flat']
})

new_patient['Sex'] = le.transform(new_patient['Sex'])
new_patient['ExerciseAngina'] = le.transform(new_patient['ExerciseAngina'])
new_patient = pd.get_dummies(new_patient, columns=['ChestPainType', 'RestingECG', 'ST_Slope'])
new_patient[num_vars] = sc.transform(new_patient[num_vars])

risk = model.predict(new_patient)

ValueError: y contains previously unseen labels: 'M'