In [1]:
# Importing the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Reading the data
data = pd.read_csv('heart.csv')
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [3]:
#Checking for null values
data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [4]:
#Checking for duplicates
data.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
1020     True
1021     True
1022     True
1023     True
1024     True
Length: 1025, dtype: bool

In [5]:
data[data.duplicated()]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
15,34,0,1,118,210,0,1,192,0,0.7,2,0,2,1
31,50,0,1,120,244,0,1,162,0,1.1,2,0,2,1
43,46,1,0,120,249,0,0,144,0,0.8,2,0,3,0
55,55,1,0,140,217,0,1,111,1,5.6,0,0,3,0
61,66,0,2,146,278,0,0,152,0,0.0,1,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1


In [6]:
data = data.drop_duplicates()

In [7]:
data.shape

(302, 14)

In [8]:
data.target.value_counts()

target
1    164
0    138
Name: count, dtype: int64

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

In [10]:
X = data.drop('target', axis=1)
y = data['target']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [12]:
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [13]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
model = LogisticRegression()

In [15]:
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [16]:
preds = model.predict(X_test)

In [17]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [18]:
accuracy_score(y_test, preds)

0.78

In [19]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.76      0.82      0.79        51
           1       0.80      0.73      0.77        49

    accuracy                           0.78       100
   macro avg       0.78      0.78      0.78       100
weighted avg       0.78      0.78      0.78       100



In [20]:
from sklearn.model_selection import GridSearchCV

In [21]:
param_grid = {
    'C' : [0.01, 0.1, 1, 10],
    'penalty':['l1', 'l2'],
    'solver':['linlinera', 'saga']
}

In [22]:
model = LogisticRegression(max_iter=1000)

In [23]:
grid = GridSearchCV(model, param_grid, cv=5, scoring='f1')

In [24]:
grid.fit(X_train, y_train)

0,1,2
,estimator,LogisticRegre...max_iter=1000)
,param_grid,"{'C': [0.01, 0.1, ...], 'penalty': ['l1', 'l2'], 'solver': ['linlinera', 'saga']}"
,scoring,'f1'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'saga'
,max_iter,1000


In [25]:
print('Best Parameters', grid.best_params_)
print('Best F1 score', grid.best_score_)

Best Parameters {'C': 1, 'penalty': 'l2', 'solver': 'saga'}
Best F1 score 0.8514581105169341


In [26]:
predss = grid.predict(X_test)

In [27]:
accuracy_score(y_test, predss)

0.78

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
param_grid = {
    'n_estimators' :[100, 150, 200],
    'max_depth':[None, 5, 10],
    'min_samples_split':[2, 5],
    'min_samples_leaf':[1, 2],
    'max_features':['auto', 'sqrt']
}

In [30]:
model = RandomForestClassifier(random_state=42)

In [31]:
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1)

In [32]:
grid_search.fit(X_train, y_train)

0,1,2
,estimator,RandomForestC...ndom_state=42)
,param_grid,"{'max_depth': [None, 5, ...], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [1, 2], 'min_samples_split': [2, 5], ...}"
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [33]:
print('Best params:',grid_search.best_params_)
print('Best score: ', grid_search.best_score_)

Best params: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best score:  0.8425120182460336


In [34]:
preds = grid_search.predict(X_test)

In [35]:
accuracy_score(y_test, preds)

0.78

In [36]:
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [37]:
for col in data.columns:
    print(f'------------{col}---------------')
    print(data[col].unique())

------------age---------------
[52 53 70 61 62 58 55 46 54 71 43 34 51 50 60 67 45 63 42 44 56 57 59 64
 65 41 66 38 49 48 29 37 47 68 76 40 39 77 69 35 74]
------------sex---------------
[1 0]
------------cp---------------
[0 1 2 3]
------------trestbps---------------
[125 140 145 148 138 100 114 160 120 122 112 132 118 128 124 106 104 135
 130 136 180 129 150 178 146 117 152 154 170 134 174 144 108 123 110 142
 126 192 115  94 200 165 102 105 155 172 164 156 101]
------------chol---------------
[212 203 174 294 248 318 289 249 286 149 341 210 298 204 308 266 244 211
 185 223 208 252 209 307 233 319 256 327 169 131 269 196 231 213 271 263
 229 360 258 330 342 226 228 278 230 283 241 175 188 217 193 245 232 299
 288 197 315 215 164 326 207 177 257 255 187 201 220 268 267 236 303 282
 126 309 186 275 281 206 335 218 254 295 417 260 240 302 192 225 325 235
 274 234 182 167 172 321 300 199 564 157 304 222 184 354 160 247 239 246
 409 293 180 250 221 200 227 243 311 261 242 205 306 219 353

In [38]:
bins = [0, 40, 50, 60, 100]
labels = ['Young', 'Middle_age_1', 'Middle_age_2', 'Senior']
data['age_group'] = pd.cut(data['age'], bins=bins, labels=labels)

In [39]:
data.head(1)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,age_group
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0,Middle_age_2


In [40]:
bins = [0, 200, 239, 1000]
labels = ['Desirable', 'Borderline', 'High']
data['chol_category'] = pd.cut(data['chol'], bins=bins, labels=labels)

In [41]:
data.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target', 'age_group',
       'chol_category'],
      dtype='object')

In [42]:
data.head(1)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,age_group,chol_category
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0,Middle_age_2,Borderline


In [43]:
data = pd.get_dummies(data, columns=['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'age_group', 'chol_category'], drop_first=True)

In [44]:
data.head(1)

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,target,sex_1,cp_1,cp_2,cp_3,...,ca_3,ca_4,thal_1,thal_2,thal_3,age_group_Middle_age_1,age_group_Middle_age_2,age_group_Senior,chol_category_Borderline,chol_category_High
0,52,125,212,168,1.0,0,True,False,False,False,...,False,False,False,False,True,False,True,False,True,False


In [45]:
data = data.astype(int)

In [46]:
X = data.drop('target', axis=1)
y = data['target']

In [47]:
X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=43)

In [48]:
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [49]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [50]:
param_grid = {
    'C' : [0.01, 0.1, 1, 10],
    'penalty':['l1', 'l2'],
    'solver':['linlinera', 'saga']
}

In [51]:
model = LogisticRegression(max_iter=1000)

In [57]:
grid = GridSearchCV(model, param_grid, cv=100, scoring='f1')

In [58]:
grid.fit(X_train, y_train)

0,1,2
,estimator,LogisticRegre...max_iter=1000)
,param_grid,"{'C': [0.01, 0.1, ...], 'penalty': ['l1', 'l2'], 'solver': ['linlinera', 'saga']}"
,scoring,'f1'
,n_jobs,
,refit,True
,cv,100
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.1
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'saga'
,max_iter,1000


In [59]:
print('best_parameters', grid.best_params_)

best_parameters {'C': 0.1, 'penalty': 'l2', 'solver': 'saga'}


In [60]:
pred = grid.predict(X_test)

In [61]:
accuracy_score(y_test, pred)

0.82