In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

import pickle
import xgboost

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif

In [2]:
# Load the heart-disease dataset.
# Forward slashes resolve on Windows, macOS and Linux alike; the previous
# backslash literal only worked on Windows and depended on the invalid escape
# sequences "\I", "\d" and "\h" being passed through unchanged.
df = pd.read_csv('../Internship/data/heart.csv')

df.head()

FileNotFoundError: [Errno 2] No such file or directory: '..\\Internship\\data\\heart.csv'

In [None]:
# (rows, columns) of the loaded frame
df.shape

In [None]:
# Frequency of each distinct value in the 'thal' column
df['thal'].value_counts()

In [None]:
# Rows whose 'thal' value is 0 are removed from the frame.
rowstodrop = df[df['thal']==0]
# Drop by the labels actually found instead of the hard-coded [48, 281]
# (presumably the thal == 0 rows of the original CSV - verify). This keeps the
# cell correct if the data changes and makes it safe to re-run: a second run
# finds nothing to drop rather than raising KeyError.
df = df.drop(index=rowstodrop.index)

In [None]:
# Frequency of each distinct 'thal' value, to compare against the earlier count
df['thal'].value_counts()

In [None]:
# Summary of the frame: column names, non-null counts and dtypes
df.info()

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# Split the frame into the feature columns and the label vector.
# Selecting 'target' by name avoids relying on it being the last column.
train_df = df.drop(columns='target')
# Series.ravel() is deprecated in recent pandas; to_numpy() yields the same
# 1-D NumPy array of labels.
y = df['target'].to_numpy()

train_df.shape, y.shape

In [None]:
# Pairwise correlation of the feature columns, drawn as an annotated heatmap

corr_mat = train_df.corr()

heatmap_palette = sns.diverging_palette(20, 220, n=200)
plt.figure(figsize=(15, 15))
g = sns.heatmap(
    corr_mat,
    annot=True,
    cmap=heatmap_palette,
)

In [None]:
# find correlated features (this notebook only reports them; nothing is removed here)

# Absolute correlation coefficient above which a column is reported as correlated
threshold = 0.8

def correlation(dataset, threshold):
    """Return the names of columns that correlate strongly with an earlier column.

    The pairwise correlation matrix of ``dataset`` is computed with
    ``dataset.corr()``. A column is reported when the absolute value of its
    coefficient with at least one column positioned before it exceeds
    ``threshold``; the earlier column of such a pair is not reported for
    that pair.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared pairwise.
    threshold : float
        Cut-off applied to the absolute correlation coefficient (strictly
        greater-than).

    Returns
    -------
    set
        Column names flagged as correlated; empty when none exceed the cut-off.
    """
    corr_matrix = dataset.corr()
    flagged = set()
    for position, name in enumerate(corr_matrix.columns):
        # Only the lower triangle is inspected, i.e. coefficients against
        # the columns that come before this one.
        against_earlier = corr_matrix.iloc[position, :position]
        if (against_earlier.abs() > threshold).any():
            flagged.add(name)
    return flagged

# train_df already excludes the target column, so pass it whole: slicing off
# the last column again would leave the final feature out of the check.
correlation(train_df, threshold)

In [None]:
# Rank the features with a chi-squared test against the target.
# NOTE: chi2 only accepts non-negative feature values.

# k='all' scores every column, so the cell no longer breaks when train_df has
# a different number of features than the previously hard-coded 13.
ordered_rank_features = SelectKBest(score_func=chi2, k='all')
ordered_feature = ordered_rank_features.fit(train_df, y)

dfscores = pd.DataFrame(ordered_feature.scores_, columns=["Score"])
dfcolumns = pd.DataFrame(train_df.columns)

# Both frames carry a default 0..n-1 index, so the column-wise concat lines
# each feature name up with its score.
features_rank = pd.concat([dfcolumns, dfscores], axis=1)

features_rank.columns = ['Features','Score']

# Show every scored feature, highest score first
features_rank.nlargest(len(features_rank), 'Score')

In [None]:
# Mutual information between each feature and the target.
# The estimator is randomised, so the seed is fixed (same value as the
# train/test split) to make the ranking reproducible across runs.
mutual_info = mutual_info_classif(train_df, y, random_state=42)
mutual_data = pd.Series(mutual_info, index=train_df.columns)
# Highest-scoring features first
mutual_data.sort_values(ascending=False)

In [None]:
# Subset of feature columns used to build the model input
final_selected_features = [
    'ca',
    'cp',
    'exang',
    'thal',
    'oldpeak',
    'thalach',
    'age',
]

In [None]:
# Keep only the selected feature columns (all rows)
X = train_df.loc[:, final_selected_features]

In [None]:
# (rows, columns) of the selected-feature frame
X.shape

In [None]:
# Hold out 15% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X.values,
    y,
    test_size=0.15,
    random_state=42,
)

In [None]:
# using random forest classifier

# Seeded (same value as the train/test split) so the fitted forest, and the
# accuracy reported for it, are reproducible from run to run.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

In [None]:
# Accuracy of the random forest on the held-out test split, as a percentage
y_preds = rfc.predict(X_test)
rf_accuracy_pct = accuracy_score(y_test, y_preds) * 100
print("Accuracy : {:.2f}%".format(rf_accuracy_pct))

In [None]:
# using xgboost

# hyperparameter optimization

# Candidate values sampled by the randomized search
params = {
    "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth"        : [3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight" : [1, 3, 5, 7],
    "gamma"            : [0.0, 0.1, 0.2 , 0.3, 0.4],
    "colsample_bytree" : [0.3, 0.4, 0.5, 0.7]
}

# Seed the booster so column subsampling is repeatable
clf = xgboost.XGBClassifier(random_state=42)

# Only 5 parameter combinations are drawn, so without a fixed seed the chosen
# "best" model changes from run to run; random_state pins the draw.
random_search = RandomizedSearchCV(
    clf,
    param_distributions=params,
    n_iter=5,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
    verbose=0,
    random_state=42
)

random_search.fit(X_train, y_train)

In [None]:
# Estimator with the best cross-validated score found by the search
random_search.best_estimator_


In [None]:
# The search was created without overriding refit, so it uses the default
# refit=True: best_estimator_ has already been retrained on all of X_train
# with the winning parameters. Fitting it again would only repeat that work.
classifier = random_search.best_estimator_

classifier

In [None]:
# Accuracy of the tuned xgboost model on the held-out test split, as a percentage
y_preds = classifier.predict(X_test)
xgb_accuracy_pct = accuracy_score(y_test, y_preds) * 100
print("Accuracy : {:.2f}%".format(xgb_accuracy_pct))

In [None]:
# 10-fold cross-validation of the tuned model on the training split;
# the printed figure is the mean of the per-fold scores.
score = cross_val_score(
    classifier,
    X_train,
    y_train,
    cv=10,
)
print(np.mean(score))

In [None]:
# Parameters the selected classifier is configured with
classifier.get_params()

In [None]:
# saving trained model
filename = '../Internship/models/heart_disease_model.dat'
# The context manager closes the file handle as soon as the dump finishes
# (or fails), instead of leaving it open until garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [None]:
# First row of df (by position), restricted to the selected feature columns
df.iloc[0][final_selected_features]

In [None]:
# Plot the confusion matrix to understand the classification in detail.
# seaborn and matplotlib are already imported (and inline plotting enabled) in
# the first cell, so only the sklearn metrics are imported here.
from sklearn.metrics import classification_report, confusion_matrix

pred_ls = y_preds
test_ls = y_test
# confusion_matrix(y_true, y_pred): rows are true labels, columns are predictions
conf_arr = confusion_matrix(test_ls, pred_ls)

plt.figure(figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')

ax = sns.heatmap(conf_arr, cmap='Oranges', annot=True, fmt='d', xticklabels=[0,1], yticklabels=[0,1])
plt.xlabel('Prediction')
plt.ylabel('Truth')
# plt.show() does not take an Axes; its only option is the keyword `block`.
# Passing `ax` was either silently misread as another flag or rejected,
# depending on the backend.
plt.show()

In [None]:
# Preview the first rows of the test features rather than dumping the whole array
X_test[:5]