In [2]:
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

In [4]:
# Path of the heart.csv dataset (presumably a Google Drive mount in Colab — confirm).
HEART_CSV = '/content/drive/MyDrive/dataset/heart.csv'

# Read the CSV into a DataFrame and preview the first five rows.
df = pd.read_csv(HEART_CSV)
df.head()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [5]:
# Display the (rows, columns) dimensions of the loaded frame.
df.shape

(1025, 14)

In [6]:
# Count missing values per column to confirm no imputation is needed.
df.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [7]:
# Label vector: the 'target' column.
y = df['target']
# Feature matrix: every remaining column (df itself is left untouched).
X = df.drop(columns='target')

In [8]:
# Display the dimensions of the feature matrix (label column removed).
X.shape

(1025, 13)

In [9]:
# List the feature names that remain after dropping 'target'.
X.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal'],
      dtype='object')

In [10]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
# The labels are passed as a flat NumPy array rather than a Series.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y.to_numpy().flatten(),
    test_size=0.25,
    random_state=10,
)

# Show the shapes of the four pieces as the cell output.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((768, 13), (257, 13), (768,), (257,))

In [11]:
# Fit the scaler on the training split only, then apply the same transform to
# the test split so no test-set statistics are used for fitting.
# NOTE: the models later in this notebook are fitted on X_train, not X_train_sc.
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)

In [12]:
# Inspect the first row of the scaler-transformed test features.
X_test_sc[0]

array([ 0.28235435,  0.67011686, -0.92094523,  1.15289824,  0.54169191,
       -0.43457392,  0.89478384, -2.67075748,  1.41008044,  0.12623231,
       -0.60424534,  0.2478956 ,  1.10281856])

In [13]:
# Candidate values for each XGBoost hyperparameter explored by the search.
params = {"learning_rate":[0.05,0.10,0.20,0.25,0.30],
          "max_depth":[3,4,5,6,7,8,10,12,15,18,21,25,30],
          "min_child_weight":[1,3,5,7],
          "gamma":[0.0,0.1,0.2,0.3,0.4]}

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
classifier = xgb.XGBClassifier()

# Sample 5 parameter combinations and score each with 5-fold CV on ROC AUC.
# random_state pins which combinations are sampled; without it every run draws
# different candidates and best_params_ is not reproducible. The seed value
# matches the one used for train_test_split.
random_search = RandomizedSearchCV(classifier, param_distributions=params,
        n_iter=5, scoring='roc_auc', n_jobs=-1, cv=5, verbose=2,
        random_state=10)

# Run the search on the (unscaled) training split; the fitted search object is
# the cell output.
random_search.fit(X_train,y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [14]:
# Show the parameter combination that obtained the best mean CV score in the search.
random_search.best_params_

{'min_child_weight': 1, 'max_depth': 12, 'learning_rate': 0.2, 'gamma': 0.3}

In [26]:
# Final model with every setting spelled out explicitly.
# NOTE(review): these values are hard-coded and differ from the
# random_search.best_params_ shown above (e.g. max_depth, learning_rate, gamma)
# — confirm that ignoring the search result is intended.
classifier = xgb.XGBClassifier(
    # booster / tree construction
    base_score=0.5,
    booster='gbtree',
    tree_method='exact',
    n_estimators=100,
    num_parallel_tree=1,
    max_depth=6,
    min_child_weight=1,
    max_delta_step=0,
    learning_rate=0.1,
    gamma=0.2,
    # column / row sampling
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=0.3,
    subsample=1,
    # regularisation and class weighting
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    # constraints
    interaction_constraints='',
    monotone_constraints='()',
    # runtime / bookkeeping
    gpu_id=-1,
    importance_type='gain',
    n_jobs=0,
    random_state=0,
    validate_parameters=1,
    verbosity=None,
)

In [27]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the configured classifier over the full X, y.
score = cross_val_score(classifier, X, y, cv=10)

# Only the last expression of a cell is auto-displayed; a bare `score.mean()`
# in the middle of the cell is evaluated and silently discarded, so print it.
print("Mean 10-fold CV score: %.4f" % score.mean())

# Fit the model on the training split; the fitted estimator is the cell output.
classifier.fit(X_train,y_train)



In [28]:
# Predict class labels for the held-out (unscaled) test features.
predictions = classifier.predict(X_test)


In [29]:
from sklearn.metrics import accuracy_score

# Share of test rows whose predicted label equals the true label, as a percentage.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

Accuracy: 96.11%


In [31]:
# Persist the fitted classifier to model.json in the working directory so it
# can be reloaded and tested below.
classifier.save_model("model.json")

In [32]:
# Start from a fresh, unfitted classifier ...
model_xgb_2 = xgb.XGBClassifier()

# ... and restore the model previously written to model.json into it.
model_xgb_2.load_model("model.json")

In [33]:
# One sample for sanity-checking the reloaded model.
# The model was fitted on the unscaled X_train, so the sample must hold raw
# feature values. The previous hard-coded list was the standardized row
# X_test_sc[0], which is on a different scale from the training data and made
# the check meaningless. Use the corresponding raw row instead.
pred_list = X_test.iloc[0].tolist()

In [34]:
# The model expects a 2-D (n_samples, n_features) input: turn the flat list
# into a single-row array.
lst_reshaped = np.asarray(pred_list).reshape(1, -1)

In [35]:
# Predict the class of the single sample with the model restored from model.json.
model_xgb_2.predict(lst_reshaped)

array([0])

In [24]:
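# array([0]): class 0 means the patient does not have heart disease (class 1 would indicate heart disease) — confirm the label encoding against the dataset description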