### Import libraries 

In [2]:
pip install xgboost

Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/28/3c/ddf5d9eb742cdb7fbcd5c854bce07471bad01194ac37de91db64fbef0c58/xgboost-2.1.3-py3-none-macosx_12_0_arm64.whl.metadata
  Downloading xgboost-2.1.3-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.3-py3-none-macosx_12_0_arm64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.1.3
Note: you may need to restart the kernel to use updated packages.


In [1]:
import numpy as np
import pandas as pd

#for visualization
import matplotlib.pyplot as plt

#scikit Learn for model
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

RANDOM_STATE = 55

#### Import Datasets

In [2]:
heart_data = pd.read_csv('heart.csv')

In [3]:
heart_data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [4]:
x_train= heart_data.drop(columns=['HeartDisease'])

print(f"X_training Data: \n{x_train.head()}")

y_train = heart_data['HeartDisease']

print(f"y_train data: \n{y_train}")

X_training Data: 
   Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  MaxHR  \
0   40   M           ATA        140          289          0     Normal    172   
1   49   F           NAP        160          180          0     Normal    156   
2   37   M           ATA        130          283          0         ST     98   
3   48   F           ASY        138          214          0     Normal    108   
4   54   M           NAP        150          195          0     Normal    122   

  ExerciseAngina  Oldpeak ST_Slope  
0              N      0.0       Up  
1              N      1.0     Flat  
2              N      0.0       Up  
3              Y      1.5     Flat  
4              N      0.0       Up  
y_train data: 
0      0
1      1
2      0
3      1
4      0
      ..
913    1
914    1
915    1
916    1
917    0
Name: HeartDisease, Length: 918, dtype: int64


In [16]:
print(f"x_train shape {x_train.shape}\ny_train shape {y_train.shape}")

x_train shape (918, 11)
y_train shape (918,)


### One-hot encoding for categorical values

In [4]:
#one built in function for one-hot encoding of categorical features in pandas is pd.get_dummies

#categorical_features
categorical_features = [x for x in heart_data.columns if heart_data[x].dtype == 'object']

In [5]:
print(categorical_features)

['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']


In [6]:
#ond hot encoding
df = pd.get_dummies(data = heart_data,
                    prefix = categorical_features, 
                    columns = categorical_features)

In [7]:
df.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_F,Sex_M,ChestPainType_ASY,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,False,True,False,...,False,False,False,True,False,True,False,False,False,True
1,49,160,180,0,156,1.0,1,True,False,False,...,True,False,False,True,False,True,False,False,True,False
2,37,130,283,0,98,0.0,0,False,True,False,...,False,False,False,False,True,True,False,False,False,True
3,48,138,214,0,108,1.5,1,True,False,True,...,False,False,False,True,False,False,True,False,True,False
4,54,150,195,0,122,0.0,0,False,True,False,...,True,False,False,True,False,True,False,False,False,True


In [8]:
#convert the true and False into 0 and 1
df1 = pd.get_dummies(df).astype(int)

In [9]:
df.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_F,Sex_M,ChestPainType_ASY,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,False,True,False,...,False,False,False,True,False,True,False,False,False,True
1,49,160,180,0,156,1.0,1,True,False,False,...,True,False,False,True,False,True,False,False,True,False
2,37,130,283,0,98,0.0,0,False,True,False,...,False,False,False,False,True,True,False,False,False,True
3,48,138,214,0,108,1.5,1,True,False,True,...,False,False,False,True,False,False,True,False,True,False
4,54,150,195,0,122,0.0,0,False,True,False,...,True,False,False,True,False,True,False,False,False,True


In [10]:
df.shape

(918, 21)

In [52]:
features = [x for x in df.columns if x not in 'HeartDisease'] ## Removing our target variable

In [54]:
print(len(features))
print(features)

20
['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak', 'Sex_F', 'Sex_M', 'ChestPainType_ASY', 'ChestPainType_ATA', 'ChestPainType_NAP', 'ChestPainType_TA', 'RestingECG_LVH', 'RestingECG_Normal', 'RestingECG_ST', 'ExerciseAngina_N', 'ExerciseAngina_Y', 'ST_Slope_Down', 'ST_Slope_Flat', 'ST_Slope_Up']


### Splitting the dataset

In [11]:
x= df.drop(columns = ['HeartDisease'])
y = df['HeartDisease']

In [12]:
print(x.shape)

(918, 20)


In [15]:
x_train, y_train, x_,  y_ = train_test_split(x, y , test_size=0.6, random_state=RANDOM_STATE)

x_cv,  y_cv, x_test, y_test = train_test_split(x_, y_, test_size= 0.4, random_state= RANDOM_STATE)

del x_, y_

ValueError: Found input variables with inconsistent numbers of samples: [367, 551]

In [16]:
from sklearn.model_selection import train_test_split

# First split: train and remaining data
x_train, x_, y_train, y_ = train_test_split(x, y, test_size=0.6, random_state=RANDOM_STATE)

# Second split: cross-validation and test sets from the remaining data
x_cv, x_test, y_cv, y_test = train_test_split(x_, y_, test_size=0.4, random_state=RANDOM_STATE)

# Delete intermediate variables
del x_, y_
