# Stacking

In [8]:
import numpy as np
import pandas as pd

In [9]:
df = pd.read_csv('./heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [10]:
df.shape

(303, 14)

In [13]:
X = df.drop(columns='target')
y = df['target']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier

In [27]:
# we can tune the hyperparameteres of the base estimators here 

np.random.seed(42)

base_estimators = [
    ('dt', DecisionTreeClassifier(max_depth=5)),
    ('svc', SVC()),
    ('rfc', RandomForestClassifier(n_estimators=50, max_depth=5))
]

clf = StackingClassifier(
    estimators=base_estimators,
    final_estimator=LogisticRegression(),
    cv=10    # cv:int is basically the k in k-fold (default splitting method is k-fold)
)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

from sklearn.metrics import recall_score, accuracy_score
print(accuracy_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

0.9016393442622951
0.9375


In [32]:
from sklearn.model_selection import cross_val_score
print(cross_val_score(clf, X, y, scoring='recall', cv=10, n_jobs=-1))
print(np.mean(cross_val_score(clf, X, y, scoring='recall', cv=10, n_jobs=-1)))

[1.         0.82352941 0.88235294 0.88235294 1.         0.75
 0.6875     0.9375     0.875      0.875     ]
0.8775735294117647
