In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score,train_test_split,RandomizedSearchCV,GridSearchCV

from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [3]:
df.shape

(1025, 14)

In [4]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
print(X_train.shape,X_test.shape)

(820, 13) (205, 13)


In [6]:
rf = RandomForestClassifier()
gbm = GradientBoostingClassifier()
svm = SVC()
lr = LogisticRegression()

rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test,y_pred)

0.9853658536585366

In [7]:
gbm.fit(X_train,y_train)
y_pred = gbm.predict(X_test)
accuracy_score(y_test,y_pred)

0.9317073170731708

In [8]:
svm.fit(X_train,y_train)
y_pred = svm.predict(X_test)
accuracy_score(y_test,y_pred)

0.6829268292682927

In [9]:
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)
accuracy_score(y_test,y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7853658536585366

In [10]:
np.mean(cross_val_score(RandomForestClassifier(),X,y,cv=10,scoring='accuracy'))

0.9970588235294118

# GridSearchCV

In [11]:
param = {
    'n_estimators': [50,75,100,125],
    'max_features': ['auto','log2',0.5,0.6,1.0],
    'max_depth' : [2,8,None],
    'max_samples': [0.5,0.75,1.0],
    'bootstrap': [True,False],
    'criterion':['gini','entropy'],
    'min_samples_leaf':[1,2,3,5],
    'min_samples_split':[1,2,3,5]
    }

In [12]:
grid = GridSearchCV(RandomForestClassifier(), param_grid = param,cv = 10, verbose=1,  n_jobs = -1)
grid.fit(X_train,y_train)
model = grid.best_estimator_
y_pred = model.predict(X_test)
print(f'Accuracy Score {accuracy_score(y_test,y_pred)}')

Fitting 10 folds for each of 11520 candidates, totalling 115200 fits


72000 fits failed out of a total of 115200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
14400 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Nihar\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Nihar\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 450, in fit
    trees = Parallel(
  File "C:\Users\Nihar\anaconda3\lib\site-packages\joblib\parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "C:\Users\Nihar\anaconda3\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\Ni

Accuracy Score 0.9853658536585366


In [13]:
grid.best_params_

{'bootstrap': True,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'log2',
 'max_samples': 1.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 125}

In [14]:
grid.best_score_

0.9939024390243901

In [15]:
print(grid.best_estimator_)

RandomForestClassifier(max_features='log2', max_samples=1.0, n_estimators=125)


# RandomizedSearchCV

In [17]:
random = RandomizedSearchCV(RandomForestClassifier(),param,cv=10,verbose=1,n_jobs=-1) #The base estimator to fit on random subsets of the dataset. If None, then the base estimator is a DecisionTreeClassifier.
random.fit(X_train,y_train)
model = random.best_estimator_
y_pred = model.predict(X_test)
print(f'Accuracy Score {accuracy_score(y_test,y_pred)}')

Fitting 10 folds for each of 10 candidates, totalling 100 fits




Accuracy Score 0.9707317073170731


40 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Nihar\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Nihar\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 379, in fit
    raise ValueError(
ValueError: `max_sample` cannot be set if `bootstrap=False`. Either switch to `bootstrap=True` or set `max_sample=None`.

 0.97682927        nan        nan        nan]
