In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score

In [5]:
df = pd.read_csv('/content/heart.csv')
print("Shape of the Dataframe: ", df.shape)
df.head()

Shape of the Dataframe:  (303, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [8]:
X = df.iloc[:,0:-1]
y = df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
print("Shape of X: ", X.shape)
print("Shape of y: ", y.shape)
print("Shape of X_train: ", X_train.shape)
print("Shape of y_train: ", y_train.shape)
print("Shape of X_test: ", X_test.shape)
print("Shape of y_test: ", y_test.shape)

Shape of X:  (303, 13)
Shape of y:  (303,)
Shape of X_train:  (242, 13)
Shape of y_train:  (242,)
Shape of X_test:  (61, 13)
Shape of y_test:  (61,)


In [10]:
random_forest = RandomForestClassifier()
gradient_boosting = GradientBoostingClassifier()
svc = SVC()
logistic_regression = LogisticRegression()

random_forest.fit(X_train, y_train)
gradient_boosting.fit(X_train, y_train)
svc.fit(X_train, y_train)
logistic_regression.fit(X_train, y_train)

y_pred_random_forest = random_forest.predict(X_test)
y_pred_gradient_boosting = gradient_boosting.predict(X_test)
y_pred_svc = svc.predict(X_test)
y_pred_logistic_regression = logistic_regression.predict(X_test)

print("Accuracy of Random Forest: ", accuracy_score(y_test, y_pred_random_forest))
print("Accuracy of Gradient Boosting: ", accuracy_score(y_test, y_pred_gradient_boosting))
print("Accuracy of SVC: ", accuracy_score(y_test, y_pred_svc))
print("Accuracy of Logistic Regression: ", accuracy_score(y_test, y_pred_logistic_regression))

Accuracy of Random Forest:  0.8524590163934426
Accuracy of Gradient Boosting:  0.7704918032786885
Accuracy of SVC:  0.7049180327868853
Accuracy of Logistic Regression:  0.8852459016393442


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
random_forest = RandomForestClassifier(max_samples=0.75,random_state=42)
random_forest.fit(X_train,y_train)
y_pred_random_forest = random_forest.predict(X_test)
print("Accuracy of the Random Forest: ", accuracy_score(y_test,y_pred_random_forest))

Accuracy of the Random Forest:  0.9016393442622951


In [15]:
np.mean(cross_val_score(RandomForestClassifier(max_samples=0.75),X,y,cv=10,scoring='accuracy'))

0.8118279569892474

## GridSearchCV

In [19]:
# Number of trees in random forest
n_estimators = [20,60,100,120] # 4

# Number of features to consider at every split
max_features = [0.2,0.6,1.0] # 3

# Maximum number of levels in tree
max_depth = [2,8,None] # 3

# Number of samples
max_samples = [0.5,0.75,1.0] # 3

# 4 * 3 * 3 * 3 = 4 * 27 = 108

# 108 diff random forest train

In [20]:
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
              'max_samples':max_samples
             }
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0]}


In [21]:
random_forest = RandomForestClassifier()

random_forest_grid = GridSearchCV(
    estimator = random_forest,
    param_grid = param_grid,
    cv = 5,
    verbose=2,
    n_jobs = -1
)

random_forest_grid.fit(X_train,y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [22]:
print("Best Parameters: ", random_forest_grid.best_params_)

Best Parameters:  {'max_depth': 2, 'max_features': 0.2, 'max_samples': 1.0, 'n_estimators': 120}


In [23]:
print("Best Scores: ", random_forest_grid.best_score_)

Best Scores:  0.8347789115646259


## RandomizedSearchCV

In [24]:
# Number of trees in random forest
n_estimators = [20,60,100,120]

# Number of features to consider at every split
max_features = [0.2,0.6,1.0]

# Maximum number of levels in tree
max_depth = [2,8,None]

# Number of samples
max_samples = [0.5,0.75,1.0]

# Bootstrap samples
bootstrap = [True,False]

# Minimum number of samples required to split a node
min_samples_split = [2, 5]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]

In [25]:
param_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'max_samples':max_samples,
    'bootstrap':bootstrap,
    'min_samples_split':min_samples_split,
    'min_samples_leaf':min_samples_leaf
}
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0], 'bootstrap': [True, False], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2]}


In [27]:
random_forest_grid = RandomizedSearchCV(
    estimator = random_forest,
    param_distributions = param_grid,
    cv = 5,
    verbose=2,
    n_jobs = -1
)

random_forest_grid.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


40 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py", line 431, in fit
    raise ValueError(
ValueError: `max_sample` cannot be set if `bootstrap=False`. Either switch to `bootstrap=True` or set `max_sample=None`.

        nan        nan 0.80144558 0.830272

In [28]:
print("Best Parameters: ", random_forest_grid.best_params_)

Best Parameters:  {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_samples': 1.0, 'max_features': 0.2, 'max_depth': 2, 'bootstrap': True}


In [29]:
print("Best Scores: ", random_forest_grid.best_score_)

Best Scores:  0.8302721088435374
