### **RANDOM FOREST FOR CLASSIFICATION**

In [2]:
%pip install tqdm
import pandas as pd
import pylab as plt
import numpy as np
import scipy.optimize as opt
from sklearn import preprocessing
%matplotlib inline 
import matplotlib.pyplot as plt
from sklearn import metrics
from tqdm import tqdm

Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
# calculate the accuracy of the training and testing data given a model.

def get_accuracy(X_train, X_test, y_train, y_test, model):
    return  {"test Accuracy":metrics.accuracy_score(y_test, model.predict(X_test)),"trian Accuracy": metrics.accuracy_score(y_train, model.predict(X_train))}

In [5]:
# calculates the average correlation between predictors 
# and displays the pairwise correlation between predictors.

def get_correlation(X_test, y_test,models):
    #This function calculates the average correlation between predictors  
    n_estimators=len(models.estimators_)
    prediction=np.zeros((y_test.shape[0],n_estimators))
    predictions=pd.DataFrame({'estimator '+str(n+1):[] for n in range(n_estimators)})
    
    for key,model in zip(predictions.keys(),models.estimators_):
        predictions[key]=model.predict(X_test.to_numpy())
    
    corr=predictions.corr()
    print("Average correlation between predictors: ", corr.mean().mean()-1/n_estimators)
    return corr

* Random Forest is similar to Bagging in that it uses model ensembles to make predictions. Unlike Bagging, when you add more models, RF does not suffer from Overfitting. 

In [12]:
import pandas as pd
import requests, io, certifi

url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%203/data/ChurnData.csv"

resp = requests.get(url, timeout=30, verify=certifi.where())  # uses up-to-date CA bundle
resp.raise_for_status()  # will raise if HTTP error
churn_df = pd.read_csv(io.StringIO(resp.text))
churn_df.head()

Unnamed: 0,tenure,age,address,income,ed,employ,equip,callcard,wireless,longmon,...,pager,internet,callwait,confer,ebill,loglong,logtoll,lninc,custcat,churn
0,11.0,33.0,7.0,136.0,5.0,5.0,0.0,1.0,1.0,4.4,...,1.0,0.0,1.0,1.0,0.0,1.482,3.033,4.913,4.0,1.0
1,33.0,33.0,12.0,33.0,2.0,0.0,0.0,0.0,0.0,9.45,...,0.0,0.0,0.0,0.0,0.0,2.246,3.24,3.497,1.0,1.0
2,23.0,30.0,9.0,30.0,1.0,2.0,0.0,0.0,0.0,6.3,...,0.0,0.0,0.0,1.0,0.0,1.841,3.24,3.401,3.0,0.0
3,38.0,35.0,5.0,76.0,2.0,10.0,1.0,1.0,1.0,6.05,...,1.0,1.0,1.0,1.0,1.0,1.8,3.807,4.331,4.0,0.0
4,7.0,35.0,14.0,80.0,2.0,15.0,0.0,1.0,0.0,7.1,...,0.0,0.0,1.0,1.0,0.0,1.96,3.091,4.382,3.0,0.0


#### Data pre-processing and feature selection

Let's select some features for the modeling. Also, we change the target data type to be an integer, as it is a requirement by the skitlearn algorithm:

In [13]:
churn_df = churn_df[['tenure', 'age', 'address', 'income', 'ed', 'employ', 'equip', 'callcard', 'wireless','churn']]
churn_df['churn'] = churn_df['churn'].astype('int')
churn_df.head()

Unnamed: 0,tenure,age,address,income,ed,employ,equip,callcard,wireless,churn
0,11.0,33.0,7.0,136.0,5.0,5.0,0.0,1.0,1.0,1
1,33.0,33.0,12.0,33.0,2.0,0.0,0.0,0.0,0.0,1
2,23.0,30.0,9.0,30.0,1.0,2.0,0.0,0.0,0.0,0
3,38.0,35.0,5.0,76.0,2.0,10.0,1.0,1.0,1.0,0
4,7.0,35.0,14.0,80.0,2.0,15.0,0.0,1.0,0.0,0


#### Bootstrap Sampling

Bootstrap Sampling is a method that involves drawing sample data repeatedly with replacement from a data source to estimate a model parameter. Scikit-learn has methods for Bagging, but it is helpful to understand Bootstrap Sampling. We will import "resample".

In [14]:
from sklearn.utils import resample

In [15]:
# consider the first five rows of data

churn_df[0:5]

Unnamed: 0,tenure,age,address,income,ed,employ,equip,callcard,wireless,churn
0,11.0,33.0,7.0,136.0,5.0,5.0,0.0,1.0,1.0,1
1,33.0,33.0,12.0,33.0,2.0,0.0,0.0,0.0,0.0,1
2,23.0,30.0,9.0,30.0,1.0,2.0,0.0,0.0,0.0,0
3,38.0,35.0,5.0,76.0,2.0,10.0,1.0,1.0,1.0,0
4,7.0,35.0,14.0,80.0,2.0,15.0,0.0,1.0,0.0,0


* We can perform a bootstrap sample using the function "resample"; we see the dataset is of the same size(5), but some rows are repeated:

In [16]:
for n in range(5):
    print(resample(churn_df[0:5]))

   tenure   age  address  income   ed  employ  equip  callcard  wireless  \
4     7.0  35.0     14.0    80.0  2.0    15.0    0.0       1.0       0.0   
0    11.0  33.0      7.0   136.0  5.0     5.0    0.0       1.0       1.0   
3    38.0  35.0      5.0    76.0  2.0    10.0    1.0       1.0       1.0   
0    11.0  33.0      7.0   136.0  5.0     5.0    0.0       1.0       1.0   
1    33.0  33.0     12.0    33.0  2.0     0.0    0.0       0.0       0.0   

   churn  
4      0  
0      1  
3      0  
0      1  
1      1  
   tenure   age  address  income   ed  employ  equip  callcard  wireless  \
1    33.0  33.0     12.0    33.0  2.0     0.0    0.0       0.0       0.0   
4     7.0  35.0     14.0    80.0  2.0    15.0    0.0       1.0       0.0   
3    38.0  35.0      5.0    76.0  2.0    10.0    1.0       1.0       1.0   
3    38.0  35.0      5.0    76.0  2.0    10.0    1.0       1.0       1.0   
1    33.0  33.0     12.0    33.0  2.0     0.0    0.0       0.0       0.0   

   churn  
1      1 

#### Select Variables at Random

Like Bagging, Random Forest uses an independent bootstrap sample from the training data. In addition, we select $m$ variables at random out of all $M$ possible
variables.

In [17]:
X=churn_df[['tenure', 'age', 'address', 'income', 'ed', 'employ', 'equip']]

In [18]:
# there are 7 features:

M=X.shape[1]
M

7

In [19]:
# Let us select  𝑚=3, and randomly sample features 
# from the 5 Bootstrap Samples from above.

m=3

In [23]:
# list out the index of the features

feature_index=range(M)
feature_index

range(0, 7)

In [24]:
# use the function to sample to randomly select indexes

import random
random.sample(feature_index,m)

[4, 6, 5]

In [25]:
# We now randomly select features from the bootstrap samples 
# to randomly select a subset of features for each node to split on.

for n in range(5):
    print("sample {}".format(n))
    print(resample(X[0:5]).iloc[:, random.sample(feature_index,m)])

sample 0
   tenure   age   ed
3    38.0  35.0  2.0
0    11.0  33.0  5.0
3    38.0  35.0  2.0
4     7.0  35.0  2.0
3    38.0  35.0  2.0
sample 1
   tenure  income   ed
1    33.0    33.0  2.0
4     7.0    80.0  2.0
3    38.0    76.0  2.0
0    11.0   136.0  5.0
1    33.0    33.0  2.0
sample 2
    age  income  employ
4  35.0    80.0    15.0
2  30.0    30.0     2.0
4  35.0    80.0    15.0
3  35.0    76.0    10.0
0  33.0   136.0     5.0
sample 3
   income  employ  tenure
0   136.0     5.0    11.0
4    80.0    15.0     7.0
1    33.0     0.0    33.0
3    76.0    10.0    38.0
2    30.0     2.0    23.0
sample 4
   address  equip   ed
4     14.0    0.0  2.0
2      9.0    0.0  1.0
4     14.0    0.0  2.0
2      9.0    0.0  1.0
1     12.0    0.0  2.0


##### **In Random Forest, we would use these data subsets to train each node of a tree.**

In [26]:
# define X and y for the dataset

y = churn_df['churn']
y.head()

0    1
1    1
2    0
3    0
4    0
Name: churn, dtype: int64

In [27]:
# split our dataset into train and test set

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.3, random_state=1)
print ('Train set', X_train.shape,  y_train.shape)
print ('Test set', X_test.shape,  y_test.shape)

Train set (140, 7) (140,)
Test set (60, 7) (60,)


#### Bagging Review

Bagging improves models that suffer from overfitting; they do well on the training data, but they do not generalize well to unseen data. Decision Trees are a prime candidate for this reason. In addition, they are fast to train; We create a <code>BaggingClassifier</code> object,  with a Decision Tree as the <code>base_estimator</code>.

In [28]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [30]:
n_estimators=20
Bag = BaggingClassifier(estimator=DecisionTreeClassifier(criterion="entropy", max_depth=4, random_state=2), n_estimators=n_estimators, random_state=0, bootstrap=True)

In [31]:
# fit the model

Bag.fit(X_train,y_train)

0,1,2
,estimator,DecisionTreeC...andom_state=2)
,n_estimators,20
,max_samples,1.0
,max_features,1.0
,bootstrap,True
,bootstrap_features,False
,oob_score,False
,warm_start,False
,n_jobs,
,random_state,0

0,1,2
,criterion,'entropy'
,splitter,'best'
,max_depth,4
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,2
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [32]:
# The method predict aggregates the predictions by voting:

Bag.predict(X_test).shape

(60,)

In [37]:
# We see the training accuracy is slightly better but the test accuracy improves dramatically
from sklearn import metrics
print(get_accuracy(X_train, X_test, y_train, y_test,  Bag))

{'test Accuracy': 0.7333333333333333, 'trian Accuracy': 0.9071428571428571}


Each tree is similar; we can see this by plotting the correlation between each tree and the average correlation.

In [45]:
import numpy as np
import pandas as pd

def get_correlation(X, y, bag_model):
    """
    Returns a DataFrame of pairwise correlations between the base estimators'
    predictions on X, and prints the average off-diagonal correlation.
    Works for BaggingClassifier/Regressor (after .fit()).
    """
    # Collect per-estimator predictions
    preds = []
    for est in bag_model.estimators_:
        p = est.predict(X)
        # ensure 1-D numpy array
        p = np.asarray(p).ravel()
        preds.append(p)

    # Shape: (n_samples, n_estimators)
    P = np.column_stack(preds)

    # If labels are non-numeric (e.g., strings), encode to integers
    if not np.issubdtype(P.dtype, np.number):
        P = np.column_stack([pd.Categorical(col).codes for col in P.T]).astype(int)

    # Correlation matrix across estimators
    col_names = [f"estimator {i+1}" for i in range(P.shape[1])]
    corr_df = pd.DataFrame(P, columns=col_names).corr()

    # Average off-diagonal correlation
    upper = corr_df.where(np.triu(np.ones(corr_df.shape), k=1).astype(bool))
    avg_corr = upper.stack().mean()
    print(f"Average correlation between predictors:  {avg_corr}")

    return corr_df

In [47]:
# If jinja2 is installed:
get_correlation(X_test, y_test, Bag).style.background_gradient(cmap='coolwarm')

Average correlation between predictors:  0.26737548986813015




Unnamed: 0,estimator 1,estimator 2,estimator 3,estimator 4,estimator 5,estimator 6,estimator 7,estimator 8,estimator 9,estimator 10,estimator 11,estimator 12,estimator 13,estimator 14,estimator 15,estimator 16,estimator 17,estimator 18,estimator 19,estimator 20
estimator 1,1.0,-0.057709,0.152641,0.132379,0.068323,0.195047,0.209679,0.256111,0.177811,0.318511,-0.024845,0.318511,0.209679,0.112611,0.294475,-0.035245,0.161491,0.161491,0.236433,0.015456
estimator 2,-0.057709,1.0,-0.002979,0.335171,0.349647,0.121829,-0.078409,0.013546,0.180022,0.223814,0.451486,-0.074605,-0.078409,0.404443,0.24658,0.481571,0.04413,0.04413,0.215365,-0.059131
estimator 3,0.152641,-0.002979,1.0,0.395985,-0.010903,0.342381,0.455239,0.674356,0.442603,0.359425,-0.092675,0.51917,0.552099,0.296511,0.32485,0.216541,0.561502,0.47973,0.415029,0.006783
estimator 4,0.132379,0.335171,0.395985,1.0,0.456572,0.242393,0.436809,0.427623,0.417131,0.494783,0.051331,0.415618,0.340807,0.405843,0.224442,0.199294,0.375523,0.294475,0.445634,0.19496
estimator 5,0.068323,0.349647,-0.010903,0.456572,1.0,0.362231,-0.011036,0.090878,0.002915,0.409514,0.347826,-0.045502,0.099322,0.434355,0.294475,0.387699,0.068323,0.161491,0.315244,-0.100465
estimator 6,0.195047,0.121829,0.342381,0.242393,0.362231,1.0,0.19803,0.370625,0.183073,0.163299,0.195047,0.244949,0.19803,0.505181,0.605983,0.158114,0.529414,0.195047,0.494975,-0.069338
estimator 7,0.209679,-0.078409,0.455239,0.436809,-0.011036,0.19803,1.0,0.474619,0.564524,0.404226,-0.121393,0.619813,0.738562,0.323942,0.148803,0.062622,0.540752,0.430394,0.140028,0.247156
estimator 8,0.256111,0.013546,0.674356,0.427623,0.090878,0.370625,0.474619,1.0,0.546688,0.464008,-0.074355,0.625402,0.474619,0.256776,0.283884,0.140642,0.669193,0.50396,0.454257,0.020559
estimator 9,0.177811,0.180022,0.442603,0.417131,0.002915,0.183073,0.564524,0.546688,1.0,0.405727,-0.084533,0.491144,0.357359,0.241594,0.188913,0.314275,0.352707,0.177811,0.332877,0.07979
estimator 10,0.318511,0.223814,0.359425,0.494783,0.409514,0.163299,0.404226,0.464008,0.405727,1.0,0.318511,0.466667,0.404226,0.392837,0.178122,0.464758,0.318511,0.318511,0.50037,0.113228


##### RANDOM FOREST

Random forests are a combination of trees such that each tree depends on a random subset of the features and data. As a result, each tree in the forest is different and usually performs better than Bagging. The most important parameters are the number of trees and the number of features to sample.

In [48]:
from sklearn.ensemble import RandomForestClassifier

Like Bagging, increasing the number of trees improves results and does not lead to overfitting in most cases; but the improvements plateau as you add more trees. For this example, the number of trees in the forest (default=100):

In [49]:
n_estimators = 20

<code>max_features </code>   $m$ the number of features to consider when looking for the best split. If we have M features denoted by:

In [50]:
M_features=X.shape[1]

If we have M features, a popular method to determine m is to use the square root of M.

$m= floor(\sqrt{M}) $

In [51]:
max_features=round(np.sqrt(M_features))-1
max_features

2

In [52]:
y_test

58     1
40     0
34     0
102    0
184    0
198    1
95     1
4      0
29     0
168    0
171    1
18     0
11     0
89     0
110    0
118    0
159    1
35     0
136    1
59     0
51     0
16     0
44     0
94     0
31     0
162    0
38     0
28     0
193    0
27     0
47     0
165    0
194    0
177    0
176    0
97     1
174    1
73     0
69     0
172    1
108    0
107    1
189    0
14     0
56     0
19     1
114    0
39     0
185    0
124    0
98     0
123    1
119    0
53     0
33     1
179    0
181    0
106    0
199    0
138    1
Name: churn, dtype: int64

In [53]:
# We use floor to make sure $m$ is an integer:

# create the Random Forest object

model = RandomForestClassifier( max_features=max_features,n_estimators=n_estimators, random_state=0)

In [54]:
model.fit(X_train, y_train)

0,1,2
,n_estimators,20
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,2
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [55]:
# We obtain the training and testing accuracy; we see that RF does better than Bagging:
print(get_accuracy(X_train, X_test, y_train, y_test, model))

{'test Accuracy': 0.8, 'trian Accuracy': 0.9857142857142858}


In [56]:
# We see that each tree in RF is less correlated with the other trees than Bagging:

get_correlation(X_test, y_test,model).style.background_gradient(cmap='coolwarm')

Average correlation between predictors:  0.22458083493677547




Unnamed: 0,estimator 1,estimator 2,estimator 3,estimator 4,estimator 5,estimator 6,estimator 7,estimator 8,estimator 9,estimator 10,estimator 11,estimator 12,estimator 13,estimator 14,estimator 15,estimator 16,estimator 17,estimator 18,estimator 19,estimator 20
estimator 1,1.0,0.071067,0.339993,0.126674,0.169675,-0.052307,0.308607,0.222375,0.242393,-0.0,0.154303,0.195047,0.385758,0.0533,0.19803,0.122279,0.077152,0.385758,0.25,0.163299
estimator 2,0.071067,1.0,0.174712,0.104427,0.182597,0.100366,0.358218,0.108868,0.113692,-0.031607,0.212007,0.213862,0.285112,0.363636,0.182953,0.493591,0.212007,0.065795,0.355335,0.09671
estimator 3,0.339993,0.174712,1.0,0.287563,0.188913,0.015048,0.234061,0.314055,0.188913,0.158966,0.314772,0.440155,0.234061,0.289947,0.357359,0.212347,0.314772,0.314772,0.261533,0.234895
estimator 4,0.126674,0.104427,0.287563,1.0,0.42127,0.208052,0.336194,0.326761,0.200195,0.401878,0.179825,0.217425,0.258009,0.318682,0.215732,0.299877,0.179825,0.10164,0.278682,0.103429
estimator 5,0.169675,0.182597,0.188913,0.42127,1.0,0.341058,0.231893,0.283884,0.153937,0.140145,0.082285,0.213427,0.082285,0.299735,0.244805,0.18258,0.00748,0.157089,0.387829,0.178122
estimator 6,-0.052307,0.100366,0.015048,0.208052,0.341058,1.0,-0.008071,0.081422,0.112841,0.158966,-0.088782,0.440155,0.07264,0.373585,0.253777,0.058843,0.153351,0.153351,0.183073,0.149478
estimator 7,0.308607,0.358218,0.234061,0.336194,0.231893,-0.008071,1.0,0.282131,0.231893,0.358382,0.285714,0.326762,0.365079,0.263181,0.437978,0.332078,0.047619,0.126984,0.231455,0.125988
estimator 8,0.222375,0.108868,0.314055,0.326761,0.283884,0.081422,0.282131,1.0,0.212015,0.340659,0.205879,0.338727,0.358382,0.189642,0.2789,0.094265,0.205879,0.510885,0.2965,-0.020174
estimator 9,0.242393,0.113692,0.188913,0.200195,0.153937,0.112841,0.231893,0.212015,1.0,0.068276,0.306697,0.294475,0.381501,0.067182,0.340807,0.253715,0.00748,0.082285,0.096957,0.019791
estimator 10,-0.0,-0.031607,0.158966,0.401878,0.140145,0.158966,0.358382,0.340659,0.068276,1.0,0.053376,0.256111,0.282131,0.189642,0.2789,0.021753,0.129628,-0.022875,0.222375,0.14122


#### **EXAMPLE:**

Imagine that you are a medical researcher compiling data for a study. You have collected data about a set of patients, all of whom suffered from the same illness. During their course of treatment, each patient responded to one of 5 medications, Drug A, Drug B, Drug c, Drug x and y.

Part of your job is to build a model to find out which drug might be appropriate for a future patient with the same illness. The features of this dataset are Age, Sex, Blood Pressure, and the Cholesterol of the patients, and the target is the drug that each patient responded to.

It is a sample of multiclass classifier, and you can use the training part of the dataset to build a decision tree, and then use it to predict the class of a unknown patient, or to prescribe a drug to a new patient.

In [58]:
import requests, io, certifi

drug_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%203/data/drug200.csv"
drug_resp = requests.get(drug_url, timeout=30, verify=certifi.where())
drug_resp.raise_for_status()
df = pd.read_csv(io.StringIO(drug_resp.text), delimiter=",")
df.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


In [59]:
# create the X and y for our dataset:
X = df[['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']].values
X[0:5]

array([[23, 'F', 'HIGH', 'HIGH', 25.355],
       [47, 'M', 'LOW', 'HIGH', 13.093],
       [47, 'M', 'LOW', 'HIGH', 10.114],
       [28, 'F', 'NORMAL', 'HIGH', 7.798],
       [61, 'F', 'LOW', 'HIGH', 18.043]], dtype=object)

In [60]:
y = df["Drug"]
y[0:5]

0    drugY
1    drugC
2    drugC
3    drugX
4    drugY
Name: Drug, dtype: object

In [61]:
# use a LabelEncoder to turn categorical features into numerical:

from sklearn import preprocessing
le_sex = preprocessing.LabelEncoder()
le_sex.fit(['F','M'])
X[:,1] = le_sex.transform(X[:,1]) 


le_BP = preprocessing.LabelEncoder()
le_BP.fit([ 'LOW', 'NORMAL', 'HIGH'])
X[:,2] = le_BP.transform(X[:,2])


le_Chol = preprocessing.LabelEncoder()
le_Chol.fit([ 'NORMAL', 'HIGH'])
X[:,3] = le_Chol.transform(X[:,3]) 

X[0:5]

array([[23, 0, 0, 0, 25.355],
       [47, 1, 1, 0, 13.093],
       [47, 1, 1, 0, 10.114],
       [28, 0, 2, 0, 7.798],
       [61, 0, 1, 0, 18.043]], dtype=object)

In [62]:
# Split the data into training and testing data with a 80/20 split

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (160, 5) (160,)
Test set: (40, 5) (40,)


In [64]:
# use GridSearch for Exhaustive search over specified parameter values

param_grid = {'n_estimators': [2*n+1 for n in range(20)],
             'max_depth' : [2*n+1 for n in range(10) ],
             'max_features':["auto", "sqrt", "log2"]}

In [66]:
# Create a RandomForestClassifier object called model

model = RandomForestClassifier()

Create GridSearchCV object called search with the estimator set to model, param_grid set to param_grid, scoring set to accuracy, and cv set to 3 and Fit the GridSearchCV object to our X_train and y_train data

In [69]:
from sklearn.model_selection import GridSearchCV

search = GridSearchCV(estimator=model, 
param_grid=param_grid,scoring='accuracy', cv=3)
search.fit(X_train, y_train)

600 fits failed out of a total of 1800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
600 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1358, in wrapper
    estimator._validate_params()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 471, in _validate_params
    validate_parameter_constraints(
  File "/Library/Frameworks/Python.framewo

0,1,2
,estimator,RandomForestClassifier()
,param_grid,"{'max_depth': [1, 3, ...], 'max_features': ['auto', 'sqrt', ...], 'n_estimators': [1, 3, ...]}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,3
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,21
,criterion,'gini'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'log2'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [72]:
# find the accuracy of the best model

search.best_score_

np.float64(1.0)

In [74]:
# find the best parameter values

search.best_params_

{'max_depth': 5, 'max_features': 'log2', 'n_estimators': 21}

In [76]:
# find the accuracy test data

print(get_accuracy(X_train, X_test, y_train, y_test, search.best_estimator_))

{'test Accuracy': 0.95, 'trian Accuracy': 1.0}
