# Random Forest Classifier With Pipeline And Hyperparameter Tuning

In [76]:
import seaborn as sns
df= sns.load_dataset('tips')
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [77]:
df.time.unique() # target

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

In [78]:
# convert into numerical

from sklearn.preprocessing import LabelEncoder
encoder= LabelEncoder()

df.time=encoder.fit_transform(df.time)

In [79]:
df.time.unique()

array([0, 1])

In [107]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,0,2
1,10.34,1.66,Male,No,Sun,0,3
2,21.01,3.50,Male,No,Sun,0,3
3,23.68,3.31,Male,No,Sun,0,2
4,24.59,3.61,Female,No,Sun,0,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,0,3
240,27.18,2.00,Female,Yes,Sat,0,2
241,22.67,2.00,Male,Yes,Sat,0,2
242,17.82,1.75,Male,No,Sat,0,2


In [80]:
# independent features and dependent feature

X= df.drop(labels=['time'], axis=1)
y= df.time

In [81]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [82]:
# pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer  # handling missing values
from sklearn.preprocessing import StandardScaler # feature scaling
from sklearn.preprocessing import OneHotEncoder # categorical to numerical
from sklearn.compose import ColumnTransformer # combine multiple pipeline

In [83]:
cat_col= ['sex', 'smoker', 'day']
num_col=['total_bill', 'tip', 'size']

In [84]:
# Feature Engineering Automation
# numerical pipeline for FE

num_pipline=Pipeline(
    steps=[('imputer' ,SimpleImputer(strategy='median')),
          ('preprocessing', StandardScaler())]
)

# categorical pipeline
cat_pipline=Pipeline(
    steps=[('imputer' ,SimpleImputer(strategy='most_frequent')),
          ('preprocessing', OneHotEncoder())]    
)


In [85]:
preprocessor= ColumnTransformer([
    ('num_pipline', num_pipline, num_col),
    ('cat_pipline', cat_pipline, cat_col)
])

In [86]:
X_train=preprocessor.fit_transform(X_train)
X_test=preprocessor.transform(X_test)

In [87]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [88]:
# model training automation

model={
    'Rnadom Forest': RandomForestClassifier(),
    'Logistic Regression': LogisticRegression()
    
}

In [89]:
from sklearn.metrics import accuracy_score

In [90]:
def model_auto(X_train, X_test, y_train, y_test, model):

    report={}
    for i in range(len(model)):
        model= list(model.values())[i] 
        # train model
        model.fit(X_train, y_train)

        #predict test model
        predict = model.predict(X_test)

        # get accuracy score for test data
        test_model_score = accuracy_score(y_test,predict)
        report[model] = test_model_score

   

        return report
        

In [93]:
model_auto(X_train, X_test, y_train, y_test, model)

{RandomForestClassifier(): 0.9591836734693877}

In [97]:
claf= RandomForestClassifier()

In [98]:
# hyperparameters

params={'max_depth':[3,5,10,None],
       'n_estimators':[100,200,300],
       'criterion':['gini','entropy']}

In [99]:
from sklearn.model_selection import RandomizedSearchCV

In [100]:
cv= RandomizedSearchCV(claf, param_distributions=params, scoring= 'accuracy', cv=5, verbose=3)

In [101]:
cv.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=entropy, max_depth=10, n_estimators=100;, score=0.974 total time=   0.1s
[CV 2/5] END criterion=entropy, max_depth=10, n_estimators=100;, score=0.923 total time=   0.1s
[CV 3/5] END criterion=entropy, max_depth=10, n_estimators=100;, score=1.000 total time=   0.2s
[CV 4/5] END criterion=entropy, max_depth=10, n_estimators=100;, score=0.949 total time=   0.2s
[CV 5/5] END criterion=entropy, max_depth=10, n_estimators=100;, score=0.923 total time=   0.1s
[CV 1/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=0.974 total time=   0.4s
[CV 2/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=0.923 total time=   0.5s
[CV 3/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=1.000 total time=   0.4s
[CV 4/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=0.949 total time=   0.3s
[CV 5/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=0.92

In [102]:
cv.best_params_

{'n_estimators': 100, 'max_depth': 10, 'criterion': 'entropy'}

# ----------------------------------------------

In [105]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [106]:
tips= sns.load_dataset('tips')