<span style="font-size: 20px;">Random Forest Classifier with Pipeline and Hyperparameter Tuning</span>

In [24]:
import seaborn as sns

# Load the `tips` dataset through seaborn and preview its first rows.
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [2]:
# Distinct values present in the `day` column.
df['day'].unique()

['Sun', 'Sat', 'Thur', 'Fri']
Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']

In [3]:
# Number of missing values in each column.
df.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [4]:
# Distinct values present in the `time` column (used below as the target).
df['time'].unique()

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

In [5]:
from sklearn.preprocessing import LabelEncoder

# Replace the string labels in `time` with integer codes. The fitted encoder
# is kept in `encoder` so the codes can be related back to the labels.
encoder = LabelEncoder()
df['time'] = encoder.fit_transform(df['time'])

In [6]:
# Preview the first rows of `df` again.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,0,2
1,10.34,1.66,Male,No,Sun,0,3
2,21.01,3.5,Male,No,Sun,0,3
3,23.68,3.31,Male,No,Sun,0,2
4,24.59,3.61,Female,No,Sun,0,4


In [21]:
## Independent and dependent features

# `time` is the prediction target; every remaining column is a feature.
y = df['time']
X = df.drop(columns=['time'])

In [8]:
# Preview the first rows of the feature frame.
X.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,size
0,16.99,1.01,Female,No,Sun,2
1,10.34,1.66,Male,No,Sun,3
2,21.01,3.5,Male,No,Sun,3
3,23.68,3.31,Male,No,Sun,2
4,24.59,3.61,Female,No,Sun,4


In [23]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [12]:
from sklearn.pipeline import Pipeline #chains preprocessing steps in order
from sklearn.impute import SimpleImputer #handles missing values
from sklearn.preprocessing import StandardScaler #Feature Scaling
from sklearn.preprocessing import OneHotEncoder #categorical to numerical
from sklearn.compose import ColumnTransformer #applies each pipeline to its own list of columns



In [10]:
# Feature columns grouped by the kind of preprocessing they receive.
categorical_cols = [
    'sex',
    'smoker',
    'day',
]
numerical_cols = [
    'total_bill',
    'tip',
    'size',
]

In [29]:
# Numerical pipeline: impute with the median, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # missing values
    ('scaler', StandardScaler()),                   # scaling
])

# Categorical pipeline: impute with the most frequent value, then one-hot encode.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # missing values
    # The encoder is fitted on the training split only. With the default
    # handle_unknown='error', a category that appears only in data passed to
    # transform() raises; 'ignore' encodes it as an all-zero row instead.
    ('onehotencoder', OneHotEncoder(handle_unknown='ignore')),  # encoding
])

In [30]:
# Route each group of columns through its own preprocessing pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols),
    ]
)

In [31]:
# Fit the preprocessor on the training split only, then apply the already
# fitted transformation to the test split.
# NOTE(review): both names are rebound to the transformed output, so the raw
# splits are no longer reachable through them. Presumably this cell cannot be
# run twice without re-running the train/test split first (the preprocessor
# selects columns by name) — confirm before re-executing.
X_train=preprocessor.fit_transform(X_train)
X_test=preprocessor.transform(X_test)

In [46]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [47]:
# Model training automation: candidate estimators keyed by display name.
# The tree-based models are randomised, so they get a fixed seed (the same
# value used for the train/test split) to make the reported scores repeatable.
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logisitic Regression': LogisticRegression(),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=42),
}


In [34]:
from sklearn.metrics import accuracy_score

In [48]:
def evaluate_model(X_train,y_train,X_test,y_test,models):
    """Fit each model on the training split and score it on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to ``model.fit``.
    X_test, y_test
        Held-out features and labels; ``model.predict(X_test)`` is compared
        against ``y_test`` with ``accuracy_score``.
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.

    Returns
    -------
    dict
        Maps each name in ``models`` to its test accuracy, in the same order.
        An empty ``models`` yields an empty dict.

    Notes
    -----
    The estimators in ``models`` are fitted in place.
    """
    report = {}

    # Walk the name/estimator pairs directly rather than rebuilding
    # list(models.keys()) and list(models.values()) on every iteration.
    for name, model in models.items():
        # Train on the training split.
        model.fit(X_train, y_train)

        # Score on the held-out split.
        y_test_pred = model.predict(X_test)
        report[name] = accuracy_score(y_test, y_test_pred)

    return report

In [49]:
# Fit every candidate in `models` and show its accuracy on the test split.
evaluate_model(X_train,y_train,X_test,y_test,models)

{'Random Forest': 0.9672131147540983,
 'Logisitic Regression': 1.0,
 'DecisionTreeClassifier': 0.9508196721311475}

In [50]:
# Base estimator for hyperparameter tuning. A random forest is randomised,
# so a fixed seed keeps its fitted trees and scores repeatable across runs.
classifier = RandomForestClassifier(random_state=42)


In [52]:
## Hyperparameter tuning

# Candidate values for each random-forest hyperparameter to search over.
params = dict(
    max_depth=[3, 5, 10, None],
    n_estimators=[100, 200, 300],
    criterion=['gini', 'entropy'],
)

In [53]:
from sklearn.model_selection import RandomizedSearchCV

In [56]:
# Randomized search over `params`, scored by 5-fold cross-validated accuracy.
Cv = RandomizedSearchCV(
    classifier,
    param_distributions=params,
    n_iter=10,        # the library default, made explicit (the log reports 10 candidates)
    scoring='accuracy',
    cv=5,
    verbose=3,
    random_state=42,  # fixes which parameter combinations get sampled
)

In [57]:
# Run the search on the training split; verbose=3 prints one line per fold fit.
Cv.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=gini, max_depth=3, n_estimators=300;, score=0.946 total time=   0.6s
[CV 2/5] END criterion=gini, max_depth=3, n_estimators=300;, score=0.973 total time=   0.6s
[CV 3/5] END criterion=gini, max_depth=3, n_estimators=300;, score=0.973 total time=   0.6s
[CV 4/5] END criterion=gini, max_depth=3, n_estimators=300;, score=0.917 total time=   0.6s
[CV 5/5] END criterion=gini, max_depth=3, n_estimators=300;, score=0.917 total time=   0.5s
[CV 1/5] END criterion=entropy, max_depth=3, n_estimators=100;, score=0.946 total time=   0.2s
[CV 2/5] END criterion=entropy, max_depth=3, n_estimators=100;, score=0.973 total time=   0.2s
[CV 3/5] END criterion=entropy, max_depth=3, n_estimators=100;, score=0.973 total time=   0.2s
[CV 4/5] END criterion=entropy, max_depth=3, n_estimators=100;, score=0.917 total time=   0.2s
[CV 5/5] END criterion=entropy, max_depth=3, n_estimators=100;, score=0.917 total time=   0.2s
[CV 