In [40]:
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer ## HAndle Missing Values
from sklearn.preprocessing import StandardScaler ## Feature Scaling
from sklearn.preprocessing import OneHotEncoder ## categorical to numerical
from sklearn.compose import ColumnTransformer # connecting pipelines 

In [25]:
# Load the tips dataset from the Excel workbook next to the notebook.
df = pd.read_excel('tips.xlsx')

In [32]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,2,Dinner,2
1,10.34,1.66,Male,No,2,Dinner,3
2,21.01,3.5,Male,No,2,Dinner,3
3,23.68,3.31,Male,No,2,Dinner,2
4,24.59,3.61,Female,No,2,Dinner,4


In [27]:
from sklearn.preprocessing import LabelEncoder

In [29]:
# 'day' is the prediction target: replace its labels with integer class codes.
# The fitted encoder is kept so the codes can be mapped back to labels later.
encoder = LabelEncoder()
df['day'] = encoder.fit_transform(df['day'])

In [30]:
# Show the frame after 'day' has been encoded.
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,2,Dinner,2
1,10.34,1.66,Male,No,2,Dinner,3
2,21.01,3.50,Male,No,2,Dinner,3
3,23.68,3.31,Male,No,2,Dinner,2
4,24.59,3.61,Female,No,2,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,1,Dinner,3
240,27.18,2.00,Female,Yes,1,Dinner,2
241,22.67,2.00,Male,Yes,1,Dinner,2
242,17.82,1.75,Male,No,1,Dinner,2


In [31]:
# Class balance of the encoded target.
df['day'].value_counts()

1    87
2    76
3    62
0    19
Name: day, dtype: int64

In [33]:
# Separate the features from the target column 'day'.
X = df.drop(columns='day')
y = df['day']

In [34]:
# Inspect the feature frame ('day' removed).
X

Unnamed: 0,total_bill,tip,sex,smoker,time,size
0,16.99,1.01,Female,No,Dinner,2
1,10.34,1.66,Male,No,Dinner,3
2,21.01,3.50,Male,No,Dinner,3
3,23.68,3.31,Male,No,Dinner,2
4,24.59,3.61,Female,No,Dinner,4
...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Dinner,3
240,27.18,2.00,Female,Yes,Dinner,2
241,22.67,2.00,Male,Yes,Dinner,2
242,17.82,1.75,Male,No,Dinner,2


In [35]:
# Inspect the encoded target series.
y

0      2
1      2
2      2
3      2
4      2
      ..
239    1
240    1
241    1
242    1
243    3
Name: day, Length: 244, dtype: int64

In [38]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=10,
)
(X_train.shape, X_test.shape)

((183, 6), (61, 6))

In [39]:
# Column groups routed to the categorical and numerical preprocessing pipelines.
cat_cols = [
    'sex',
    'smoker',
    'time',
]
num_cols = [
    'total_bill',
    'tip',
    'size',
]


In [41]:
# Numerical pipeline: fill missing values with the column median, then
# standardize each feature.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical pipeline: fill missing values with the most frequent category,
# then one-hot encode.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # handle_unknown='ignore' encodes a category that was not present when
        # the encoder was fitted as an all-zero row instead of raising during
        # transform() on held-out data.
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [43]:
# Send each column group through its own preprocessing pipeline and
# concatenate the results into a single feature matrix.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, num_cols),
        ('cat_pipeline', cat_pipeline, cat_cols),
    ]
)

In [44]:
# Render the assembled ColumnTransformer for a visual check.
preprocessor

In [45]:
# Learn the imputation/scaling/encoding parameters from the training split only,
# then apply the same fitted transform to the test split.
# NOTE: X_train / X_test are rebound to the transformed matrices, so re-run the
# train/test split cell before re-running this one.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [46]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [47]:
# Candidate classifiers, keyed by the display name used in the score report.
models = {
    # Fixed seed (same value as the train/test split) so the forest's random
    # sampling, and therefore its reported accuracy, is repeatable across runs.
    'Random Forest': RandomForestClassifier(random_state=10),
    'Logistic Regression': LogisticRegression(),
}

In [48]:
from sklearn.metrics import accuracy_score

In [49]:
def evaluate_model(X_train, y_train, X_test, y_test, models):
    """Fit every candidate model and score it on the held-out test data.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to each model's ``fit``.
    X_test, y_test
        Held-out features and labels used for scoring.
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        Each estimator is fitted in place.

    Returns
    -------
    dict
        ``{model name: test accuracy}``, in the iteration order of ``models``.
    """
    report = {}

    # Walk the name/estimator pairs directly instead of rebuilding
    # list(models.keys()) and list(models.values()) on every iteration.
    for name, model in models.items():
        # Train on the training split only.
        model.fit(X_train, y_train)

        # Score the predictions made on the held-out split.
        y_test_pred = model.predict(X_test)
        report[name] = accuracy_score(y_test, y_test_pred)

    return report


In [50]:
# Baseline test accuracy for every candidate model.
evaluate_model(X_train, y_train, X_test, y_test, models)

{'Random Forest': 0.6721311475409836, 'Logistic Regression': 0.639344262295082}

In [51]:
# Base estimator for the hyper-parameter search. The seed makes each candidate's
# fit repeatable, so cross-validation scores are comparable between runs.
classfier = RandomForestClassifier(random_state=10)

In [52]:
# Search space for the random forest: tree depth limit, number of trees and
# split-quality criterion.
params = {
    'max_depth': [3, 5, 10, None],
    'n_estimators': [100, 200, 300],
    'criterion': ['gini', 'entropy'],
}

In [53]:
from sklearn.model_selection import RandomizedSearchCV

In [54]:
# Randomized search over the forest hyper-parameters, scored by 5-fold
# cross-validated accuracy on the training split.
# random_state pins which parameter combinations are sampled, so the search
# (and the selected best_params_) is repeatable across runs.
cv = RandomizedSearchCV(
    classfier,
    param_distributions=params,
    scoring='accuracy',
    cv=5,
    verbose=3,
    random_state=10,
)
cv.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=entropy, max_depth=5, n_estimators=100;, score=0.622 total time=   0.2s
[CV 2/5] END criterion=entropy, max_depth=5, n_estimators=100;, score=0.622 total time=   0.2s
[CV 3/5] END criterion=entropy, max_depth=5, n_estimators=100;, score=0.568 total time=   0.2s
[CV 4/5] END criterion=entropy, max_depth=5, n_estimators=100;, score=0.694 total time=   0.2s
[CV 5/5] END criterion=entropy, max_depth=5, n_estimators=100;, score=0.556 total time=   0.2s
[CV 1/5] END criterion=entropy, max_depth=3, n_estimators=300;, score=0.595 total time=   0.6s
[CV 2/5] END criterion=entropy, max_depth=3, n_estimators=300;, score=0.649 total time=   0.6s
[CV 3/5] END criterion=entropy, max_depth=3, n_estimators=300;, score=0.541 total time=   0.6s
[CV 4/5] END criterion=entropy, max_depth=3, n_estimators=300;, score=0.639 total time=   0.6s
[CV 5/5] END criterion=entropy, max_depth=3, n_estimators=300;, score=0.528 total ti

In [55]:
# Hyper-parameter combination with the best mean cross-validated accuracy.
cv.best_params_

{'n_estimators': 100, 'max_depth': 5, 'criterion': 'entropy'}

In [57]:
# Predict the held-out split with the fitted search object.
y_pred = cv.predict(X_test)

In [58]:
# Test accuracy of the tuned model.
accuracy_score(y_test, y_pred)

0.6065573770491803