In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [3]:
df = sns.load_dataset('tips')
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [4]:
#EDA
#ENCODING< SCALING< MISSING VALUE TREATMENT

In [5]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['time'] = encoder.fit_transform(df['time'])
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,0,2
1,10.34,1.66,Male,No,Sun,0,3
2,21.01,3.50,Male,No,Sun,0,3
3,23.68,3.31,Male,No,Sun,0,2
4,24.59,3.61,Female,No,Sun,0,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,0,3
240,27.18,2.00,Female,Yes,Sat,0,2
241,22.67,2.00,Male,Yes,Sat,0,2
242,17.82,1.75,Male,No,Sat,0,2


In [6]:
#Aim>> to predict time

In [7]:
df.time.unique() #dinner>0, lunch>1

array([0, 1])

In [8]:
x = df.drop('time', axis=1)
y = df['time']

In [9]:
x, y

(     total_bill   tip     sex smoker   day  size
 0         16.99  1.01  Female     No   Sun     2
 1         10.34  1.66    Male     No   Sun     3
 2         21.01  3.50    Male     No   Sun     3
 3         23.68  3.31    Male     No   Sun     2
 4         24.59  3.61  Female     No   Sun     4
 ..          ...   ...     ...    ...   ...   ...
 239       29.03  5.92    Male     No   Sat     3
 240       27.18  2.00  Female    Yes   Sat     2
 241       22.67  2.00    Male    Yes   Sat     2
 242       17.82  1.75    Male     No   Sat     2
 243       18.78  3.00  Female     No  Thur     2
 
 [244 rows x 6 columns],
 0      0
 1      0
 2      0
 3      0
 4      0
       ..
 239    0
 240    0
 241    0
 242    0
 243    0
 Name: time, Length: 244, dtype: int64)

In [10]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
x_train.shape, y_train.shape

((195, 6), (195,))

In [11]:
df.isna().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [12]:
#Handling missing values
#Data encoding
#feature scaling

In [13]:
from sklearn.impute import SimpleImputer #For missing values
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [14]:
#Pipeline>> a sequence of data transformation
#Column transformer>> groups all the pipeline steps for each of the columns

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [16]:
cat_cols = ['sex', 'smoker', 'day']
num_cols = ['total_bill', 'tip', 'size']

In [17]:
#Feature engineering automation using pipeline and column transformer

In [18]:
num_pipeline = Pipeline(steps=[('imputation', SimpleImputer(strategy="median")), ('scaling', StandardScaler())])
cat_pipeline = Pipeline(steps=[('imputation', SimpleImputer(strategy="most_frequent")), ('encoding', OneHotEncoder())])

In [19]:
preprocessor = ColumnTransformer([("num-pipeline", num_pipeline, num_cols), ("cat-columns", cat_pipeline, cat_cols)])

In [20]:
x_train = preprocessor.fit_transform(x_train)
x_test = preprocessor.transform(x_test)

In [21]:
x_train

array([[-0.28611937, -1.47443803, -0.57766863, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.02695905, -0.71612531,  1.47042924, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.3716196 ,  1.19880579,  1.47042924, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.23206267,  0.43283335, -0.57766863, ...,  0.        ,
         0.        ,  1.        ],
       [-1.06543688, -1.29060464, -0.57766863, ...,  1.        ,
         0.        ,  0.        ],
       [-0.29287646,  0.1034652 ,  0.44638031, ...,  1.        ,
         0.        ,  0.        ]])

In [22]:
x_test

array([[-1.85376383, -1.48209775, -1.60171757,  1.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         0.        ],
       [-0.08453291,  0.04984713, -0.57766863,  1.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 0.79501474,  0.36389583,  0.44638031,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         0.        ],
       [-0.59356688, -0.33313909, -0.57766863,  0.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 0.18349826,  0.04984713, -0.57766863,  0.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-1.32783714, -1.14506988, -0.57766863,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         0.   

In [23]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
models = {"SVC" : SVC(), "DTC" : DecisionTreeClassifier(), "LogReg" : LogisticRegression()}

In [24]:
from sklearn.metrics import accuracy_score
def model_train_eval(x_train, x_test, y_train, y_test, models):
    evaluation = {}
    for i in range(len(models)):
        model = list(models.values())[i]
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        model_score = accuracy_score(y_test, y_pred)
        evaluation[list(models.keys())[i]] = model_score
    return evaluation

In [25]:
model_train_eval(x_train, x_test, y_train, y_test, models)

{'SVC': 0.9183673469387755,
 'DTC': 0.8979591836734694,
 'LogReg': 0.9183673469387755}

In [42]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(oob_score=False)
rf

In [43]:
rf.fit(x_train, y_train)

In [44]:
y_pred = rf.predict(x_test)
y_pred

array([0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0])

In [45]:
accuracy_score(y_test, y_pred) #0.8979591836734694

0.8979591836734694

In [46]:
#Hypreparameter tuning

In [47]:
from sklearn.model_selection import GridSearchCV
params = {
    'max_depth' : [1,2,3,10,20,30],
    'n_estimators' : [50,100,20,300,100,1000],
    'criterion' : ['gini', 'entropy']
}

In [48]:
params

{'max_depth': [1, 2, 3, 10, 20, 30],
 'n_estimators': [50, 100, 20, 300, 100, 1000],
 'criterion': ['gini', 'entropy']}

In [49]:
grid = GridSearchCV(estimator=rf, param_grid=params, cv=5, scoring='accuracy', verbose=3)
grid

In [50]:
grid.fit(x_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV 1/5] END criterion=gini, max_depth=1, n_estimators=50;, score=0.923 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=1, n_estimators=50;, score=0.974 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=1, n_estimators=50;, score=1.000 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=1, n_estimators=50;, score=1.000 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=1, n_estimators=50;, score=1.000 total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=1, n_estimators=100;, score=0.923 total time=   0.1s
[CV 2/5] END criterion=gini, max_depth=1, n_estimators=100;, score=0.974 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=1, n_estimators=100;, score=1.000 total time=   0.1s
[CV 4/5] END criterion=gini, max_depth=1, n_estimators=100;, score=0.923 total time=   0.1s
[CV 5/5] END criterion=gini, max_depth=1, n_estimators=100;, score=1.000 total time=   0.1s
[CV 1/5] END criterion=

In [51]:
grid.best_params_

{'criterion': 'gini', 'max_depth': 1, 'n_estimators': 50}

In [52]:
grid.best_score_

np.float64(0.9794871794871796)