In [1]:
import seaborn as sns
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [2]:
df.shape

(244, 7)

In [3]:
df.time.unique()

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

In [4]:
# handling missing values
# handling categoricial features
# outliers
# feature scaling

# ---------------------------- AUTOMATE ---------------------------

# EDA

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [6]:
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
df['time'] = label.fit_transform(df['time'])

In [7]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,0,2
1,10.34,1.66,Male,No,Sun,0,3
2,21.01,3.5,Male,No,Sun,0,3
3,23.68,3.31,Male,No,Sun,0,2
4,24.59,3.61,Female,No,Sun,0,4


In [8]:
df.time.value_counts()

0    176
1     68
Name: time, dtype: int64

# Pipelining

In [9]:
# independent features and dependent features

x = df.drop(['time'],axis = 1)

In [10]:
y = df['time']

In [11]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2,random_state = 42)

In [12]:
from sklearn.impute import SimpleImputer    # handling missing values
from sklearn.preprocessing import OneHotEncoder     # handling categoricial features (data is nominal so we use OneHotEncoder)
from sklearn.preprocessing import StandardScaler   # feature scaling

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [13]:
categoical_cols = ['sex','smoker','day']
numerical_cols = ['total_bill','tip','size']

In [14]:
# automating feature engineering
num_pipeline = Pipeline(
    steps = [
        ('imputer',SimpleImputer(strategy = 'median')),
        ("scaler",StandardScaler())]
)

cat_pipeline = Pipeline(
    steps = [
        ('imputer',SimpleImputer(strategy = 'most_frequent')),
        ('onehotencoder',OneHotEncoder())
        ]
)

In [15]:
preprocessor = ColumnTransformer(
    [
        ( 'num_pipeline',num_pipeline,numerical_cols ),
        ('cat_pipeline',cat_pipeline,categoical_cols)
    ]
)

In [16]:
x_train = preprocessor.fit_transform(x_train)

In [17]:
x_test = preprocessor.fit_transform(x_test)

In [45]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [46]:
# Automate the model training process
models = {
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Logistic Regression' : LogisticRegression(),
    'SVM' : SVC(),
    "naive bayes" : GaussianNB()
}

In [47]:
from sklearn.metrics import accuracy_score, confusion_matrix


In [48]:
def eval_model(x_train,y_train,x_test,y_test,models):
  report = {}
  for i in range(len(models)):
    model = list(models.values())[i]
    model.fit(x_train,y_train)

    y_test_pred = model.predict(x_test)
    test_model_score = accuracy_score(y_test,y_test_pred)
    report[list(models.keys())[i]] = test_model_score
  return report

In [49]:
eval_model(x_train,y_train,x_test,y_test,models)

{'Random Forest': 0.9795918367346939,
 'Decision Tree': 0.9795918367346939,
 'Logistic Regression': 1.0,
 'SVM': 0.9795918367346939,
 'naive bayes': 0.9591836734693877}

In [52]:
classifier = RandomForestClassifier()
params = {
    'max_depth':  [3,5,10,None],
    'n_estimators' : [100,200,300],
    'criterion' : ['gini','entropy']

    }

from sklearn.model_selection import RandomizedSearchCV
cv = RandomizedSearchCV(classifier,param_distributions = params,cv = 5,scoring = 'accuracy',verbose = 3)
cv.fit(x_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=gini, max_depth=5, n_estimators=200;, score=0.974 total time=   0.5s
[CV 2/5] END criterion=gini, max_depth=5, n_estimators=200;, score=0.923 total time=   0.5s
[CV 3/5] END criterion=gini, max_depth=5, n_estimators=200;, score=0.974 total time=   0.5s
[CV 4/5] END criterion=gini, max_depth=5, n_estimators=200;, score=0.923 total time=   0.4s
[CV 5/5] END criterion=gini, max_depth=5, n_estimators=200;, score=0.923 total time=   0.3s
[CV 1/5] END criterion=gini, max_depth=10, n_estimators=100;, score=0.974 total time=   0.1s
[CV 2/5] END criterion=gini, max_depth=10, n_estimators=100;, score=0.923 total time=   0.2s
[CV 3/5] END criterion=gini, max_depth=10, n_estimators=100;, score=1.000 total time=   0.2s
[CV 4/5] END criterion=gini, max_depth=10, n_estimators=100;, score=0.897 total time=   0.2s
[CV 5/5] END criterion=gini, max_depth=10, n_estimators=100;, score=0.923 total time=   0.2s
[CV 1/5] END c

In [53]:
cv.best_params_

{'n_estimators': 300, 'max_depth': 3, 'criterion': 'gini'}

In [54]:
cv = RandomForestClassifier(n_estimators =  300, max_depth =  3, criterion = 'gini')

In [55]:
cv.fit(x_train,y_train)

In [57]:
y_test_predi = cv.predict(x_test)

In [58]:
accuracy_score(y_test,y_test_predi)

1.0