In [1]:
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

### Reading Dataset

In [2]:
dataset = sns.load_dataset('tips')

In [3]:
dataset.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [5]:
dataset.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [6]:
dataset.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [7]:
dataset['time'].unique()

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

### In this problem statement we need to predict time.

### Transforming time into numeric feature

In [8]:
label_encoder = LabelEncoder()
dataset['time'] = label_encoder.fit_transform(dataset['time'])

In [9]:
dataset.time.unique()

array([0, 1])

### Dependent and Independent features

In [10]:
X = dataset.drop('time', axis = 1) # Independent features
y = dataset.time # Dependent features

In [11]:
X.shape, y.shape

((244, 6), (244,))

### Splitting the dataset into train and test datasets

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 15)

In [13]:
X_train.shape, y_train.shape

((195, 6), (195,))

In [14]:
X_test.shape, y_test.shape

((49, 6), (49,))

### Categorical features

In [15]:
categorical_features = [feature for feature in X_train.columns if X_train[feature].dtype == 'category']

In [16]:
categorical_features

['sex', 'smoker', 'day']

### Numerical features

In [17]:
numerical_features = [feature for feature in X_train.columns if X_train[feature].dtype != 'category']

In [18]:
numerical_features

['total_bill', 'tip', 'size']

## Automating Feature Engineering for deployment using **Pipeline**

### Pipeline for numeric features

In [19]:
numerical_pipeline = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy = 'median')), # Handing Missing Values
        ('scaler', StandardScaler()) # Feature Scaling
    ]
)

### Pipeline for categorical features

In [20]:
categorical_pipeline = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy = 'most_frequent')), # Handling Missing Values
        ('one_hot_encoder', OneHotEncoder()) # Nominal features to numeric features
    ]
)

### Combining both the pipelines using **ColumnTransformer**

In [21]:
preprocessor = ColumnTransformer(
    transformers = [
        ('numerical_pipeline', numerical_pipeline, numerical_features),
        ('categorical_pipeline', categorical_pipeline, categorical_features)
    ]
)

### Preprocessing: Applying piplines

In [22]:
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

## Model training automation

### Function to evaluate different models

In [23]:
def evaluate_model(X_train, y_train, X_test, y_test, models):

    report = {} # Stores stores key value pairs as model_name and model_accuracy

    for model_name in models.keys():

        model = models[model_name]

        ### Train Model
        model.fit(X_train, y_train)

        ### Prediction of the model
        y_pred_test = model.predict(X_test)

        ### Accuracy score of the model
        report[model_name] = accuracy_score(y_test, y_pred_test)

    return report

### Defining models

In [24]:
models = {
    'logistic_classifier' : LogisticRegression(),
    'decision_tree_classifier' : DecisionTreeClassifier(),
    'random_forest_classifier' : RandomForestClassifier()
}

### Evaluating different models

In [25]:
report = evaluate_model(X_train_scaled, y_train, X_test_scaled, y_test, models)

In [26]:
report

{'logistic_classifier': 0.8979591836734694,
 'decision_tree_classifier': 0.9387755102040817,
 'random_forest_classifier': 0.8979591836734694}

### Retriving the best model accuracy

In [27]:
max(sorted(report.values()))

0.9387755102040817

### Hyperparameter tuning for Random Forest Classifier using RandomizedSearchCV

In [28]:
parameters = {
    'n_estimators' : (50, 100, 200, 300),
    'criterion' : ('gini', 'entropy'),
    'max_depth' : (3, 5, 10, None)
}

### RandomizedSearchCV

In [29]:
classifier = RandomForestClassifier()
randomized_search_cv_clf = RandomizedSearchCV(classifier, param_distributions = parameters, cv = 5, scoring = 'accuracy', verbose = 3)

In [30]:
randomized_search_cv_clf.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=gini, max_depth=5, n_estimators=100;, score=1.000 total time=   0.1s
[CV 2/5] END criterion=gini, max_depth=5, n_estimators=100;, score=0.974 total time=   0.1s
[CV 3/5] END criterion=gini, max_depth=5, n_estimators=100;, score=0.949 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=5, n_estimators=100;, score=1.000 total time=   0.1s
[CV 5/5] END criterion=gini, max_depth=5, n_estimators=100;, score=0.974 total time=   0.1s


[CV 1/5] END criterion=entropy, max_depth=None, n_estimators=300;, score=0.974 total time=   0.5s
[CV 2/5] END criterion=entropy, max_depth=None, n_estimators=300;, score=0.974 total time=   0.6s
[CV 3/5] END criterion=entropy, max_depth=None, n_estimators=300;, score=0.949 total time=   0.9s
[CV 4/5] END criterion=entropy, max_depth=None, n_estimators=300;, score=1.000 total time=   0.5s
[CV 5/5] END criterion=entropy, max_depth=None, n_estimators=300;, score=0.974 total time=   0.7s
[CV 1/5] END criterion=gini, max_depth=5, n_estimators=300;, score=0.974 total time=   0.5s
[CV 2/5] END criterion=gini, max_depth=5, n_estimators=300;, score=0.974 total time=   0.3s
[CV 3/5] END criterion=gini, max_depth=5, n_estimators=300;, score=0.949 total time=   0.3s
[CV 4/5] END criterion=gini, max_depth=5, n_estimators=300;, score=1.000 total time=   0.3s
[CV 5/5] END criterion=gini, max_depth=5, n_estimators=300;, score=0.974 total time=   0.2s
[CV 1/5] END criterion=entropy, max_depth=10, n_es

In [32]:
randomized_search_cv_clf.best_params_

{'n_estimators': 100, 'max_depth': 5, 'criterion': 'gini'}

In [33]:
random_forest_classifier = RandomForestClassifier(n_estimators = 100, max_depth = 5, criterion = 'gini')

In [34]:
random_forest_classifier.fit(X_train_scaled, y_train)

### Prediction

In [35]:
y_pred_test = random_forest_classifier.predict(X_test_scaled)

### Confusion Matrix, Accuracy Score and Classification Report

In [38]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))

Confusion Matrix:
 [[32  2]
 [ 2 13]]


In [40]:
print("Accuracy Score: ", accuracy_score(y_test, y_pred_test))

Accuracy Score:  0.9183673469387755


In [41]:
print(
        "------------------Classification Report------------------\n",
        classification_report(y_test, y_pred_test)
)

------------------Classification Report------------------
               precision    recall  f1-score   support

           0       0.94      0.94      0.94        34
           1       0.87      0.87      0.87        15

    accuracy                           0.92        49
   macro avg       0.90      0.90      0.90        49
weighted avg       0.92      0.92      0.92        49

