In [None]:
>> We’ll use the Seaborn "tips" dataset. 
>> The goal is to predict whether a customer will be a "high tipper" (tip ≥ 20% of total bill) based on features like total bill, day, time, etc.

# Step 1: Import Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import pickle 

# Step 2: Load and Explore the Dataset

In [2]:
# Load dataset
df = sns.load_dataset("tips")


In [3]:
# Create target variable: High tipper (tip ≥ 20% of total_bill)
df["high_tipper"] = (df["tip"] / df["total_bill"] >= 0.2).astype(int)


In [4]:
# Check first 5 rows
print(df.head())

   total_bill   tip     sex smoker  day    time  size  high_tipper
0       16.99  1.01  Female     No  Sun  Dinner     2            0
1       10.34  1.66    Male     No  Sun  Dinner     3            0
2       21.01  3.50    Male     No  Sun  Dinner     3            0
3       23.68  3.31    Male     No  Sun  Dinner     2            0
4       24.59  3.61  Female     No  Sun  Dinner     4            0


In [5]:
# Check for missing values
print(df.isnull().sum())

total_bill     0
tip            0
sex            0
smoker         0
day            0
time           0
size           0
high_tipper    0
dtype: int64


In [6]:
df.describe()

Unnamed: 0,total_bill,tip,size,high_tipper
count,244.0,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672,0.159836
std,8.902412,1.383638,0.9511,0.367207
min,3.07,1.0,1.0,0.0
25%,13.3475,2.0,2.0,0.0
50%,17.795,2.9,2.0,0.0
75%,24.1275,3.5625,3.0,0.0
max,50.81,10.0,6.0,1.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   total_bill   244 non-null    float64 
 1   tip          244 non-null    float64 
 2   sex          244 non-null    category
 3   smoker       244 non-null    category
 4   day          244 non-null    category
 5   time         244 non-null    category
 6   size         244 non-null    int64   
 7   high_tipper  244 non-null    int32   
dtypes: category(4), float64(2), int32(1), int64(1)
memory usage: 8.3 KB


# Step 3: Data Preprocessing

In [None]:
# Encode 'sex', 'smoker', 'day', and 'time' using LabelEncoder

In [10]:
le_sex = LabelEncoder()
df["sex"] = le_sex.fit_transform(df["sex"])

In [11]:
le_smoker = LabelEncoder()
df["smoker"] = le_smoker.fit_transform(df["smoker"])


In [12]:
le_day = LabelEncoder()
df["day"] = le_day.fit_transform(df["day"])

In [13]:
le_time = LabelEncoder()
df["time"] = le_time.fit_transform(df["time"])

In [14]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,high_tipper
0,16.99,1.01,0,0,2,0,2,0
1,10.34,1.66,1,0,2,0,3,0
2,21.01,3.50,1,0,2,0,3,0
3,23.68,3.31,1,0,2,0,2,0
4,24.59,3.61,0,0,2,0,4,0
...,...,...,...,...,...,...,...,...
239,29.03,5.92,1,0,1,0,3,1
240,27.18,2.00,0,1,1,0,2,0
241,22.67,2.00,1,1,1,0,2,0
242,17.82,1.75,1,0,1,0,2,0


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   total_bill   244 non-null    float64
 1   tip          244 non-null    float64
 2   sex          244 non-null    int32  
 3   smoker       244 non-null    int64  
 4   day          244 non-null    int32  
 5   time         244 non-null    int32  
 6   size         244 non-null    int64  
 7   high_tipper  244 non-null    int32  
dtypes: float64(2), int32(4), int64(2)
memory usage: 11.6 KB


In [16]:
X = df.drop("high_tipper", axis=1)
y = df["high_tipper"]

# Step 4: Split Data into Train and Test Sets

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Step 5: Train a Single Decision Tree (Baseline Model)

In [18]:
# Train a single Decision Tree
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)

In [19]:
# Evaluate
y_pred_tree = tree.predict(X_test)
print("Single Tree Accuracy:", accuracy_score(y_test, y_pred_tree))
print(classification_report(y_test, y_pred_tree))

Single Tree Accuracy: 0.8513513513513513
              precision    recall  f1-score   support

           0       0.92      0.90      0.91        62
           1       0.54      0.58      0.56        12

    accuracy                           0.85        74
   macro avg       0.73      0.74      0.74        74
weighted avg       0.86      0.85      0.85        74



# Step 6: Implement Bagging with Decision Trees

In [20]:
# Initialize Bagging with 100 Decision Trees
bagging = BaggingClassifier(
    estimator=DecisionTreeClassifier(random_state=42),  # Base model
    n_estimators=100,                                   # Number of trees
    bootstrap=True,                                     # Bootstrap sampling
    n_jobs=-1,                                          # Parallelize training
    random_state=42                                     # Reproducibility
)

In [None]:
>> n_estimators=100==Number of decision trees to train.
>> bootstrap=True==Enables bootstrap sampling (random sampling with replacement) for rows.
>> max_samples=0.7==Each tree is trained on 70% of the training data (randomly sampled with replacement).
>> bootstrap_features=False==Disables resampling of features (columns). All features are used for each tree.
>> max_features=1.0== Each tree uses 100% of the features (all columns). Lower values (e.g., 0.5) would randomly sample a subset of features, increasing diversity
>> n_jobs=-1 == Uses all CPU cores (n_jobs=-1) for parallelization, speeding up training.
>> random_state=42== Ensures reproducibility by fixing the random seed for sampling and tree splits.


In [None]:
Rows (Samples):
    With max_samples=0.7, each tree uses 70% of the training data (e.g., if X_train has 100 rows, each tree uses ~70 rows).

Columns (Features) :
    With max_features=1.0 and bootstrap_features=False, all features (columns) are used for every tree.
If max_features=0.5, each tree would use 50% of the features (randomly selected)

In [21]:
# Train the model
bagging.fit(X_train, y_train)

In [22]:
# Evaluate
y_pred_bagging = bagging.predict(X_test)
print("Bagging Accuracy:", accuracy_score(y_test, y_pred_bagging))
print(classification_report(y_test, y_pred_bagging))

Bagging Accuracy: 0.9054054054054054
              precision    recall  f1-score   support

           0       0.92      0.97      0.94        62
           1       0.78      0.58      0.67        12

    accuracy                           0.91        74
   macro avg       0.85      0.78      0.81        74
weighted avg       0.90      0.91      0.90        74



# Step 8: Save the Model Using pickle

In [None]:
# Save the Bagging model
with open("bagging_tips_model.pkl", "wb") as file:
    pickle.dump(bagging, file)

# Save encoders for future inference
with open("sex_encoder.pkl", "wb") as file:
    pickle.dump(le_sex, file)

with open("smoker_encoder.pkl", "wb") as file:
    pickle.dump(le_smoker, file)

with open("day_encoder.pkl", "wb") as file:
    pickle.dump(le_day, file)

with open("time_encoder.pkl", "wb") as file:
    pickle.dump(le_time, file)

In [None]:
>> treating missing values, treating outliers, feature scaling is not required
>> tree models don't require this
>> Bagging uses Decision Trees as base estimators, which are not sensitive to feature scaling 
