# Step 1: Import Libraries

In [3]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-win_amd64.whl (124.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.1.4
Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier  # Boosting algorithm
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import pickle  # For saving the model

# Step 2: Load and Explore the Dataset

In [5]:
# Load dataset
df = sns.load_dataset("tips")

# Create target variable: High tipper (tip ≥ 20% of total_bill)
df["high_tipper"] = (df["tip"] / df["total_bill"] >= 0.2).astype(int)

# Check first 5 rows
print(df.head())

# Check for missing values
print(df.isnull().sum())

   total_bill   tip     sex smoker  day    time  size  high_tipper
0       16.99  1.01  Female     No  Sun  Dinner     2            0
1       10.34  1.66    Male     No  Sun  Dinner     3            0
2       21.01  3.50    Male     No  Sun  Dinner     3            0
3       23.68  3.31    Male     No  Sun  Dinner     2            0
4       24.59  3.61  Female     No  Sun  Dinner     4            0
total_bill     0
tip            0
sex            0
smoker         0
day            0
time           0
size           0
high_tipper    0
dtype: int64


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   total_bill   244 non-null    float64 
 1   tip          244 non-null    float64 
 2   sex          244 non-null    category
 3   smoker       244 non-null    category
 4   day          244 non-null    category
 5   time         244 non-null    category
 6   size         244 non-null    int64   
 7   high_tipper  244 non-null    int32   
dtypes: category(4), float64(2), int32(1), int64(1)
memory usage: 8.3 KB


In [7]:
df.describe()

Unnamed: 0,total_bill,tip,size,high_tipper
count,244.0,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672,0.159836
std,8.902412,1.383638,0.9511,0.367207
min,3.07,1.0,1.0,0.0
25%,13.3475,2.0,2.0,0.0
50%,17.795,2.9,2.0,0.0
75%,24.1275,3.5625,3.0,0.0
max,50.81,10.0,6.0,1.0


# Step 3: Data Preprocessing

In [8]:
# Encode 'sex', 'smoker', 'day', and 'time' using LabelEncoder
le_sex = LabelEncoder()
df["sex"] = le_sex.fit_transform(df["sex"])

le_smoker = LabelEncoder()
df["smoker"] = le_smoker.fit_transform(df["smoker"])

le_day = LabelEncoder()
df["day"] = le_day.fit_transform(df["day"])

le_time = LabelEncoder()
df["time"] = le_time.fit_transform(df["time"])

In [9]:
#defining x and y

X = df.drop("high_tipper", axis=1)
y = df["high_tipper"]

# Step 4: Split Data into Train and Test Sets

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Step 5: Train a Single Decision Tree (Baseline Model)

In [13]:
from sklearn.tree import DecisionTreeClassifier

In [14]:
# Train a single Decision Tree
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)

# Evaluate
y_pred_tree = tree.predict(X_test)
print("Single Tree Accuracy:", accuracy_score(y_test, y_pred_tree))
print(classification_report(y_test, y_pred_tree))

Single Tree Accuracy: 0.8513513513513513
              precision    recall  f1-score   support

           0       0.92      0.90      0.91        62
           1       0.54      0.58      0.56        12

    accuracy                           0.85        74
   macro avg       0.73      0.74      0.74        74
weighted avg       0.86      0.85      0.85        74



# Step 6: Implement Boosting with XGBoost


In [15]:
# Initialize XGBoost Classifier
xgb = XGBClassifier(
    n_estimators=100,          # Number of boosting rounds
    max_depth=3,               # Maximum depth of each tree
    learning_rate=0.1,         # Step size to prevent overfitting
    subsample=0.8,             # Fraction of rows used per tree
    colsample_bytree=1.0,      # Fraction of columns used per tree
    objective='binary:logistic',  # Binary classification task
    eval_metric='logloss',     # Evaluation metric
    use_label_encoder=False,   # Avoid warning for newer versions
    random_state=42
)

# Train the model
xgb.fit(X_train, y_train)

# Evaluate
y_pred_xgb = xgb.predict(X_test)
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))

XGBoost Accuracy: 0.9324324324324325
              precision    recall  f1-score   support

           0       0.93      1.00      0.96        62
           1       1.00      0.58      0.74        12

    accuracy                           0.93        74
   macro avg       0.96      0.79      0.85        74
weighted avg       0.94      0.93      0.92        74



Parameters: { "use_label_encoder" } are not used.



# Step 8: Save the Model Using pickle

In [None]:
# Save the XGBoost model
with open("xgboost_tips_model.pkl", "wb") as file:
    pickle.dump(xgb, file)

# Save encoders for future inference
with open("sex_encoder.pkl", "wb") as file:
    pickle.dump(le_sex, file)

with open("smoker_encoder.pkl", "wb") as file:
    pickle.dump(le_smoker, file)

with open("day_encoder.pkl", "wb") as file:
    pickle.dump(le_day, file)

with open("time_encoder.pkl", "wb") as file:
    pickle.dump(le_time, file)

In [None]:
>> feature scaling, treating outliers, handling missing values is not required.

In [None]:
AdaBoost is part of scikit-learn:
    No separate installation required.
    
    from sklearn.ensemble import AdaBoostClassifier
    n_estimators: Number of weak learners (trees) .
    estimator: Base model (default is a decision stump) 

In [None]:
CatBoost Implementation:
    pip install catboost
    
    from catboost import CatBoostClassifier
    
    cat = CatBoostClassifier(iterations=100, learning_rate=0.1, depth=3, verbose=0) 
    
parameters:
    iterations: Number of boosting iterations .
learning_rate: Controls step size for updates .
depth: Maximum depth of trees