In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from tabulate import tabulate
%matplotlib inline
import os
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    recall_score,
    precision_score,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier, Pool, cv 

# Initializing the CatBoost model with lower verbosity (0 to suppress output)
catboost_model = CatBoostClassifier(verbose=0)  # 0 to suppress output

import warnings

warnings.filterwarnings("ignore", category=UserWarning, module="catboost")
sns.set()

data = pd.read_csv('C:/Users/Nikhil_Chamle/Desktop/Final_Projects/1. Coupon Recc/Data.csv')

df = data.copy()

pd.set_option('display.max_columns', 50)

# dropping irrelvant columns on basis of null values and dropping null rows, duplicated values
df = df.drop(columns='car')
df = df.dropna()
df = df.drop_duplicates(ignore_index=True)

# Renaming column

df = df.rename(columns = {'Accept(Y/N?)': 'Accept'})

# Creating new column 'Age' 
# (below21,21,26) -> young
# (31, 36, 41, 46) -> mid_age
# (above50) -> older

df.loc[df['age'] == 'below21', 'Age'] = 'young' 
df.loc[df['age'] == '21', 'Age'] = 'young' 
df.loc[df['age'] == '26', 'Age'] = 'young' 
df.loc[df['age'] == '31', 'Age'] = 'mid_age' 
df.loc[df['age'] == '36', 'Age'] = 'mid_age' 
df.loc[df['age'] == '41', 'Age'] = 'mid_age' 
df.loc[df['age'] == '46', 'Age'] = 'mid_age'
df.loc[df['age'] == '50plus', 'Age'] = 'older'

# Creating new column 'Income' 

# below 49999 - low income
# above 50000 - mid income

# we will not go for high income as High income individuals wont only be attracted by coupons and for these going to food places may vary on several other factors, can be for mid income level too but for now lets consider earlir

df.loc[df['income'] == 'Less than $12500','Income'] = 'Low'
df.loc[df['income'] == '$12500 - $24999', 'Income'] = 'Low'
df.loc[df['income'] == '$25000 - $37499','Income'] = 'Low'
df.loc[df['income'] == '$37500 - $49999', 'Income'] = 'Low'
df.loc[df['income'] == '$50000 - $62499', 'Income'] = 'Mid'
df.loc[df['income'] == '$62500 - $74999', 'Income'] = 'Mid'
df.loc[df['income'] == '$75000 - $87499','Income'] = 'Mid'
df.loc[df['income'] == '$87500 - $99999', 'Income'] = 'Mid'
df.loc[df['income'] == '$100000 or More', 'Income'] = 'Mid'

# we will create new feature based on education

# criteria High school(low_education)<college(low_education)<1st_degree(mid_education)<High_degree(high_education)

# low_education < mid_eduction < high_education

#Creating new feature by using logic statement for durantion required to travel
 
df.loc[df['education'] == 'Some High School', 'Level_of_education'] = 'low_education' 
df.loc[df['education'] == 'High School Graduate', 'Level_of_education'] = 'low_education' 
df.loc[df['education'] == 'Some college - no degree', 'Level_of_education'] = 'low_education' 
df.loc[df['education'] == 'Associates degree', 'Level_of_education'] = 'mid_education' 
df.loc[df['education'] == 'Bachelors degree', 'Level_of_education'] = 'mid_education' 
df.loc[df['education'] == 'Graduate degree (Masters or Doctorate)', 'Level_of_education'] = 'high_education' 

# creating new feature on basis of occupation and %acceptance of coupon

# criteria - =above 60% > High, 50-59.99% > Medium, below 49.99 > low acceptance

import pandas as pd

cross_tab = pd.crosstab(columns=[df['Accept']], index=[df['occupation']], margins=True, normalize='index')

# Calculate the acceptance percentage and categorize it
cross_tab['Acceptance%'] = cross_tab[1] * 100  # Calculate the acceptance percentage

# criteria for categorization
def categorize_acceptance(percentage):
    if percentage >= 60:
        return 'High'
    elif 50 <= percentage <= 59.99:
        return 'Medium'
    else:
        return 'Low'

# Applying the categorization function to the 'Acceptance%' column
cross_tab['Occupation_with_(1)%'] = cross_tab['Acceptance%'].apply(categorize_acceptance)

# Merging the 'Acceptance%' and 'Acceptance_Category' columns back into the 'df' DataFrame based on 'occupation'
df = df.merge(cross_tab[['Acceptance%', 'Occupation_with_(1)%']], left_on='occupation', right_index=True, how='left')

#Creating new feature by time required

df['Coupon_timeRequired_15to25'] = 0 
df.loc[(df['toCoupon_GEQ15min'] == 1) & (df['toCoupon_GEQ25min'] == 0), 'Coupon_timeRequired_15to25'] = 1
df.loc[df['toCoupon_GEQ15min'] == 0, 'Coupon_timeRequired_15to25'] = 0
df.loc[df['toCoupon_GEQ25min'] == 1, 'Coupon_timeRequired_15to25'] = 2

# we want to know if we offer a coupon to an individual does he accept it if he is the regular visitor to that category of place

Relative_coupon_Accepted_count = []

for i in range(df.shape[0]):
    if df['coupon'].iloc[i] == 'Restaurant(<20)':
        Relative_coupon_Accepted_count.append(df['RestaurantLessThan20'].iloc[i])
    elif df['coupon'].iloc[i] == 'Coffee House':
        Relative_coupon_Accepted_count.append(df['CoffeeHouse'].iloc[i])
    elif df['coupon'].iloc[i] == 'Carry out & Take away':
        Relative_coupon_Accepted_count.append(df['CarryAway'].iloc[i])
    elif df['coupon'].iloc[i] == 'Bar':
        Relative_coupon_Accepted_count.append(df['Bar'].iloc[i])
    elif df['coupon'].iloc[i] == 'Restaurant(20-50)':
        Relative_coupon_Accepted_count.append(df['Restaurant20To50'].iloc[i])

df['Relative_coupon_Accepted_count'] = Relative_coupon_Accepted_count

# dropping columns

df = df.drop(['Bar', 'CoffeeHouse', 'CarryAway',
       'RestaurantLessThan20', 'Restaurant20To50', 'age', 'income', 'education', 'occupation', 'Acceptance%', 'toCoupon_GEQ15min', 'toCoupon_GEQ25min', 'direction_same', 'direction_opp', 'toCoupon_GEQ5min'], axis=1)

categorical_features = ['destination', 'passanger', 'weather', 'temperature', 'coupon',
       'expiration', 'gender', 'maritalStatus', 'has_children',
       'Age', 'Income', 'Level_of_education', 'Occupation_with_(1)%',
       'Coupon_timeRequired_15to25', 'Relative_coupon_Accepted_count']

In [2]:
X = df.drop('Accept', axis=1)
y = df['Accept']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

catboost_model = CatBoostClassifier(iterations=100,  
                                    depth=6,         
                                    learning_rate=0.1, 
                                    cat_features=categorical_features,
                                    verbose=0)

catboost_model.fit(X_train, y_train)

y_pred = catboost_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"ROC AUC: {roc_auc_score(y_test, y_pred)}")
print()
print("----------------------------------------------------------------------------")



Accuracy: 0.7169491525423729
Confusion Matrix:
[[ 619  410]
 [ 258 1073]]
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.60      0.65      1029
           1       0.72      0.81      0.76      1331

    accuracy                           0.72      2360
   macro avg       0.71      0.70      0.71      2360
weighted avg       0.72      0.72      0.71      2360

Recall: 0.8061607813673929
Precision: 0.7235333782872556
ROC AUC: 0.7038578445223747

----------------------------------------------------------------------------


In [3]:
param_grid = {
    'iterations': [50, 100, 150],
    'depth': [4, 6, 8],
    'learning_rate': [0.05, 0.1, 0.2],
}

catboost_model = CatBoostClassifier(cat_features=categorical_features, verbose=0)

grid_search = GridSearchCV(catboost_model, param_grid, cv=3, scoring='accuracy', n_jobs=-1)

grid_search.fit(X_train, y_train)

print("Best Hyperparameters:", grid_search.best_params_)
print()

best_catboost_model = grid_search.best_estimator_

y_pred_best = best_catboost_model.predict(X_test)

print("Best Model - Accuracy:", accuracy_score(y_test, y_pred_best))
print(f"Best Model - Confusion Matrix:\n{confusion_matrix(y_test, y_pred_best)}")
print(f"Best Model - Classification Report:\n{classification_report(y_test, y_pred_best)}")
print(f"Best Model - Recall: {recall_score(y_test, y_pred_best)}")
print(f"Best Model - Precision: {precision_score(y_test, y_pred_best)}")
print(f"Best Model - ROC AUC: {roc_auc_score(y_test, y_pred_best)}")
print("----------------------------------------------------------------------------")


Best Hyperparameters: {'depth': 8, 'iterations': 150, 'learning_rate': 0.1}

Best Model - Accuracy: 0.7296610169491525
Best Model - Confusion Matrix:
[[ 640  389]
 [ 249 1082]]
Best Model - Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.62      0.67      1029
           1       0.74      0.81      0.77      1331

    accuracy                           0.73      2360
   macro avg       0.73      0.72      0.72      2360
weighted avg       0.73      0.73      0.73      2360

Best Model - Recall: 0.8129226145755072
Best Model - Precision: 0.7355540448674371
Best Model - ROC AUC: 0.717442842759085
----------------------------------------------------------------------------
