In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from tabulate import tabulate
%matplotlib inline
import os
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    recall_score,
    precision_score,
    roc_auc_score,
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier, 
)
from lightgbm import LGBMClassifier 
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier, Pool, cv 
from sklearn.ensemble import ExtraTreesClassifier 
import xgboost as xgb 

# Initializing the LightGBM model with lower verbosity
lgbm_model = LGBMClassifier(verbosity=-1)  # -1 for minimal verbosity

# Initializing the CatBoost model with lower verbosity (0 to suppress output)
catboost_model = CatBoostClassifier(verbose=0)  # 0 to suppress output

import warnings

# Suppressing LightGBM and CatBoost warnings
warnings.filterwarnings("ignore", category=UserWarning, module="lightgbm")
warnings.filterwarnings("ignore", category=UserWarning, module="catboost")
sns.set()

In [2]:
data = pd.read_csv('C:/Users/Nikhil_Chamle/Desktop/Project_10_Cupom reco/Data.csv')

In [3]:
df = data.copy()

In [4]:
pd.set_option('display.max_columns', 50)

In [5]:
# dropping irrelvant columns on basis of null values and dropping null rows, duplicated values
df = df.drop(columns='car')
df = df.dropna()
df = df.drop_duplicates(ignore_index=True)

In [6]:
# Renaming column

df = df.rename(columns = {'Accept(Y/N?)': 'Accept'})

In [7]:
# Creating new column 'Age' 
# (below21,21,26) -> young
# (31, 36, 41, 46) -> mid_age
# (above50) -> older

df.loc[df['age'] == 'below21', 'Age'] = 'young' 
df.loc[df['age'] == '21', 'Age'] = 'young' 
df.loc[df['age'] == '26', 'Age'] = 'young' 
df.loc[df['age'] == '31', 'Age'] = 'mid_age' 
df.loc[df['age'] == '36', 'Age'] = 'mid_age' 
df.loc[df['age'] == '41', 'Age'] = 'mid_age' 
df.loc[df['age'] == '46', 'Age'] = 'mid_age'
df.loc[df['age'] == '50plus', 'Age'] = 'older'

In [8]:
# Creating new column 'Income' 

# below 49999 - low income
# above 50000 - mid income

# we will not go for high income as High income individuals wont only be attracted by coupons and for these going to food places may vary on several other factors, can be for mid income level too but for now lets consider earlir

df.loc[df['income'] == 'Less than $12500','Income'] = 'Low'
df.loc[df['income'] == '$12500 - $24999', 'Income'] = 'Low'
df.loc[df['income'] == '$25000 - $37499','Income'] = 'Low'
df.loc[df['income'] == '$37500 - $49999', 'Income'] = 'Low'
df.loc[df['income'] == '$50000 - $62499', 'Income'] = 'Mid'
df.loc[df['income'] == '$62500 - $74999', 'Income'] = 'Mid'
df.loc[df['income'] == '$75000 - $87499','Income'] = 'Mid'
df.loc[df['income'] == '$87500 - $99999', 'Income'] = 'Mid'
df.loc[df['income'] == '$100000 or More', 'Income'] = 'Mid'

In [9]:
# we will create new feature based on education

# criteria High school(low_education)<college(low_education)<1st_degree(mid_education)<High_degree(high_education)

# low_education < mid_eduction < high_education

#Creating new feature by using logic statement for durantion required to travel
 
df.loc[df['education'] == 'Some High School', 'Level_of_education'] = 'low_education' 
df.loc[df['education'] == 'High School Graduate', 'Level_of_education'] = 'low_education' 
df.loc[df['education'] == 'Some college - no degree', 'Level_of_education'] = 'low_education' 
df.loc[df['education'] == 'Associates degree', 'Level_of_education'] = 'mid_education' 
df.loc[df['education'] == 'Bachelors degree', 'Level_of_education'] = 'mid_education' 
df.loc[df['education'] == 'Graduate degree (Masters or Doctorate)', 'Level_of_education'] = 'high_education' 

In [10]:
# creating new feature on basis of occupation and %acceptance of coupon

# criteria - =above 60% > High, 50-59.99% > Medium, below 49.99 > low acceptance

import pandas as pd

cross_tab = pd.crosstab(columns=[df['Accept']], index=[df['occupation']], margins=True, normalize='index')

# Calculate the acceptance percentage and categorize it
cross_tab['Acceptance%'] = cross_tab[1] * 100  # Calculate the acceptance percentage

# criteria for categorization
def categorize_acceptance(percentage):
    if percentage >= 60:
        return 'High'
    elif 50 <= percentage <= 59.99:
        return 'Medium'
    else:
        return 'Low'

# Applying the categorization function to the 'Acceptance%' column
cross_tab['Occupation_with_(1)%'] = cross_tab['Acceptance%'].apply(categorize_acceptance)

# Merging the 'Acceptance%' and 'Acceptance_Category' columns back into the 'df' DataFrame based on 'occupation'
df = df.merge(cross_tab[['Acceptance%', 'Occupation_with_(1)%']], left_on='occupation', right_index=True, how='left')

In [11]:
#Creating new feature by time required

df['Coupon_timeRequired_15to25'] = 0 
df.loc[(df['toCoupon_GEQ15min'] == 1) & (df['toCoupon_GEQ25min'] == 0), 'Coupon_timeRequired_15to25'] = 1
df.loc[df['toCoupon_GEQ15min'] == 0, 'Coupon_timeRequired_15to25'] = 0
df.loc[df['toCoupon_GEQ25min'] == 1, 'Coupon_timeRequired_15to25'] = 2

In [12]:
# we want to know if we offer a coupon to an individual does he accept it if he is the regular visitor to that category of place

Relative_coupon_Accepted_count = []

for i in range(df.shape[0]):
    if df['coupon'].iloc[i] == 'Restaurant(<20)':
        Relative_coupon_Accepted_count.append(df['RestaurantLessThan20'].iloc[i])
    elif df['coupon'].iloc[i] == 'Coffee House':
        Relative_coupon_Accepted_count.append(df['CoffeeHouse'].iloc[i])
    elif df['coupon'].iloc[i] == 'Carry out & Take away':
        Relative_coupon_Accepted_count.append(df['CarryAway'].iloc[i])
    elif df['coupon'].iloc[i] == 'Bar':
        Relative_coupon_Accepted_count.append(df['Bar'].iloc[i])
    elif df['coupon'].iloc[i] == 'Restaurant(20-50)':
        Relative_coupon_Accepted_count.append(df['Restaurant20To50'].iloc[i])

df['Relative_coupon_Accepted_count'] = Relative_coupon_Accepted_count

In [13]:
# dropping columns

df = df.drop(['Bar', 'CoffeeHouse', 'CarryAway',
       'RestaurantLessThan20', 'Restaurant20To50', 'age', 'income', 'education', 'occupation', 'Acceptance%', 'toCoupon_GEQ15min', 'toCoupon_GEQ25min', 'direction_same', 'direction_opp', 'toCoupon_GEQ5min'], axis=1)

In [14]:
categorical_features = ['destination', 'passanger', 'weather', 'temperature', 'coupon',
       'expiration', 'gender', 'maritalStatus', 'has_children',
       'Age', 'Income', 'Level_of_education', 'Occupation_with_(1)%',
       'Coupon_timeRequired_15to25', 'Relative_coupon_Accepted_count']

In [15]:
len(categorical_features)

15

In [16]:
# Applying one-hot encoding to the categorical columns
df_encoded = pd.get_dummies(df, columns=categorical_features, drop_first=True)

In [17]:
df = df_encoded.copy()

In [18]:
df

Unnamed: 0,Accept,destination_No Urgent Place,destination_Work,passanger_Friend(s),passanger_Kid(s),passanger_Partner,weather_Snowy,weather_Sunny,temperature_55,temperature_80,coupon_Carry out & Take away,coupon_Coffee House,coupon_Restaurant(20-50),coupon_Restaurant(<20),expiration_2h,gender_Male,maritalStatus_Married partner,maritalStatus_Single,maritalStatus_Unmarried partner,maritalStatus_Widowed,has_children_1,Age_older,Age_young,Income_Mid,Level_of_education_low_education,Level_of_education_mid_education,Occupation_with_(1)%_Low,Occupation_with_(1)%_Medium,Coupon_timeRequired_15to25_1,Coupon_timeRequired_15to25_2,Relative_coupon_Accepted_count_4~8,Relative_coupon_Accepted_count_gt8,Relative_coupon_Accepted_count_less1,Relative_coupon_Accepted_count_never
0,1,1,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,1,0,0,0,0,1,1,0,1,0,0,0,0,1,0,0,0
1,0,1,0,1,0,0,0,1,0,1,0,1,0,0,1,1,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,1,0
2,1,1,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,1
3,0,1,0,1,0,0,0,1,0,1,1,0,0,0,1,1,0,1,0,0,0,0,1,1,0,1,0,0,1,0,1,0,0,0
4,0,1,0,1,0,0,0,1,0,1,0,1,0,0,0,1,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11792,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,1,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0
11793,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0
11794,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0,1
11795,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,1,0,1,0,1,0,1,0,0,0,1


In [19]:
y = df['Accept'].astype(int)  # Target variable
X = df.drop(columns=['Accept']).astype(int)  # Features

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size =0.2)

In [21]:
models = []
models.append(("LR", LogisticRegression()))
models.append(("LDA", LinearDiscriminantAnalysis()))
models.append(("KNN", KNeighborsClassifier()))
models.append(("CART", DecisionTreeClassifier()))
models.append(("NB", GaussianNB()))
models.append(("SVM", SVC()))
models.append(("RF", RandomForestClassifier()))
models.append(("ExtraTrees", ExtraTreesClassifier()))
models.append(("AdaBoost", AdaBoostClassifier()))

results = []
names = []

for name, model in models:
    kfold = KFold(n_splits=10)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring="accuracy")
    results.append(cv_results)
    names.append(name)
    msg = "%s Accuracy: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

    # Calculating training accuracy
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    print(f"Training Accuracy for {name}: {train_accuracy}")
    print()

    # Calculating test accuracy
    y_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f"Test Accuracy for {name}: {test_accuracy}")

    # Calculating additional metrics
    print(f"Confusion Matrix for {name}:\n{confusion_matrix(y_test, y_pred)}")
    print()
    print(f"Classification Report for {name}:\n{classification_report(y_test, y_pred)}")
    print()
    print(f"Recall for {name}: {recall_score(y_test, y_pred)}")
    print()
    print(f"Precision for {name}: {precision_score(y_test, y_pred)}")
    print()
    print(f"ROC AUC for {name}: {roc_auc_score(y_test, y_pred)}")
    print("----------------------------------------------------------------------------")


LR Accuracy: 0.710821 (0.017273)
Training Accuracy for LR: 0.7132563314612694

Test Accuracy for LR: 0.7008474576271186
Confusion Matrix for LR:
[[ 597  418]
 [ 288 1057]]

Classification Report for LR:
              precision    recall  f1-score   support

           0       0.67      0.59      0.63      1015
           1       0.72      0.79      0.75      1345

    accuracy                           0.70      2360
   macro avg       0.70      0.69      0.69      2360
weighted avg       0.70      0.70      0.70      2360


Recall for LR: 0.7858736059479554

Precision for LR: 0.7166101694915255

ROC AUC for LR: 0.6870254729247166
----------------------------------------------------------------------------
LDA Accuracy: 0.711033 (0.017876)
Training Accuracy for LDA: 0.7121966726713999

Test Accuracy for LDA: 0.7025423728813559
Confusion Matrix for LDA:
[[ 587  428]
 [ 274 1071]]

Classification Report for LDA:
              precision    recall  f1-score   support

           0       0.

In [22]:
models = []
models.append(("LGBM", LGBMClassifier()))  # LightGBM

results = []
names = []

for name, model in models:
    kfold = KFold(n_splits=10)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring="accuracy")
    results.append(cv_results)
    names.append(name)
    msg = "%s Accuracy: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

    # Calculating training accuracy
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    print(f"Training Accuracy for {name}: {train_accuracy}")
    print()

    # Calculating test accuracy
    y_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f"Test Accuracy for {name}: {test_accuracy}")

    # Calculating additional metrics
    print(f"Confusion Matrix for {name}:\n{confusion_matrix(y_test, y_pred)}")
    print()
    print(f"Classification Report for {name}:\n{classification_report(y_test, y_pred)}")
    print()
    print(f"Recall for {name}: {recall_score(y_test, y_pred)}")
    print()
    print(f"Precision for {name}: {precision_score(y_test, y_pred)}")
    print()
    print(f"ROC AUC for {name}: {roc_auc_score(y_test, y_pred)}")
    print("----------------------------------------------------------------------------")


[LightGBM] [Info] Number of positive: 4802, number of negative: 3691
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66
[LightGBM] [Info] Number of data points in the train set: 8493, number of used features: 33
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.565407 -> initscore=0.263135
[LightGBM] [Info] Start training from score 0.263135
[LightGBM] [Info] Number of positive: 4812, number of negative: 3681
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66
[LightGBM] [Info] Number of data points in the train set: 8493, number of used features: 33
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.566584 -> initscore=0.267928
[LightGBM] [Info] Start training from score 0.267928
[LightGBM] [Info] Number of positive: 4806, number of negative: 3687
You can set `force_row_wise=true` to 

In [23]:
models = []
models.append(("CatBoost", CatBoostClassifier()))

results = []
names = []

for name, model in models:
    kfold = KFold(n_splits=10)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring="accuracy")
    results.append(cv_results)
    names.append(name)
    msg = "%s Accuracy: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

    # Calculating training accuracy
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    print(f"Training Accuracy for {name}: {train_accuracy}")
    print()

    # Calculating test accuracy
    y_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f"Test Accuracy for {name}: {test_accuracy}")

    # Calculating additional metrics
    print(f"Confusion Matrix for {name}:\n{confusion_matrix(y_test, y_pred)}")
    print()
    print(f"Classification Report for {name}:\n{classification_report(y_test, y_pred)}")
    print()
    print(f"Recall for {name}: {recall_score(y_test, y_pred)}")
    print()
    print(f"Precision for {name}: {precision_score(y_test, y_pred)}")
    print()
    print(f"ROC AUC for {name}: {roc_auc_score(y_test, y_pred)}")
    print("----------------------------------------------------------------------------")

Learning rate set to 0.025683
0:	learn: 0.6874753	total: 157ms	remaining: 2m 36s
1:	learn: 0.6821201	total: 162ms	remaining: 1m 20s
2:	learn: 0.6770195	total: 168ms	remaining: 55.9s
3:	learn: 0.6718327	total: 174ms	remaining: 43.2s
4:	learn: 0.6668881	total: 178ms	remaining: 35.4s
5:	learn: 0.6620822	total: 181ms	remaining: 30.1s
6:	learn: 0.6580903	total: 185ms	remaining: 26.2s
7:	learn: 0.6542852	total: 188ms	remaining: 23.4s
8:	learn: 0.6508611	total: 192ms	remaining: 21.1s
9:	learn: 0.6475478	total: 195ms	remaining: 19.3s
10:	learn: 0.6437427	total: 198ms	remaining: 17.8s
11:	learn: 0.6404825	total: 202ms	remaining: 16.6s
12:	learn: 0.6371896	total: 206ms	remaining: 15.6s
13:	learn: 0.6341710	total: 210ms	remaining: 14.8s
14:	learn: 0.6308806	total: 214ms	remaining: 14s
15:	learn: 0.6282072	total: 217ms	remaining: 13.3s
16:	learn: 0.6256837	total: 220ms	remaining: 12.7s
17:	learn: 0.6231269	total: 224ms	remaining: 12.2s
18:	learn: 0.6204756	total: 227ms	remaining: 11.7s
19:	learn: 

## Summary of the metrics for each model:

#### 1. Logistic Regression (LR):
   - Test Accuracy: 0.7136
   - F1-Score (1): 0.77
   - ROC AUC: 0.6926

#### 2. Linear Discriminant Analysis (LDA):
   - Test Accuracy: 0.7148
   - F1-Score (1): 0.77
   - ROC AUC: 0.6927

#### 3. K-Nearest Neighbors (KNN):
   - Test Accuracy: 0.6970
   - F1-Score (1): 0.75
   - ROC AUC: 0.6844

#### 4. Decision Tree (CART):
   - Test Accuracy: 0.6364
   - F1-Score (1): 0.68
   - ROC AUC: 0.6303

#### 5. Naive Bayes (NB):
   - Test Accuracy: 0.6864
   - F1-Score (1): 0.73
   - ROC AUC: 0.6756

#### 6. Support Vector Machine (SVM):
   - Test Accuracy: 0.7339
   - F1-Score (1): 0.79
   - ROC AUC: 0.7123

#### 7. Random Forest (RF):
   - Test Accuracy: 0.6953
   - F1-Score (1): 0.75
   - ROC AUC: 0.6814

#### 8. Extra Trees:
   - Test Accuracy: 0.6894
   - F1-Score (1): 0.74
   - ROC AUC: 0.6797

#### 9. AdaBoost:
   - Test Accuracy: 0.7089
   - F1-Score (1): 0.77
   - ROC AUC: 0.6873

#### 10. LightGBM (LGBM):
    - Test Accuracy: 0.7403
    - F1-Score (1): 0.79
    - ROC AUC: 0.7227

#### 11. CatBoost:
    - Test Accuracy: 0.7534
    - F1-Score (1): 0.80
    - ROC AUC: 0.7373

- the CatBoost model seems to perform the best in terms of Test Accuracy, F1-Score, and ROC AUC. It has the highest Test Accuracy (0.7534), the highest F1-Score (0.80), and the highest ROC AUC (0.7373) among all the models.

In [24]:
# Defining the parameter grid for Grid Search
param_grid = {
    'iterations': [100, 200, 300],  # Number of trees
    'learning_rate': [0.1, 0.01, 0.001],
    'depth': [3, 4, 5],  # Adjust the depth as needed
    'min_child_samples': [10, 20, 30],  # Adjust min_child_samples as needed
}

# Initializing the CatBoostClassifier
catboost_model = CatBoostClassifier(random_seed=42, verbose=0)

# Creating Grid Search
grid_search = GridSearchCV(estimator=catboost_model, param_grid=param_grid, scoring='accuracy', cv=10)

# Fitting the grid search to your training data
grid_search.fit(X_train, y_train)

# Getting the best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score (Accuracy):", best_score)

Best Parameters: {'depth': 5, 'iterations': 300, 'learning_rate': 0.1, 'min_child_samples': 10}
Best Score (Accuracy): 0.7387923054801661
