In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    recall_score,
    precision_score,
    roc_auc_score,
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier, 
)

In [2]:
data = pd.read_csv('C:/Users/Nikhil_Chamle/Desktop/P1/Recommendation/Data.csv')

In [3]:
df = data.copy()

In [4]:
pd.set_option('display.max_columns', 50)

In [5]:
df.head(7)

Unnamed: 0,destination,passanger,weather,temperature,coupon,expiration,gender,age,maritalStatus,has_children,education,occupation,income,car,Bar,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Accept(Y/N?)
0,No Urgent Place,Alone,Sunny,55,Restaurant(<20),1d,Female,21,Unmarried partner,1,Some college - no degree,Unemployed,$37500 - $49999,,never,never,,4~8,1~3,1,0,0,0,1,1
1,No Urgent Place,Friend(s),Sunny,80,Coffee House,2h,Female,21,Unmarried partner,1,Some college - no degree,Unemployed,$37500 - $49999,,never,never,,4~8,1~3,1,0,0,0,1,0
2,No Urgent Place,Friend(s),Sunny,80,Carry out & Take away,2h,Female,21,Unmarried partner,1,Some college - no degree,Unemployed,$37500 - $49999,,never,never,,4~8,1~3,1,1,0,0,1,1
3,No Urgent Place,Friend(s),Sunny,80,Coffee House,2h,Female,21,Unmarried partner,1,Some college - no degree,Unemployed,$37500 - $49999,,never,never,,4~8,1~3,1,1,0,0,1,0
4,No Urgent Place,Friend(s),Sunny,80,Coffee House,1d,Female,21,Unmarried partner,1,Some college - no degree,Unemployed,$37500 - $49999,,never,never,,4~8,1~3,1,1,0,0,1,0
5,No Urgent Place,Friend(s),Sunny,80,Restaurant(<20),2h,Female,21,Unmarried partner,1,Some college - no degree,Unemployed,$37500 - $49999,,never,never,,4~8,1~3,1,1,0,0,1,1
6,No Urgent Place,Friend(s),Sunny,55,Carry out & Take away,1d,Female,21,Unmarried partner,1,Some college - no degree,Unemployed,$37500 - $49999,,never,never,,4~8,1~3,1,1,0,0,1,1


In [6]:
# dropping irrelvant columns on basis of null values and dropping null rows, duplicated values
df = df.drop(columns='car')
df = df.dropna()
df = df.drop_duplicates(ignore_index=True)

In [7]:
# Renaming column

df = df.rename(columns = {'Accept(Y/N?)': 'Accept'})

In [8]:
# Creating new column 'Age' 
# (below21,21,26) -> young
# (31, 36, 41, 46) -> mid_age
# (above50) -> older

df.loc[df['age'] == 'below21', 'Age'] = 'young' 
df.loc[df['age'] == '21', 'Age'] = 'young' 
df.loc[df['age'] == '26', 'Age'] = 'young' 
df.loc[df['age'] == '31', 'Age'] = 'mid_age' 
df.loc[df['age'] == '36', 'Age'] = 'mid_age' 
df.loc[df['age'] == '41', 'Age'] = 'mid_age' 
df.loc[df['age'] == '46', 'Age'] = 'mid_age'
df.loc[df['age'] == '50plus', 'Age'] = 'older'

In [9]:
# Creating new column 'Income' 

# below 49999 - low income
# above 50000 - mid income

# we will not go for high income as High income individuals wont only be attracted by coupons and for these going to food places may vary on several other factors, can be for mid income level too but for now lets consider earlir

df.loc[df['income'] == 'Less than $12500','Income'] = 'Low'
df.loc[df['income'] == '$12500 - $24999', 'Income'] = 'Low'
df.loc[df['income'] == '$25000 - $37499','Income'] = 'Low'
df.loc[df['income'] == '$37500 - $49999', 'Income'] = 'Low'
df.loc[df['income'] == '$50000 - $62499', 'Income'] = 'Mid'
df.loc[df['income'] == '$62500 - $74999', 'Income'] = 'Mid'
df.loc[df['income'] == '$75000 - $87499','Income'] = 'Mid'
df.loc[df['income'] == '$87500 - $99999', 'Income'] = 'Mid'
df.loc[df['income'] == '$100000 or More', 'Income'] = 'Mid'

In [10]:
# we will create new feature based on education

# criteria High school(low_education)<college(low_education)<1st_degree(mid_education)<High_degree(high_education)

# low_education < mid_eduction < high_education

#Creating new feature by using logic statement for durantion required to travel
 
df.loc[df['education'] == 'Some High School', 'Level_of_education'] = 'low_education' 
df.loc[df['education'] == 'High School Graduate', 'Level_of_education'] = 'low_education' 
df.loc[df['education'] == 'Some college - no degree', 'Level_of_education'] = 'low_education' 
df.loc[df['education'] == 'Associates degree', 'Level_of_education'] = 'mid_education' 
df.loc[df['education'] == 'Bachelors degree', 'Level_of_education'] = 'mid_education' 
df.loc[df['education'] == 'Graduate degree (Masters or Doctorate)', 'Level_of_education'] = 'high_education' 

In [11]:
# creating new feature on basis of occupation and %acceptance of coupon

# criteria - =above 60% > High, 50-59.99% > Medium, below 49.99 > low acceptance

cross_tab = pd.crosstab(columns=[df['Accept']], index=[df['occupation']], margins=True, normalize='index')

# Calculate the acceptance percentage and categorize it
cross_tab['Acceptance%'] = cross_tab[1] * 100  # Calculate the acceptance percentage

# criteria for categorization
def categorize_acceptance(percentage):
    if percentage >= 60:
        return 'High'
    elif 50 <= percentage <= 59.99:
        return 'Medium'
    else:
        return 'Low'

# Applying the categorization function to the 'Acceptance%' column
cross_tab['Occupation_with_(1)%'] = cross_tab['Acceptance%'].apply(categorize_acceptance)

# Merging the 'Acceptance%' and 'Acceptance_Category' columns back into the 'df' DataFrame based on 'occupation'
df = df.merge(cross_tab[['Acceptance%', 'Occupation_with_(1)%']], left_on='occupation', right_index=True, how='left')

In [12]:
#Creating new feature by time required

df['Coupon_timeRequired_15to25'] = 0 
df.loc[(df['toCoupon_GEQ15min'] == 1) & (df['toCoupon_GEQ25min'] == 0), 'Coupon_timeRequired_15to25'] = 1
df.loc[df['toCoupon_GEQ15min'] == 0, 'Coupon_timeRequired_15to25'] = 0
df.loc[df['toCoupon_GEQ25min'] == 1, 'Coupon_timeRequired_15to25'] = 2

In [13]:
# we want to know if we offer a coupon to an individual does he accept it if he is the regular visitor to that category of place

Relative_coupon_Accepted_count = []

for i in range(df.shape[0]):
    if df['coupon'].iloc[i] == 'Restaurant(<20)':
        Relative_coupon_Accepted_count.append(df['RestaurantLessThan20'].iloc[i])
    elif df['coupon'].iloc[i] == 'Coffee House':
        Relative_coupon_Accepted_count.append(df['CoffeeHouse'].iloc[i])
    elif df['coupon'].iloc[i] == 'Carry out & Take away':
        Relative_coupon_Accepted_count.append(df['CarryAway'].iloc[i])
    elif df['coupon'].iloc[i] == 'Bar':
        Relative_coupon_Accepted_count.append(df['Bar'].iloc[i])
    elif df['coupon'].iloc[i] == 'Restaurant(20-50)':
        Relative_coupon_Accepted_count.append(df['Restaurant20To50'].iloc[i])

df['Relative_coupon_Accepted_count'] = Relative_coupon_Accepted_count

In [14]:
# dropping columns

df = df.drop(['Bar', 'CoffeeHouse', 'CarryAway',
       'RestaurantLessThan20', 'Restaurant20To50', 'age', 'income', 'education', 'occupation', 'Acceptance%', 'toCoupon_GEQ15min', 'toCoupon_GEQ25min', 'direction_same', 'direction_opp', 'toCoupon_GEQ5min'], axis=1)

In [15]:
categorical_features = ['destination', 'passanger', 'weather', 'temperature', 'coupon',
       'expiration', 'gender', 'maritalStatus', 'has_children',
       'Age', 'Income', 'Level_of_education', 'Occupation_with_(1)%',
       'Coupon_timeRequired_15to25', 'Relative_coupon_Accepted_count']

In [16]:
# Apply label encoding to each categorical column
for col in categorical_features:
    df[col] = label_encoder.fit_transform(df[col])

In [17]:
df.head(7)

Unnamed: 0,destination,passanger,weather,temperature,coupon,expiration,gender,maritalStatus,has_children,Accept,Age,Income,Level_of_education,Occupation_with_(1)%,Coupon_timeRequired_15to25,Relative_coupon_Accepted_count
0,1,0,2,1,4,0,1,2,0,1,2,1,2,0,0,1
1,1,1,2,2,2,1,1,2,0,0,2,1,2,0,0,3
2,1,1,2,2,0,0,1,2,0,1,2,1,2,0,0,4
3,1,1,2,2,1,1,1,2,0,0,2,1,2,0,1,1
4,1,1,2,2,2,0,1,2,0,0,2,1,2,0,0,3
5,1,1,2,2,2,1,1,2,0,0,2,1,2,0,1,3
6,1,1,2,2,2,0,1,2,0,0,2,1,2,0,1,3


In [18]:
df['Accept'].value_counts()

1    6686
0    5111
Name: Accept, dtype: int64

In [19]:
df.isnull().sum()

destination                       0
passanger                         0
weather                           0
temperature                       0
coupon                            0
expiration                        0
gender                            0
maritalStatus                     0
has_children                      0
Accept                            0
Age                               0
Income                            0
Level_of_education                0
Occupation_with_(1)%              0
Coupon_timeRequired_15to25        0
Relative_coupon_Accepted_count    0
dtype: int64

In [20]:
y = df['Accept'].astype(int)  # Target variable
X = df.drop(columns=['Accept']).astype(int)  # Features

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size =0.2)

In [22]:
models = []
models.append(("LR", LogisticRegression()))
models.append(("LDA", LinearDiscriminantAnalysis()))
models.append(("KNN", KNeighborsClassifier()))
models.append(("CART", DecisionTreeClassifier()))
models.append(("NB", GaussianNB()))
models.append(("SVM", SVC()))
models.append(("RF", RandomForestClassifier()))

results = []
names = []

for name, model in models:
    kfold = KFold(n_splits=10)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring="accuracy")
    results.append(cv_results)
    names.append(name)
    msg = "%s Accuracy: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

    # Calculating training accuracy
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    print(f"Training Accuracy for {name}: {train_accuracy}")
    print()

    # Calculating test accuracy
    y_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f"Test Accuracy for {name}: {test_accuracy}")

    # Calculating additional metrics
    print(f"Confusion Matrix for {name}:\n{confusion_matrix(y_test, y_pred)}")
    print()
    print(f"Classification Report for {name}:\n{classification_report(y_test, y_pred)}")
    print()
    print(f"Recall for {name}: {recall_score(y_test, y_pred)}")
    print()
    print(f"Precision for {name}: {precision_score(y_test, y_pred)}")
    print()
    print(f"ROC AUC for {name}: {roc_auc_score(y_test, y_pred)}")
    print("----------------------------------------------------------------------------")


LR Accuracy: 0.675636 (0.013883)
Training Accuracy for LR: 0.6776517961216488

Test Accuracy for LR: 0.6707627118644067
Confusion Matrix for LR:
[[ 570  476]
 [ 301 1013]]

Classification Report for LR:
              precision    recall  f1-score   support

           0       0.65      0.54      0.59      1046
           1       0.68      0.77      0.72      1314

    accuracy                           0.67      2360
   macro avg       0.67      0.66      0.66      2360
weighted avg       0.67      0.67      0.67      2360


Recall for LR: 0.7709284627092846

Precision for LR: 0.6803223640026864

ROC AUC for LR: 0.6579307705515831
----------------------------------------------------------------------------
LDA Accuracy: 0.675318 (0.013241)
Training Accuracy for LDA: 0.6790293525484794

Test Accuracy for LDA: 0.6711864406779661
Confusion Matrix for LDA:
[[ 576  470]
 [ 306 1008]]

Classification Report for LDA:
              precision    recall  f1-score   support

           0       0.