# Importing Modules and Datasets

In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Rows the final model predicts on when building the submission file.
true_data = pd.read_csv('../input/marketing-strategy-personalised-offer/test_data.csv')

# Labelled rows: read the raw frame, then separate the target column from the features.
train_data = pd.read_csv('../input/marketing-strategy-personalised-offer/train_data.csv')
y_train_all = train_data['Offer Accepted']
train_data = train_data.drop(columns=['Offer Accepted'])

# Data Pre-Processing

In [3]:
# Sanity check: shapes of the test features, the train features and the train labels.
true_data.shape, train_data.shape, y_train_all.shape

((5305, 30), (12379, 30), (12379,))

In [4]:
# Drop three unused columns and fill the missing values of every remaining
# column with that column's most frequent value.

dropped_cols = [
    "car",
    "restuarant_opposite_direction_house",
    "travelled_more_than_5mins_for_offer",
]
all_col = [col for col in train_data.columns if col not in dropped_cols]

transformer1 = ColumnTransformer(
    transformers=[
        ("drop_cols", "drop", dropped_cols),
        ("imputer", SimpleImputer(strategy="most_frequent"), all_col),
    ],
    remainder="passthrough",
)

# Fit on the training frame only, then reuse the fitted imputer on the test frame.
imputed_train = transformer1.fit_transform(train_data)
train_data1 = pd.DataFrame(imputed_train, columns=all_col)

imputed_true = transformer1.transform(true_data)
true_data1 = pd.DataFrame(imputed_true, columns=all_col)

In [5]:
# Column groups.
# `no_enc`  : columns used as-is (only cast to int64 later on).
# `ord_enc` : columns that are ordinal-encoded; this list also supplies the
#             column names of the encoded frame, so its order matters.

no_enc = [
    'travelled_more_than_15mins_for_offer',
    'Prefer western over chinese',
    'travelled_more_than_25mins_for_offer',
    'restuarant_same_direction_house',
    'Cooks regularly',
    'is foodie',
    'has Children',
    'Prefer home food',
    'visit restaurant with rating (avg)',
]
ord_enc = [
    'offer expiration',
    'income_range',
    'no_visited_Cold drinks',
    'no_visited_bars',
    'no_Take-aways',
    'Restaur_spend_less_than20',
    'Restaur_spend_greater_than20',
    'age',
    'restaurant type',
    'Qualification',
    'Customer type',
    'Marital Status',
    'temperature',
    'Travel Time',
]

# Category orderings handed to the ordinal encoders: the position of a value
# in its array is the integer code it is mapped to.

income_list = np.array([
    'Less than ₹12500',
    '₹12500 - ₹24999',
    '₹25000 - ₹37499',
    '₹37500 - ₹49999',
    '₹50000 - ₹62499',
    '₹62500 - ₹74999',
    '₹75000 - ₹87499',
    '₹87500 - ₹99999',
    '₹100000 or More',
])
no_list = np.array(['never', 'less1', '1~3', '4~8', 'gt8'])
offer_list = np.array(['10hours', '2days'])
age_list = np.array(['below21', '21', '26', '31', '36', '41', '46', '50plus'])
resto_list = np.array([
    'Cold drinks',
    'Take-away restaurant',
    'Restaurant with pub',
    '2 star restaurant',
    '4 star restaurant',
])
quali_list = np.array([
    'Some High School',
    'High School Graduate',
    'Some college - no degree',
    'Associates degree',
    'Bachelors degree',
    'Graduate degree (Masters or Doctorate)',
])
custo_list = np.array(['Individual', 'With Colleagues', 'With Kids', 'With Family'])
marital_list = np.array(['Single', 'Unmarried partner', 'Married partner', 'Divorced', 'Widowed'])
temp_list = np.array([40, 67, 89])
travel_list = np.array([7, 10, 14, 18, 22])

In [6]:
# Ordinal-encode the ordered categorical columns of both the train and test data.
# Every encoder receives an explicit category order, so the integer code of a
# value is its position in the corresponding *_list array.

income_list_oe = OrdinalEncoder(categories=[income_list], dtype=np.int64)
# Five columns share the same frequency scale, hence the list is repeated 5 times.
no_list_oe = OrdinalEncoder(categories=[no_list] * 5, dtype=np.int64)
offer_list_oe = OrdinalEncoder(categories=[offer_list], dtype=np.int64)
age_list_oe = OrdinalEncoder(categories=[age_list], dtype=np.int64)
resto_list_oe = OrdinalEncoder(categories=[resto_list], dtype=np.int64)
quali_list_oe = OrdinalEncoder(categories=[quali_list], dtype=np.int64)
custo_list_oe = OrdinalEncoder(categories=[custo_list], dtype=np.int64)
marital_list_oe = OrdinalEncoder(categories=[marital_list], dtype=np.int64)
temp_list_oe = OrdinalEncoder(categories=[temp_list], dtype=np.int64)
travel_list_oe = OrdinalEncoder(categories=[travel_list], dtype=np.int64)
# Shared one-hot encoder used by the one-hot cell further down.
# NOTE(review): `sparse` appears to have been renamed in later scikit-learn
# releases — confirm the keyword against the installed version before upgrading.
one_hot = OneHotEncoder(sparse=False, drop='first', dtype=np.int64)

# The transformer order below must stay in sync with `ord_enc`, because that
# list is used to label the output columns.
transformer2 = ColumnTransformer(
    [
        ('offer_list_oe', offer_list_oe, ['offer expiration']),
        ('income_list_oe', income_list_oe, ['income_range']),
        ('no_list_oe', no_list_oe, ['no_visited_Cold drinks', 'no_visited_bars', 'no_Take-aways', 'Restaur_spend_less_than20', 'Restaur_spend_greater_than20']),
        ('age_list_oe', age_list_oe, ['age']),
        ('resto_list_oe', resto_list_oe, ['restaurant type']),
        ('quali_list_oe', quali_list_oe, ['Qualification']),
        ('custo_list_oe', custo_list_oe, ['Customer type']),
        ('marital_list_oe', marital_list_oe, ['Marital Status']),
        ('temp_list_oe', temp_list_oe, ['temperature']),
        ('travel_list_oe', travel_list_oe, ['Travel Time'])
    ],
    remainder="drop"
)

# Fit on the training data only ...
ord_enc_data = pd.DataFrame(transformer2.fit_transform(train_data1), columns=ord_enc)
# ... and reuse the fitted encoders on the test data (no refit on test rows).
true_ord_enc_data = pd.DataFrame(transformer2.transform(true_data1), columns=ord_enc)

In [7]:
# One-hot encode the nominal columns of both the train and test data.

transformer3 = ColumnTransformer(
    [
        ('one_hot1', one_hot, ['Job/Job Industry']),
        ('one_hot2', one_hot, ['Climate']),
        ('one_hot3', one_hot, ['drop location']),
        ('one_hot4', one_hot, ['gender'])
    ],
    remainder="drop"
)

# Fit on the training data only.
one_hot_train = transformer3.fit_transform(train_data1)

# Strip the "<transformer name>__" prefix from each output name.
# Splitting only on the first "__" keeps the remainder of the name intact, and
# computing the list once guarantees train and test share identical labels.
one_hot_list = [name.split("__", 1)[1] for name in transformer3.get_feature_names_out()]

one_hot_data = pd.DataFrame(one_hot_train, columns=one_hot_list)
true_one_hot_data = pd.DataFrame(transformer3.transform(true_data1), columns=one_hot_list)

In [8]:
# Assemble the final feature matrices. Column order is the same for both:
# un-encoded columns (cast to int64), then one-hot columns, then ordinal columns.

# final full training data
X_train_full = pd.concat(
    [train_data1[no_enc].astype('int64'), one_hot_data, ord_enc_data],
    axis=1,
)

# final full test data
X_true = pd.concat(
    [true_data1[no_enc].astype('int64'), true_one_hot_data, true_ord_enc_data],
    axis=1,
)

# final full label: integer-encode the target; `le` is kept so predictions
# can be mapped back to the original label values later.
le = LabelEncoder()
y_train_full = le.fit_transform(y_train_all)

In [9]:
# Sanity check: train and test matrices must have the same number of columns,
# and the label vector must have one entry per training row.
X_train_full.shape, X_true.shape, y_train_full.shape

((12379, 52), (5305, 52), (12379,))

In [10]:
# Hold out 25% of the labelled rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.25,
    random_state=32,
)

# Model Building

### Dummy Model

In [11]:
# Baseline: always predict the most frequent class seen in the training split.
dummy_model = DummyClassifier(strategy="most_frequent")
dummy_model.fit(X_train, y_train)

# The baseline never predicts the other class, so that class's precision is
# undefined; zero_division=0 reports it as 0 without emitting warnings.
print(classification_report(y_test, dummy_model.predict(X_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1327
           1       0.57      1.00      0.73      1768

    accuracy                           0.57      3095
   macro avg       0.29      0.50      0.36      3095
weighted avg       0.33      0.57      0.42      3095



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Logistic Regression Model

In [12]:
# Logistic Regression, Lasso

# model = SGDClassifier(random_state=32, loss="log", penalty="l1")
# model.fit(X_train, y_train)
# print(classification_report(y_test, model.predict(X_test)))

In [13]:
# Logistic Regression, Ridge

# model = SGDClassifier(random_state=32, loss="log", penalty="l2")
# model.fit(X_train, y_train)
# print(classification_report(y_test, model.predict(X_test)))

In [14]:
# Logistic Regression, Elasticnet

# model = SGDClassifier(random_state=32, loss="log", penalty="elasticnet")
# model.fit(X_train, y_train)
# print(classification_report(y_test, model.predict(X_test)))

In [15]:
# Polynomial features (degree=2, including interaction) + Logistic Regression + Elasticnet

# polyn = PolynomialFeatures(2)

# transform_train_data_poly = polyn.fit_transform(X_train_full)
# feature_names = [i for i in polyn.get_feature_names_out()]
# transform_train_df_poly = pd.DataFrame(data=transform_train_data_poly, columns=feature_names)

# X_train_poly, X_test_poly, y_train_poly, y_test_poly = train_test_split(transform_train_df_poly, y_train_full, test_size=0.3, random_state=32)

# model = SGDClassifier(random_state=32, loss="log", penalty="elasticnet")
# model.fit(X_train_poly, y_train_poly)
# preds = model.predict(X_test_poly)
# print(classification_report(y_test_poly, preds))

In [16]:
# Logistic Regression hyper-parameter tuning

# param_grid = {
#     "alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
#     "learning_rate": ["optimal", "invscaling", "adaptive"],
#     "eta0": [1, 10, 100],
#     "penalty": ["l1", "l2", "elasticnet"]
# }
# model = SGDClassifier(random_state=32, loss="log")
# search = GridSearchCV(model, param_grid=param_grid, scoring="f1_macro", refit=True, cv=5, verbose=3)
# search.fit(X_train_full, y_train_full)
# search.best_params_

In [17]:
# Logistic-loss linear classifier trained with SGD, using fixed hyper-parameters.
lr_params = dict(
    random_state=32,
    loss="log",
    penalty="l2",
    alpha=0.01,
    eta0=1,
    learning_rate="optimal",
)
lr_model = SGDClassifier(**lr_params)
lr_model.fit(X_train, y_train)

# Evaluate on the held-out split.
lr_test_pred = lr_model.predict(X_test)
print(classification_report(y_test, lr_test_pred))

              precision    recall  f1-score   support

           0       0.47      0.68      0.55      1327
           1       0.63      0.42      0.51      1768

    accuracy                           0.53      3095
   macro avg       0.55      0.55      0.53      3095
weighted avg       0.56      0.53      0.53      3095



### KNeighborsClassifier Model

In [18]:
# Weights = inverse of distance

# model = KNeighborsClassifier(weights="distance")
# model.fit(X_train, y_train)
# print(classification_report(y_test, model.predict(X_test)))

In [19]:
# Metric = Manhattan distance
# Weight = inverse of distance

# model = KNeighborsClassifier(weights="distance", p=1)
# model.fit(X_train, y_train)
# print(classification_report(y_test, model.predict(X_test)))

In [20]:
# Metric = Manhattan distance
# Weight = inverse of distance
# Algorithm = BallTree

# model = KNeighborsClassifier(weights="distance", p=1, algorithm="ball_tree")
# model.fit(X_train, y_train)
# print(classification_report(y_test, model.predict(X_test)))

In [21]:
### Changing the algorithm seems to affect only the time taken
### HPT for the `leaf_size` parameter

# param_grid = {
#     "leaf_size": [5, 10, 30, 50, 100, 200]
# }
# model = KNeighborsClassifier(weights="distance", p=1)
# search = GridSearchCV(model, param_grid=param_grid, cv=5, scoring="f1_macro", refit=True, verbose=3)
# search.fit(X_train_full, y_train_full)
# search.best_params_

In [22]:
# k-nearest-neighbours classifier:
#   metric = Manhattan distance (p=1)
#   weight = inverse of distance

kn_model = KNeighborsClassifier(p=1, weights="distance", leaf_size=5)
kn_model.fit(X_train, y_train)

# Evaluate on the held-out split.
kn_test_pred = kn_model.predict(X_test)
print(classification_report(y_test, kn_test_pred))

              precision    recall  f1-score   support

           0       0.53      0.46      0.49      1327
           1       0.63      0.69      0.66      1768

    accuracy                           0.59      3095
   macro avg       0.58      0.57      0.57      3095
weighted avg       0.58      0.59      0.59      3095



### SVM Model

In [23]:
# RBF kernel

# model = SVC(random_state=32)
# model.fit(X_train, y_train)
# print(classification_report(y_test, model.predict(X_test)))

In [24]:
# sigmoid kernel

# model = SVC(random_state=32, kernel='sigmoid')
# model.fit(X_train, y_train)
# print(classification_report(y_test, model.predict(X_test)))

In [25]:
# Linear kernel

# model = SVC(random_state=32, kernel="linear")
# model.fit(X_train, y_train)
# print(classification_report(y_test, model.predict(X_test)))

In [26]:
# Poly kernel, degree = 2

# model = SVC(random_state=32, degree=2, kernel="poly")
# model.fit(X_train, y_train)
# print(classification_report(y_test, model.predict(X_test)))

In [27]:
# Poly kernel, degree = 3

# model = SVC(random_state=32, degree=3, kernel="poly")
# model.fit(X_train, y_train)
# print(classification_report(y_test, model.predict(X_test)))

In [28]:
# HPT for rbf kernel (which performed best)

# param_grid = {
#     "C": [0.01, 0.1, 1, 10, 100],
#     "gamma": ["scale", 0.01, 0.1],
# }
# model = SVC(random_state=32)
# search = GridSearchCV(model, param_grid=param_grid, cv=3, scoring="f1_macro", refit=True, verbose=3)
# search.fit(X_train_full, y_train_full)
# search.best_params_

In [29]:
# Tuned SVC (no kernel is passed, so the library default kernel is used).

sv_params = {"random_state": 32, "C": 10, "gamma": 'scale'}
sv_model = SVC(**sv_params)
sv_model.fit(X_train, y_train)

# Evaluate on the held-out split.
sv_test_pred = sv_model.predict(X_test)
print(classification_report(y_test, sv_test_pred))

              precision    recall  f1-score   support

           0       0.56      0.41      0.47      1327
           1       0.63      0.76      0.69      1768

    accuracy                           0.61      3095
   macro avg       0.60      0.58      0.58      3095
weighted avg       0.60      0.61      0.60      3095



### CART Models

In [30]:
# With PCA (seems to reduce the score)

# pca = PCA(random_state=32)
# reduced_transform_train_df = pca.fit_transform(X_train_full[ord_enc+one_hot_list])

# print(pca.explained_variance_ratio_[:5])

# reduced_transform_train_df = pd.DataFrame(data = reduced_transform_train_df[:,:4], columns = ["PC1", "PC2", "PC3", "PC4"])
# for col in no_enc:
#     reduced_transform_train_df[col] = X_train_full[col]
# reduced_X_train_com = reduced_transform_train_df.copy()
# reduced_y_train_com = y_train_full
# reduced_X_train, reduced_X_valid, reduced_y_train, reduced_y_valid = train_test_split(reduced_X_train_com, reduced_y_train_com, test_size=0.3, random_state=32)

# model = DecisionTreeClassifier(class_weight="balanced", random_state=32, max_depth=5, ccp_alpha=0.001)
# cross_val_score(model, reduced_X_train_com, reduced_y_train_com, cv=5, scoring="f1_macro").mean()

In [31]:
# Without PCA

# model = DecisionTreeClassifier(class_weight="balanced", random_state=1, ccp_alpha=0.001, max_depth=8)
# cross_val_score(model, X_train_full, y_train_full, cv=5, scoring="f1_macro").mean()

In [32]:
# Grid search without PCA

# param_grid = {
#     "ccp_alpha": [0.0001, 0.001, 0.01],
#     "max_depth": [5, 10, 15],
#     "min_samples_split": [5, 10, 15],
#     "min_samples_leaf": [5, 10, 15],
#     "max_features": ["auto", "log2", None]
# }
# model = DecisionTreeClassifier(class_weight="balanced", random_state=32)
# search = GridSearchCV(model, param_grid=param_grid, scoring="f1_macro", refit=True, cv=5, verbose=3)
# search.fit(X_train_full, y_train_full)
# search.best_params_

In [33]:
# Single decision tree with balanced class weights and fixed pruning/size limits.
tree_params = dict(
    class_weight="balanced",
    random_state=32,
    ccp_alpha=0.0001,
    max_depth=5,
    max_features=None,
    min_samples_leaf=15,
    min_samples_split=5,
)
tree_model = DecisionTreeClassifier(**tree_params)
tree_model.fit(X_train, y_train)

# Evaluate on the held-out split.
tree_test_pred = tree_model.predict(X_test)
print(classification_report(y_test, tree_test_pred))

              precision    recall  f1-score   support

           0       0.50      0.57      0.53      1327
           1       0.64      0.58      0.61      1768

    accuracy                           0.58      3095
   macro avg       0.57      0.57      0.57      3095
weighted avg       0.58      0.58      0.58      3095



### Bagging

In [34]:
# HPT

# param_grid = {
#     "min_samples_split": [2, 5, 10],
#     "max_samples": [0.5, 0.9],
#     "n_estimators": [100, 200],
#     "min_samples_leaf": [10, 20],
#     "max_depth": [10, 20],
#     "ccp_alpha": [0.0001, 0.001]
# }
# model = RandomForestClassifier(
#     max_features=None,
#     random_state=32,
#     class_weight="balanced",
#     min_samples_leaf=15,
#     n_estimators=100
# )
# search = GridSearchCV(model, param_grid, scoring="f1_macro", refit=True, cv=3, verbose=3)
# search.fit(X_train_full, y_train_full)
# search.best_params_

In [35]:
# Random forest with balanced class weights; every split may consider all
# features (max_features=None) and each tree is trained on half of the rows.
bg_params = dict(
    max_features=None,
    random_state=32,
    class_weight="balanced",
    n_estimators=200,
    max_samples=0.5,
    min_samples_split=2,
    min_samples_leaf=10,
    max_depth=20,
    ccp_alpha=0.0001,
)
bg_model = RandomForestClassifier(**bg_params)
bg_model.fit(X_train, y_train)

# Evaluate on the held-out split.
bg_test_pred = bg_model.predict(X_test)
print(classification_report(y_test, bg_test_pred))

              precision    recall  f1-score   support

           0       0.56      0.51      0.54      1327
           1       0.66      0.70      0.68      1768

    accuracy                           0.62      3095
   macro avg       0.61      0.61      0.61      3095
weighted avg       0.62      0.62      0.62      3095



### Boosting

### Comparison of the evaluation criterion (macro-average F1 score) for various models
- RandomForestClassifier: 0.61
- SVM: 0.58
- DecisionTree: 0.57
- KNeighbors Classifier: 0.57
- Logistic Regression: 0.53
- Dummy Classifier: 0.36

# Submission


In [36]:
# IMPORTANT: refit the chosen model on the whole labelled data set first.
bg_model.fit(X_train_full, y_train_full)

# Predict on the submission rows.
# IMPORTANT: double-check that `bg_model` is the model you intend to submit.
preds = bg_model.predict(X_true).astype(int).tolist()

# Map the integer predictions back to the original labels and write the file.
submission = pd.DataFrame(
    {
        "id": np.arange(len(preds)),
        "Offer Accepted": le.inverse_transform(preds),
    }
)
submission.to_csv("submission.csv", index=False)

In [37]:
# Sanity check: number of predictions written to the submission.
len(preds)

5305

In [38]:
# Preview the first rows of the submission frame.
submission.head()

Unnamed: 0,id,Offer Accepted
0,0,No
1,1,Yes
2,2,No
3,3,No
4,4,Yes
