In [22]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/marketing-strategy-personalised-offer/sample.csv
/kaggle/input/marketing-strategy-personalised-offer/train_data.csv
/kaggle/input/marketing-strategy-personalised-offer/test_data.csv


In [36]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder, \
                                PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, f1_score
from sklearn.neighbors import KNeighborsClassifier

In [24]:
# Raw training data; the target column is split off into its own series so that
# `train_data` keeps only the feature columns.
train_data = pd.read_csv('../input/marketing-strategy-personalised-offer/train_data.csv')
y_train_all = train_data.pop('Offer Accepted')

# Test data from the competition files.
true_data = pd.read_csv('../input/marketing-strategy-personalised-offer/test_data.csv')

In [25]:
# Drop the unused columns and fill missing values in every remaining column
# with that column's most frequent value.

cols_to_drop = ["car","restuarant_opposite_direction_house","travelled_more_than_5mins_for_offer"]
all_col = [col for col in train_data.columns if col not in cols_to_drop]

transformer1 = ColumnTransformer(
    transformers=[
        ("drop_cols", "drop", cols_to_drop),
        ("imputer", SimpleImputer(strategy="most_frequent"), all_col),
    ],
    remainder="passthrough",
)

# Fit on the training features, then apply the same fitted imputer to the test data.
train_imputed = transformer1.fit_transform(train_data)
true_imputed = transformer1.transform(true_data)

train_data1 = pd.DataFrame(train_imputed, columns=all_col)
true_data1 = pd.DataFrame(true_imputed, columns=all_col)

In [26]:
# Column groups

# Columns used as-is (no encoding step is applied to them).
no_enc = [
    'travelled_more_than_15mins_for_offer',
    'Prefer western over chinese',
    'travelled_more_than_25mins_for_offer',
    'restuarant_same_direction_house',
    'Cooks regularly',
    'is foodie',
    'has Children',
    'Prefer home food',
    'visit restaurant with rating (avg)',
]

# Columns that get an ordinal encoding. The order of this list is also used
# as the column order of the ordinal-encoded frames.
ord_enc = [
    'offer expiration',
    'income_range',
    'no_visited_Cold drinks',
    'no_visited_bars',
    'no_Take-aways',
    'Restaur_spend_less_than20',
    'Restaur_spend_greater_than20',
    'age',
    'restaurant type',
    'Qualification',
    'Customer type',
    'Marital Status',
    'temperature',
    'Travel Time',
]

# Category orders for the ordinal encoders (position in the array = encoded value)

income_list = np.array([
    'Less than ₹12500',
    '₹12500 - ₹24999',
    '₹25000 - ₹37499',
    '₹37500 - ₹49999',
    '₹50000 - ₹62499',
    '₹62500 - ₹74999',
    '₹75000 - ₹87499',
    '₹87500 - ₹99999',
    '₹100000 or More',
])
no_list = np.array(['never', 'less1', '1~3', '4~8', 'gt8'])
offer_list = np.array(['10hours', '2days'])
age_list = np.array(['below21', '21', '26', '31', '36', '41', '46', '50plus'])
resto_list = np.array([
    'Cold drinks',
    'Take-away restaurant',
    'Restaurant with pub',
    '2 star restaurant',
    '4 star restaurant',
])
quali_list = np.array([
    'Some High School',
    'High School Graduate',
    'Some college - no degree',
    'Associates degree',
    'Bachelors degree',
    'Graduate degree (Masters or Doctorate)',
])
custo_list = np.array(['Individual', 'With Colleagues', 'With Kids', 'With Family'])
marital_list = np.array(['Single', 'Unmarried partner', 'Married partner', 'Divorced', 'Widowed'])
temp_list = np.array([40, 67, 89])
travel_list = np.array([7, 10, 14, 18, 22])

In [27]:
# Ordinal encoding of both the test and the training data.
# Every encoder receives its category order explicitly, so the integer codes are
# the positions in the category arrays rather than an order inferred from the data.

income_list_oe = OrdinalEncoder(categories=[income_list],dtype=np.int64)
# The same category order is repeated once per column handled by this encoder (five columns).
no_list_oe = OrdinalEncoder(categories=[no_list]*5,dtype=np.int64)
offer_list_oe = OrdinalEncoder(categories=[offer_list],dtype=np.int64)
age_list_oe = OrdinalEncoder(categories=[age_list],dtype=np.int64)
resto_list_oe = OrdinalEncoder(categories=[resto_list],dtype=np.int64)
quali_list_oe = OrdinalEncoder(categories=[quali_list],dtype=np.int64)
custo_list_oe = OrdinalEncoder(categories=[custo_list],dtype=np.int64)
marital_list_oe = OrdinalEncoder(categories=[marital_list],dtype=np.int64)
temp_list_oe = OrdinalEncoder(categories=[temp_list],dtype=np.int64)
travel_list_oe = OrdinalEncoder(categories=[travel_list],dtype=np.int64)
# NOTE(review): newer scikit-learn releases renamed `sparse` to `sparse_output`;
# confirm the installed version still accepts `sparse` before upgrading.
one_hot = OneHotEncoder(sparse=False,drop='first', dtype=np.int64)

# The transformer order below must stay in sync with `ord_enc`, which supplies
# the column names of the encoded frames.
transformer2 = ColumnTransformer(
    [
        ('offer_list_oe',offer_list_oe,['offer expiration']),
        ('income_list_oe',income_list_oe,['income_range']),
        ('no_list_oe',no_list_oe,['no_visited_Cold drinks','no_visited_bars','no_Take-aways','Restaur_spend_less_than20','Restaur_spend_greater_than20']),
        ('age_list_oe',age_list_oe,['age']),
        ('resto_list_oe',resto_list_oe,['restaurant type']),
        ('quali_list_oe', quali_list_oe,['Qualification']),
        ('custo_list_oe',custo_list_oe,['Customer type']),
        ('marital_list_oe',marital_list_oe,['Marital Status']),
        ('temp_list_oe',temp_list_oe,['temperature']),
        ('travel_list_oe',travel_list_oe,['Travel Time'])
    ],
    remainder="drop"
)

# Fit on the training data only ...
ord_enc_data = pd.DataFrame(transformer2.fit_transform(train_data1), columns=ord_enc)
# ... and only *transform* the test data with the already fitted transformer,
# so nothing is ever re-fitted on test rows.
true_ord_enc_data = pd.DataFrame(transformer2.transform(true_data1), columns=ord_enc)

In [28]:
# One-hot encode the nominal columns of both the training and the test data

transformer3 = ColumnTransformer(
    transformers=[
        ('one_hot1', one_hot, ['Job/Job Industry']),
        ('one_hot2', one_hot, ['Climate']),
        ('one_hot3', one_hot, ['drop location']),
        ('one_hot4', one_hot, ['gender']),
    ],
    remainder="drop",
)

# Fit on the training data, then read the generated feature names once.
train_one_hot_values = transformer3.fit_transform(train_data1)

# Generated names look like "<transformer>__<feature>"; keep the part after the prefix.
one_hot_list = [full_name.split("__")[1] for full_name in transformer3.get_feature_names_out()]

one_hot_data = pd.DataFrame(train_one_hot_values, columns=one_hot_list)
true_one_hot_data = pd.DataFrame(transformer3.transform(true_data1), columns=one_hot_list)

In [29]:
def _combine_features(imputed_frame, one_hot_frame, ordinal_frame):
    """Return one frame holding the un-encoded, one-hot and ordinal feature columns.

    The `no_enc` columns of `imputed_frame` are cast to int64 first; the one-hot
    columns (`one_hot_list`) and the ordinal columns (`ord_enc`) are then
    attached to that frame, in this order.
    """
    combined = imputed_frame[no_enc].astype('int64')
    combined[one_hot_list] = one_hot_frame
    combined[ord_enc] = ordinal_frame
    return combined


ss = StandardScaler()

# final full training data (the scaler is fitted here, on all training rows)
train_features = _combine_features(train_data1, one_hot_data, ord_enc_data)
X_train_full = pd.DataFrame(ss.fit_transform(train_features), columns=train_features.columns)

# final full test data (scaled with the statistics learned from the training data)
true_features = _combine_features(true_data1, true_one_hot_data, true_ord_enc_data)
X_true = pd.DataFrame(ss.transform(true_features), columns=true_features.columns)

# final full label, encoded as integers
le = LabelEncoder()
y_train_full = le.fit_transform(y_train_all)

In [30]:
# Hold out 25% of the labelled rows for evaluation; the seed is fixed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.25,
    random_state=32,
)

### Model Building

In [35]:
# Weights = inverse of distance

model = KNeighborsClassifier(weights="distance")
model.fit(X_train, y_train)

# Predict once and reuse the result for both reports instead of running the
# neighbour search over X_test twice.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print(f1_score(y_test, y_pred, average='micro'))

              precision    recall  f1-score   support

           0       0.48      0.41      0.44      1327
           1       0.60      0.68      0.64      1768

    accuracy                           0.56      3095
   macro avg       0.54      0.54      0.54      3095
weighted avg       0.55      0.56      0.55      3095

0.5599353796445881


In [38]:
# Metric = Manhattan distance
# Weight = inverse of distance

model = KNeighborsClassifier(weights="distance", p=1)
model.fit(X_train, y_train)

# Predict once and reuse the result for both reports instead of running the
# neighbour search over X_test twice.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print(f1_score(y_test, y_pred, average='micro'))

              precision    recall  f1-score   support

           0       0.51      0.44      0.47      1327
           1       0.62      0.68      0.65      1768

    accuracy                           0.58      3095
   macro avg       0.56      0.56      0.56      3095
weighted avg       0.57      0.58      0.57      3095

0.575767366720517


In [39]:
# Metric = Manhattan distance
# Weight = inverse of distance
# Algorithm = BallTree

model = KNeighborsClassifier(weights="distance", p=1, algorithm="ball_tree")
model.fit(X_train, y_train)

# Predict once and reuse the result for both reports instead of running the
# neighbour search over X_test twice.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print(f1_score(y_test, y_pred, average='micro'))

              precision    recall  f1-score   support

           0       0.51      0.44      0.47      1327
           1       0.62      0.68      0.65      1768

    accuracy                           0.58      3095
   macro avg       0.56      0.56      0.56      3095
weighted avg       0.57      0.58      0.57      3095

0.575767366720517


In [None]:
## Seems like changing the algorithm only affects the time taken
### Hyper-parameter tuning over `n_neighbors`, `leaf_size`, `weights` and `algorithm`

param_grid = {
    'n_neighbors': [5,7,9],
    "leaf_size": [1, 2, 3, 5, 10],
    'weights': ['uniform','distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}
model = KNeighborsClassifier()
search = GridSearchCV(model, param_grid=param_grid, cv=3, scoring="f1_micro", refit=True, verbose=3)
# Tune on the training split only. X_test / y_test are used afterwards to score
# the final model, so those rows must not take part in selecting the parameters;
# otherwise the reported hold-out score is optimistically biased.
search.fit(X_train, y_train)
search.best_params_


In [43]:
# final model
# Metric = estimator default (`p` is not overridden here, so this is not the
#          Manhattan setting used in the earlier p=1 experiments)
# Weight = inverse of distance
# NOTE(review): pass p=1 explicitly if Manhattan distance was intended for the
# final model; the score printed by this cell was produced without it.

kn_model = KNeighborsClassifier(algorithm='auto', leaf_size=1, 
                     metric_params=None, n_jobs=-1, n_neighbors=9,
                     weights='distance')

kn_model.fit(X_train, y_train)

# Predict once and reuse the result for both reports instead of running the
# neighbour search over X_test twice.
y_pred = kn_model.predict(X_test)
print(classification_report(y_test, y_pred))
print(f1_score(y_test, y_pred, average='micro'))

              precision    recall  f1-score   support

           0       0.51      0.39      0.44      1327
           1       0.61      0.72      0.66      1768

    accuracy                           0.58      3095
   macro avg       0.56      0.55      0.55      3095
weighted avg       0.57      0.58      0.57      3095

0.5773828756058158
