In [138]:
import pandas as pd
import sweetviz as sv
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import itertools
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

# Convert Data to Vector

## clean data

In [2]:
# load csv
df_user = pd.read_csv('../csv/dtt_users.csv')

df_country = pd.read_csv('../csv/dtt_country.csv')[['country_id','country_code','country_phonecode','country_name']]
id_to_countryCode = {i[1][0] : i[1][3] for i in df_country.iterrows()}
countryCode_to_id = {i[1][1] : i[1][0] for i in df_country.iterrows()}
phoneCode_to_id = {}
countryName_to_id = {i[1][3] : i[1][0] for i in df_country.iterrows()}
for i in df_country.iterrows():
    val = i[1][0]
    if len(i[1][2].split(',')) > 1:
        for j in i[1][2].split(','):
            phoneCode_to_id[j] = val
    else:
        phoneCode_to_id[i[1][2]] = val

order_col = [
    'customer_id','order_product_id','browser', 'platform', # ref
    'customer_firstname','customer_lastname','customer_email','customer_gender', #customer details
    'customer_nationality','customer_country','customer_phone_iso','customer_phone_code', # convert to country id
    'order_price_paid','order_state','order_payment_by',
    'order_quantity_infant','order_quantity_children','order_quantity_adult','order_quantity_elder', # create private, group, family, kid, adult
    'order_departure_date','date_create',# columns >> date in year, mount, year
    ]
df_order = pd.read_csv('../csv/dtt_order.csv')[order_col]
print(f'Total rows : {len(df_order)}')

Total rows : 27328


  df_order = pd.read_csv('../csv/dtt_order.csv')[order_col]


> clean dtt_order.csv
- drop na
- drop test
- remove special characters
- encode object columns
- add datetime column
- group country columns
- add sum of kids, adults columns
- add private, group, family columns
- add category 1-18 columns

In [3]:
# function
def has_numbers(inputString):
    return bool(re.search(r'\d', inputString))

def encode_gender(inputString):
    if 'Mr.' == inputString or 'mr.' in inputString:
        return 0
    elif 'Ms.' == inputString or 'ms.' in inputString:
        return 1
    elif 'Mrs.' == inputString or 'mrs.' in inputString:
        return 2
    else:
        return 3

def encode_phone_code(inputFloat):
    try:
        return phoneCode_to_id[str(int(inputFloat))]
    except:
        print(int(inputFloat))

def encode_order_payment_by(inputString):
    if 'creditcard' == inputString:
        return 1
    elif 'paypal' == inputString:
        return 2
    else:
        return 0

In [4]:

df_order_v1 = df_order[order_col]
# drop na
na_index_v1 = df_order_v1[df_order_v1['customer_country'].isna()].index
df_order_v1 = df_order_v1.drop(na_index_v1)

# drop test
test_index_v1 = df_order_v1[df_order_v1['customer_email'].map(lambda x: 'test' in str(x) or '@mail.com' in str(x) or '360' in str(x))].index
df_order_v1 = df_order_v1.drop(test_index_v1)

test_index_v2 = df_order_v1[df_order_v1['customer_email'].map(lambda x: 'mitkung' in str(x) or 'suphattra' in str(x))].index
df_order_v1 = df_order_v1.drop(test_index_v2)

test_index_v3 = df_order_v1[df_order_v1['customer_firstname'].map(lambda x: 'admin' in str(x) or 'test' in str(x) or 'Thanadol' in str(x) or 'Test' in str(x))].index
df_order_v1 = df_order_v1.drop(test_index_v3)

test_index_v3 = df_order_v1[df_order_v1['customer_firstname'].map(lambda x: ('asd' in str(x) or 'dsd' in str(x) or 'xx' in str(x)) and not 'Jasdeep' in str(x))].index
df_order_v1 = df_order_v1.drop(test_index_v3)

# clean
df_order_v1.loc[3627, 'customer_firstname'] = 'Jennyhan'
df_order_v1['customer_firstname'] = df_order_v1['customer_firstname'].map(lambda x: re.sub('[0-9#$;:]+', '', x))
df_order_v1['customer_lastname'] = df_order_v1['customer_lastname'].map(lambda x: re.sub('[0-9#$;:]+', '', x))
df_order_v1['customer_nationality'] = df_order_v1['customer_nationality'].map(lambda x: re.sub('ไทย|Thailand', 'thai', x))
df_order_v1['customer_nationality'] = df_order_v1['customer_nationality'].map(lambda x : x.lower())

# encode
df_order_v1['customer_gender'] = df_order_v1['customer_gender'].map(lambda x : encode_gender(str(x)))
df_order_v1['customer_country'] = df_order_v1['customer_country'].map(lambda x : countryName_to_id[x] if x != '' else x)

df_order_v1.fillna(value={'customer_phone_iso': -1, 'customer_phone_code':-1, 'order_payment_by': 0}, inplace=True)
df_order_v1['customer_phone_iso'] = df_order_v1['customer_phone_iso'].map(lambda x : -1 if x == -1 else countryCode_to_id[x])
df_order_v1['customer_phone_code'] = df_order_v1['customer_phone_code'].map(lambda x : -1 if x == -1 else encode_phone_code(x))
df_order_v1['order_payment_by'] = df_order_v1['order_payment_by'].map(lambda x : encode_order_payment_by(str(x)))

df_order_v1['customer_phone_iso'] = np.where(df_order_v1['customer_phone_iso']==-1, df_order_v1['customer_country'], df_order_v1['customer_phone_iso'])
df_order_v1['customer_phone_code'] = np.where(df_order_v1['customer_phone_code']==-1, df_order_v1['customer_country'], df_order_v1['customer_phone_iso'])

enc = OrdinalEncoder(dtype='int8', encoded_missing_value=-1)
cat_cols = ['browser', 'platform', 'order_state']
df_order_v1[cat_cols] = enc.fit_transform(df_order_v1[cat_cols])

# change dtype to int
int_cols = ['customer_phone_iso', 'customer_phone_code','order_price_paid', 'customer_id']
df_order_v1['customer_phone_iso'] = np.where(df_order_v1['customer_phone_iso']==-1, df_order_v1['customer_country'], df_order_v1['customer_phone_iso'])
df_order_v1['customer_phone_code'] = np.where(df_order_v1['customer_phone_code']==-1, df_order_v1['customer_country'], df_order_v1['customer_phone_iso'])
df_order_v1[int_cols] = df_order_v1[int_cols].astype('int64')

# convert to datetime
df_order_v1[['order_departure_date','date_create']] = df_order_v1[['order_departure_date','date_create']].apply(pd.to_datetime)

df_order_v1['booked_days'] =  df_order_v1['order_departure_date'] - df_order_v1['date_create']
df_order_v1['booked_days'] = df_order_v1['booked_days'].map(lambda x: x.days if x.days >= 0 else 0)

df_order_v1['departure_year'] = df_order_v1['order_departure_date'].map(lambda x:x.year)
df_order_v1['departure_month'] = df_order_v1['order_departure_date'].map(lambda x:x.month)
df_order_v1['departure_day'] = df_order_v1['order_departure_date'].map(lambda x:x.day)

df_order_v1['departure_DayofYear'] = df_order_v1['order_departure_date'].map(lambda x:x.day_of_year)
df_order_v1['departure_DayofWeek'] = df_order_v1['order_departure_date'].map(lambda x:x.day_of_week)

# Majority Vote for country code
df_order_v1['country_code'] = df_order_v1[['customer_country','customer_phone_iso','customer_phone_code']].mode(axis=1)

# sum of kid  and adult
df_order_v1['sum_kids'] = df_order_v1['order_quantity_infant'] + df_order_v1['order_quantity_children']
df_order_v1['sum_adults'] = df_order_v1['order_quantity_adult'] + df_order_v1['order_quantity_elder']

df_order_v1['private'] = np.where((df_order_v1['sum_kids'] == 0) & (df_order_v1['sum_adults'] <= 2), 1, 0)
df_order_v1['group'] = np.where((df_order_v1['sum_kids'] == 0) & (df_order_v1['sum_adults'] > 2), 1, 0)
df_order_v1['family'] = np.where((df_order_v1['sum_kids'] > 0) & (df_order_v1['sum_adults'] > 0), 1, 0)

In [5]:
enc.categories_

[array(['Android Browser', 'AppleWebKit', 'Chrome', 'Edge', 'Firefox',
        'MSIE', 'Opera Next', 'Safari', 'SamsungBrowser', 'Vivaldi', nan],
       dtype=object),
 array(['Android', 'Chrome OS', 'Linux', 'Macintosh', 'Windows', 'iPad',
        'iPhone', nan], dtype=object),
 array(['cancelled', 'detail_success', 'payment_success'], dtype=object)]

In [6]:
product_cat_df = pd.read_csv('../csv/dtt_product_category.csv')

# Perform one-hot encoding
one_hot_encoded = pd.get_dummies(product_cat_df['category_id'], prefix='cat')

# Group by 'product_id' and sum the one-hot encoded columns
grouped_df = one_hot_encoded.groupby(product_cat_df['product_id']).sum()

# Reset the index to make 'product_id' a column again
grouped_df = grouped_df.reset_index()

# Drop duplicate rows based on 'product_id'
product_cat_df = product_cat_df.drop_duplicates(subset='product_id')

# Merge the grouped DataFrame back to the original DataFrame
product_cat_df = pd.merge(product_cat_df.drop(columns=['category_id']), grouped_df, on='product_id')
product_cat_df.rename(columns={'product_id': 'order_product_id'}, inplace=True)

In [7]:
df_order_v2 = pd.merge(df_order_v1[df_order_v1['order_state']==2], product_cat_df, on='order_product_id')
df_order_v2

Unnamed: 0,customer_id,order_product_id,browser,platform,customer_firstname,customer_lastname,customer_email,customer_gender,customer_nationality,customer_country,...,cat_9,cat_10,cat_11,cat_12,cat_13,cat_14,cat_15,cat_16,cat_17,cat_18
0,0,121,2,0,Julie,Staples,juliestaples40@gmail.com,2,british,77,...,0,0,0,0,0,0,0,0,0,0
1,0,121,2,0,Alana,Locandro,alana.lannie@gmail.com,1,australian,13,...,0,0,0,0,0,0,0,0,0,0
2,0,121,7,6,Harriet,Peace,Harrietpeace@icloud.com,1,british,77,...,0,0,0,0,0,0,0,0,0,0
3,221,121,8,0,Kain,Ingham,Kaino_01@hotmail.com,0,australian,13,...,0,0,0,0,0,0,0,0,0,0
4,0,121,2,4,Sandra,Nicholl,sandinicholl@hotmail.co.uk,2,british,77,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5512,0,202,2,4,Ali,Mohammad,fourtwenty.aggregation@gmail.com,0,kuwait,123,...,0,0,0,0,0,1,0,0,0,0
5513,0,202,2,4,Kirill,Kiryushin,kirill.kiryushin.o@gmail.com,0,russian,191,...,0,0,0,0,0,1,0,0,0,0
5514,0,202,2,4,Paitoon,Uthaisang,puthaisang@gmail.com,0,thai,218,...,0,0,0,0,0,1,0,0,0,0
5515,0,202,2,0,Alex,Schetinin,shatz.g@gmail.com,0,israel,103,...,0,0,0,0,0,1,0,0,0,0


In [8]:
df_order_v2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5517 entries, 0 to 5516
Data columns (total 51 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   customer_id              5517 non-null   int64         
 1   order_product_id         5517 non-null   int64         
 2   browser                  5517 non-null   int8          
 3   platform                 5517 non-null   int8          
 4   customer_firstname       5517 non-null   object        
 5   customer_lastname        5517 non-null   object        
 6   customer_email           5517 non-null   object        
 7   customer_gender          5517 non-null   int64         
 8   customer_nationality     5517 non-null   object        
 9   customer_country         5517 non-null   int64         
 10  customer_phone_iso       5517 non-null   int64         
 11  customer_phone_code      5517 non-null   int64         
 12  order_price_paid         5517 non-

In [9]:
df_order_v2.to_csv('./csv/1_clean_data.csv', index=False)

## create new user id

In [10]:
df_user_id = pd.read_csv('./csv/1_clean_data.csv')
df_user_id

Unnamed: 0,customer_id,order_product_id,browser,platform,customer_firstname,customer_lastname,customer_email,customer_gender,customer_nationality,customer_country,...,cat_9,cat_10,cat_11,cat_12,cat_13,cat_14,cat_15,cat_16,cat_17,cat_18
0,0,121,2,0,Julie,Staples,juliestaples40@gmail.com,2,british,77,...,0,0,0,0,0,0,0,0,0,0
1,0,121,2,0,Alana,Locandro,alana.lannie@gmail.com,1,australian,13,...,0,0,0,0,0,0,0,0,0,0
2,0,121,7,6,Harriet,Peace,Harrietpeace@icloud.com,1,british,77,...,0,0,0,0,0,0,0,0,0,0
3,221,121,8,0,Kain,Ingham,Kaino_01@hotmail.com,0,australian,13,...,0,0,0,0,0,0,0,0,0,0
4,0,121,2,4,Sandra,Nicholl,sandinicholl@hotmail.co.uk,2,british,77,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5512,0,202,2,4,Ali,Mohammad,fourtwenty.aggregation@gmail.com,0,kuwait,123,...,0,0,0,0,0,1,0,0,0,0
5513,0,202,2,4,Kirill,Kiryushin,kirill.kiryushin.o@gmail.com,0,russian,191,...,0,0,0,0,0,1,0,0,0,0
5514,0,202,2,4,Paitoon,Uthaisang,puthaisang@gmail.com,0,thai,218,...,0,0,0,0,0,1,0,0,0,0
5515,0,202,2,0,Alex,Schetinin,shatz.g@gmail.com,0,israel,103,...,0,0,0,0,0,1,0,0,0,0


In [11]:
# fix customer_id is 0 or 23
df_user_id['new_id'] = df_user_id['customer_id']
df_user_id['new_firstname'] = df_user_id['customer_firstname'].str.lower()
df_user_id['new_lastname'] = df_user_id['customer_lastname'].str.lower()

df_user_id['new_email'] = df_user_id['customer_email'].str.lower()
email_dict = df_user_id['new_email'].value_counts()
del email_dict['support@daytriptour.com']

# replace customer_id [0,23]
start_val = 10000
for i in email_dict.keys():
    x = df_user_id.query(f'new_email == "{str(i)}"')['customer_id'].unique().tolist()
    if 0 in x: x.remove(0)
    if 23 in x: x.remove(23)
    if len(x) > 0:
        val_id = max(x)
        idx = df_user_id.query(f'new_email == "{str(i)}"').index
        for j in idx:
            if df_user_id.loc[j]['customer_id'] in [0,23]:
                df_user_id.loc[j,['new_id']] = val_id
    else:
        idx = df_user_id.query(f'new_email == "{str(i)}"').index
        for j in idx:
            df_user_id.loc[j,['new_id']] = start_val
        start_val+=1

In [12]:
# support@daytriptour.com [0,23] has 421 items
df_user_id['new_id'].value_counts()

23       215
0        204
2704       9
2705       7
10000      6
        ... 
13336      1
376        1
13337      1
401        1
13339      1
Name: new_id, Length: 4595, dtype: int64

In [13]:
email_dict_daytriptour = df_user_id.query('new_email == "support@daytriptour.com"')['new_lastname'].value_counts()
# start_val =13340
for i in email_dict_daytriptour.keys():
    x = df_user_id.query(f'new_lastname == "{str(i)}" & new_email == "support@daytriptour.com"')['new_firstname'].unique().tolist()
    if len(x) == 1:
        idx = df_user_id.query(f'new_lastname == "{str(i)}" & new_email == "support@daytriptour.com"').index
        for j in idx:
            df_user_id.loc[j,['new_id']] = start_val
        start_val+=1
    else:
        for k in x:
            idx = df_user_id.query(f'new_lastname == "{str(i)}" & new_firstname == "{str(k)}" & new_email == "support@daytriptour.com"').index
            for j in idx:
                df_user_id.loc[j,['new_id']] = start_val
            start_val+=1
print(start_val)

13739


In [14]:
df_user_id['new_id'].value_counts()

2704     9
2705     7
10000    6
10001    5
888      5
        ..
2428     1
13607    1
13608    1
13609    1
13339    1
Name: new_id, Length: 4992, dtype: int64

In [15]:
df_user_id

Unnamed: 0,customer_id,order_product_id,browser,platform,customer_firstname,customer_lastname,customer_email,customer_gender,customer_nationality,customer_country,...,cat_13,cat_14,cat_15,cat_16,cat_17,cat_18,new_id,new_firstname,new_lastname,new_email
0,0,121,2,0,Julie,Staples,juliestaples40@gmail.com,2,british,77,...,0,0,0,0,0,0,10531,julie,staples,juliestaples40@gmail.com
1,0,121,2,0,Alana,Locandro,alana.lannie@gmail.com,1,australian,13,...,0,0,0,0,0,0,12774,alana,locandro,alana.lannie@gmail.com
2,0,121,7,6,Harriet,Peace,Harrietpeace@icloud.com,1,british,77,...,0,0,0,0,0,0,12285,harriet,peace,harrietpeace@icloud.com
3,221,121,8,0,Kain,Ingham,Kaino_01@hotmail.com,0,australian,13,...,0,0,0,0,0,0,221,kain,ingham,kaino_01@hotmail.com
4,0,121,2,4,Sandra,Nicholl,sandinicholl@hotmail.co.uk,2,british,77,...,0,0,0,0,0,0,12286,sandra,nicholl,sandinicholl@hotmail.co.uk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5512,0,202,2,4,Ali,Mohammad,fourtwenty.aggregation@gmail.com,0,kuwait,123,...,0,1,0,0,0,0,11192,ali,mohammad,fourtwenty.aggregation@gmail.com
5513,0,202,2,4,Kirill,Kiryushin,kirill.kiryushin.o@gmail.com,0,russian,191,...,0,1,0,0,0,0,11193,kirill,kiryushin,kirill.kiryushin.o@gmail.com
5514,0,202,2,4,Paitoon,Uthaisang,puthaisang@gmail.com,0,thai,218,...,0,1,0,0,0,0,11194,paitoon,uthaisang,puthaisang@gmail.com
5515,0,202,2,0,Alex,Schetinin,shatz.g@gmail.com,0,israel,103,...,0,1,0,0,0,0,11195,alex,schetinin,shatz.g@gmail.com


In [16]:
df_user_id[['customer_id', 'new_id', 'order_product_id', 'browser', 'platform',
       'customer_firstname', 'customer_lastname', 'customer_email',
       'customer_gender', 'customer_nationality', 'customer_country',
       'customer_phone_iso', 'customer_phone_code', 'order_price_paid',
       'order_state', 'order_payment_by', 'order_quantity_infant',
       'order_quantity_children', 'order_quantity_adult',
       'order_quantity_elder', 'order_departure_date', 'date_create',
       'booked_days', 'departure_year', 'departure_month', 'departure_day',
       'departure_DayofYear', 'departure_DayofWeek', 'country_code',
       'sum_kids', 'sum_adults', 'private', 'group', 'family', 'cat_1',
       'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9',
       'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16',
       'cat_17', 'cat_18']].to_csv('./csv/2_new_user_id.csv', index=False)

## convert data to vector

In [17]:
use_cols = [
    'new_id', 'order_product_id', 'browser', 'platform', 'customer_gender','country_code', # group matrix
    'booked_days', 'order_price_paid', 
    'sum_kids', 'sum_adults', # normal and mean
    'private', 'group', 'family', # softmax
    'cat_1', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9', # softmax
    'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16','cat_17', 'cat_18' # softmax
    ]

cat_cols = ['cat_1', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9',
    'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16','cat_17', 'cat_18']

df_full_matrix = pd.read_csv('./csv/2_new_user_id.csv')[use_cols]
df_full_matrix

Unnamed: 0,new_id,order_product_id,browser,platform,customer_gender,country_code,booked_days,order_price_paid,sum_kids,sum_adults,...,cat_9,cat_10,cat_11,cat_12,cat_13,cat_14,cat_15,cat_16,cat_17,cat_18
0,10531,121,2,0,2,77,60,4500,0,2,...,0,0,0,0,0,0,0,0,0,0
1,12774,121,2,0,1,13,53,4500,0,2,...,0,0,0,0,0,0,0,0,0,0
2,12285,121,7,6,1,77,75,4500,0,2,...,0,0,0,0,0,0,0,0,0,0
3,221,121,8,0,0,13,1,4500,0,2,...,0,0,0,0,0,0,0,0,0,0
4,12286,121,2,4,2,77,7,15750,0,7,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5512,11192,202,2,4,0,38,0,3580,0,2,...,0,0,0,0,0,1,0,0,0,0
5513,11193,202,2,4,0,218,1,3980,0,2,...,0,0,0,0,0,1,0,0,0,0
5514,11194,202,2,4,0,218,117,7020,0,3,...,0,0,0,0,0,1,0,0,0,0
5515,11195,202,2,0,0,103,0,3980,1,2,...,0,0,0,0,0,1,0,0,0,0


In [18]:
# constant values for replace 0 before softmax
c = 0.0
df_full_matrix.loc[:,'private':'cat_18'] = df_full_matrix.loc[:,'private':'cat_18'].applymap(lambda x: c if x == 0 else x)
df_full_matrix.loc[:,'private':'cat_18']

df_full_matrix['order_price_paid'] = df_full_matrix['order_price_paid']/5000

def softmax(x, axis = 1):
    return np.exp(x)/np.sum(np.exp(x),axis=axis, keepdims=True)

# softmax
df_full_matrix[cat_cols] = softmax(df_full_matrix[cat_cols].to_numpy())
df_full_matrix[['private', 'group', 'family']] = softmax(df_full_matrix[['private', 'group', 'family']].to_numpy())

# normalize
df_full_matrix[['sum_kids', 'sum_adults']] = df_full_matrix[['sum_kids', 'sum_adults']].div(df_full_matrix[['sum_kids', 'sum_adults']].sum(axis=1),axis=0)

  df_full_matrix.loc[:,'private':'cat_18'] = df_full_matrix.loc[:,'private':'cat_18'].applymap(lambda x: c if x == 0 else x)


In [19]:
df_full_matrix.to_csv('./csv/3_full_matrix.csv', index=False)
df_full_matrix

Unnamed: 0,new_id,order_product_id,browser,platform,customer_gender,country_code,booked_days,order_price_paid,sum_kids,sum_adults,...,cat_9,cat_10,cat_11,cat_12,cat_13,cat_14,cat_15,cat_16,cat_17,cat_18
0,10531,121,2,0,2,77,60,0.900,0.000000,1.000000,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714
1,12774,121,2,0,1,13,53,0.900,0.000000,1.000000,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714
2,12285,121,7,6,1,77,75,0.900,0.000000,1.000000,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714
3,221,121,8,0,0,13,1,0.900,0.000000,1.000000,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714
4,12286,121,2,4,2,77,7,3.150,0.000000,1.000000,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5512,11192,202,2,4,0,38,0,0.716,0.000000,1.000000,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.137856,0.050714,0.050714,0.050714,0.050714
5513,11193,202,2,4,0,218,1,0.796,0.000000,1.000000,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.137856,0.050714,0.050714,0.050714,0.050714
5514,11194,202,2,4,0,218,117,1.404,0.000000,1.000000,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.137856,0.050714,0.050714,0.050714,0.050714
5515,11195,202,2,0,0,103,0,0.796,0.333333,0.666667,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.137856,0.050714,0.050714,0.050714,0.050714


In [20]:
matrix_cols = [
    # 'new_id', 'order_product_id', 'browser', 'platform', 'customer_gender','country_code', # groupby mean
    'booked_days', 'order_price_paid', 
    'sum_kids', 'sum_adults', # normal and mean
    'private', 'group', 'family', # softmax
    'cat_1', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9', # softmax
    'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16','cat_17', 'cat_18' # softmax
    ]

In [21]:
# user_matrix
df_user_matrix = df_full_matrix.groupby('new_id')[matrix_cols].mean()
df_user_matrix.reset_index(inplace=True)
df_user_matrix.to_csv('./csv/3_user_matrix.csv', index=False)

# product_matrix
df_product_matrix = df_full_matrix.groupby('order_product_id')[matrix_cols].mean()
df_product_matrix.reset_index(inplace=True)
df_product_matrix.to_csv('./csv/3_product_matrix.csv', index=False)

# browser_matrix
df_browser_matrix = df_full_matrix.groupby('browser')[matrix_cols].mean()
df_browser_matrix.reset_index(inplace=True)
df_browser_matrix.to_csv('./csv/3_browser_matrix.csv', index=False)

# platform_matrix
df_platform_matrix = df_full_matrix.groupby('platform')[matrix_cols].mean()
df_platform_matrix.reset_index(inplace=True)
df_platform_matrix.to_csv('./csv/3_platform_matrix.csv', index=False)

# customer_gender_matrix
df_customer_gender_matrix = df_full_matrix.groupby('customer_gender')[matrix_cols].mean()
df_customer_gender_matrix.reset_index(inplace=True)
df_customer_gender_matrix.to_csv('./csv/3_gender_matrix.csv', index=False)

# CountryCode_matrix
df_customeCountryCode_matrix = df_full_matrix.groupby('country_code')[matrix_cols].mean()
df_customeCountryCode_matrix.reset_index(inplace=True)
df_customeCountryCode_matrix.to_csv('./csv/3_country_code_matrix.csv', index=False)

In [22]:
df_user_matrix

Unnamed: 0,new_id,booked_days,order_price_paid,sum_kids,sum_adults,private,group,family,cat_1,cat_2,...,cat_9,cat_10,cat_11,cat_12,cat_13,cat_14,cat_15,cat_16,cat_17,cat_18
0,8,0.4,0.27836,0.000000,1.000000,0.576117,0.211942,0.211942,0.048396,0.048396,...,0.079269,0.048396,0.064427,0.048396,0.065824,0.048396,0.048396,0.048396,0.083253,0.048396
1,24,0.0,0.06400,0.000000,1.000000,0.576117,0.211942,0.211942,0.050714,0.050714,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.137856,0.050714
2,130,0.0,0.45000,0.000000,1.000000,0.576117,0.211942,0.211942,0.050714,0.050714,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714
3,191,0.0,0.81600,0.333333,0.666667,0.211942,0.211942,0.576117,0.050714,0.050714,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.137856,0.050714,0.050714,0.050714,0.050714
4,194,0.0,0.07500,0.000000,1.000000,0.576117,0.211942,0.211942,0.050714,0.050714,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.137856
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4987,13734,0.0,0.21600,0.000000,1.000000,0.211942,0.576117,0.211942,0.050714,0.050714,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.137856,0.050714
4988,13735,0.0,0.17200,0.200000,0.800000,0.211942,0.211942,0.576117,0.050714,0.050714,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.137856,0.050714
4989,13736,11.0,0.14200,0.500000,0.500000,0.211942,0.211942,0.576117,0.050714,0.050714,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.137856,0.050714
4990,13737,0.0,0.07200,0.000000,1.000000,0.576117,0.211942,0.211942,0.050714,0.050714,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.137856,0.050714


# Create Recomendation Table
- user-item
- user-user
- item-item

In [74]:
df_product_matrix = pd.read_csv('./csv/3_product_matrix.csv')
df_product_matrix.rename(columns={'order_product_id': 'product_id'}, inplace=True)
# df_product_matrix

In [75]:
df_user_matrix = pd.read_csv('./csv/3_user_matrix.csv')
# df_user_matrix

In [25]:
df_product = pd.read_csv('../csv/dtt_product.csv')[['product_id','product_slug']]
test_product_id = [22, 23, 24, 25, 26, 27, 28, 32, 37, 72, 110, 116]
df_product = df_product[~df_product['product_id'].isin(test_product_id)]

# dict of product url
productID_to_url = {i[1][0] : 'www.daytriptour.com/trip/' + i[1][1] for i in df_product.iterrows()}

In [26]:
# Check which product IDs are not present in df_product_matrix
product_ids_not_present = ~df_product['product_id'].isin(df_product_matrix['product_id'])

# Get the list of product IDs not present in df_product_matrix
product_ids_list = df_product.loc[product_ids_not_present, 'product_id'].tolist()

# Print the list of product IDs not present
print("Product IDs not present in df_product_matrix:" , len(product_ids_list))
print(product_ids_list)

Product IDs not present in df_product_matrix: 71
[30, 31, 74, 36, 45, 40, 41, 44, 48, 53, 57, 59, 61, 62, 63, 64, 71, 66, 67, 77, 80, 85, 86, 88, 89, 92, 93, 94, 95, 96, 98, 99, 100, 101, 104, 106, 107, 108, 109, 111, 112, 113, 114, 126, 136, 140, 143, 144, 155, 153, 154, 161, 176, 165, 169, 168, 174, 179, 186, 183, 190, 191, 192, 194, 203, 196, 201, 205, 204, 206, 207]


In [189]:
def compute_similarity(df1, df2, index_col1, index_col2, data_cols):
    similarity_matrix = cosine_similarity(df1[data_cols], df2[data_cols])
    similarity_df = pd.DataFrame(similarity_matrix, columns=df2[index_col2], index=df1[index_col1])
    return similarity_df

def compute_distance(df1, df2, index_col1, index_col2, data_cols):
    distance_matrix = euclidean_distances(df1[data_cols], df2[data_cols])
    distance_df = pd.DataFrame(distance_matrix, columns=df2[index_col2], index=df1[index_col1])
    return distance_df

def recommend_products(similarity_df, customer_id, n=5):
    customer_scores = similarity_df.loc[customer_id].sort_values(ascending=False)
    top_n_products = customer_scores.index[:n].tolist()
    return top_n_products

def evaluation(similarity_df, df_users, top_n=5):
    evaluation_results = []
    for new_id in df_users['new_id']:
        recommended_products = recommend_products(similarity_df, new_id, n=top_n)
        has_product_in_top5 = any(product in recommended_products for product in df_users[df_users['new_id'] == new_id]['order_product_id'])
        evaluation_results.append(has_product_in_top5)
    percentage = (sum(evaluation_results) / len(evaluation_results)) * 100
    return percentage

## Cosine Similarity
> create user_item table

In [210]:
similarity_cols = [
    # 'booked_days', 
    'order_price_paid', 
    'sum_kids','sum_adults', 
    # 'private', 'group', 'family', 
    'cat_1', 'cat_2', 'cat_3','cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9', 'cat_10',
    'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16', 'cat_17','cat_18'
    ]

df_user_item_similarity = compute_similarity(df_user_matrix, df_product_matrix, 'new_id', 'product_id', similarity_cols)
df_user_item_similarity.to_csv('./csv/4_user_item_similarity.csv', index=True)

df_item_item_similarity = compute_similarity(df_product_matrix, df_product_matrix, 'product_id', 'product_id', similarity_cols)
df_item_item_similarity.to_csv('./csv/4_item_item_similarity.csv', index=True)

df_user_user_similarity = compute_similarity(df_user_matrix, df_user_matrix, 'new_id', 'new_id', similarity_cols)
df_user_user_similarity.to_csv('./csv/4_user_user_similarity.csv', index=True)

In [129]:
# Call the evaluation function
percentage_evaluation = evaluation(df_user_item_similarity, df_full_matrix, top_n=50)

# Print the percentage evaluation
print("Percentage of new_id values with at least one product in the top 5 recommendations:",  round(percentage_evaluation,2))

Percentage of new_id values with at least one product in the top 5 recommendations: 88.44


## Euclidean Distances

In [132]:
distance_cols = [
    'booked_days', 
    'order_price_paid', 
    'sum_kids','sum_adults', 
    'private', 'group', 'family', 
    'cat_1', 'cat_2', 'cat_3','cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9', 'cat_10',
    'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16', 'cat_17','cat_18'
    ]

df_user_item_distance = compute_distance(df_user_matrix, df_product_matrix, 'new_id', 'product_id', distance_cols)
df_user_item_distance.to_csv('./csv/4_user_item_distance.csv', index=True)

df_item_item_distance = compute_distance(df_product_matrix, df_product_matrix, 'product_id', 'product_id', distance_cols)
df_item_item_distance.to_csv('./csv/4_item_item_distance.csv', index=True)

df_user_user_distance = compute_distance(df_user_matrix, df_user_matrix, 'new_id', 'new_id', distance_cols)
df_user_user_distance.to_csv('./csv/4_user_user_distance.csv', index=True)

In [133]:
# Call the evaluation function
percentage_evaluation = evaluation(df_user_item_distance, df_full_matrix, top_n=50)

# Print the percentage evaluation
print("Percentage of new_id values with at least one product in the top 5 recommendations:", round(percentage_evaluation,2))

Percentage of new_id values with at least one product in the top 5 recommendations: 34.58


## select top 10 from top 50

In [217]:
df_user_item_similarity = pd.read_csv('./csv/4_user_item_similarity.csv', index_col = 'new_id')
df_user_item_distance = pd.read_csv('./csv/4_user_item_distance.csv', index_col = 'new_id')
df_full_matrix = pd.read_csv('./csv/3_full_matrix.csv')

In [241]:
top_50 = list(map(int,recommend_products(df_user_item_similarity, 8, n=50)))
history_list = df_full_matrix[df_full_matrix['new_id']== 8]['order_product_id'].tolist()
cleanHistory_list = list(itertools.filterfalse(lambda x: x in history_list, top_50))

customer_scores = df_user_item_distance.loc[8].sort_values(ascending=False)
distance_list = list(map(int,customer_scores.index.tolist()))

top_10 = list(itertools.filterfalse(lambda x: x not in cleanHistory_list, distance_list))

In [253]:
df_user_item_distance.loc[8]

134    0.49683
Name: 8, dtype: float64

In [247]:
len(cleanHistory_list)

46

In [249]:
cleanHistory_list[0]

56

In [190]:
feature_cols = [
    'booked_days', 
    'order_price_paid', 
    'sum_kids','sum_adults', 
    'private', 'group', 'family', 
    'cat_1', 'cat_2', 'cat_3','cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9', 'cat_10',
    'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16', 'cat_17','cat_18'
    ]

distance_df_customer_product = compute_distance(df_user_matrix, df_product_matrix, 'new_id', 'product_id', feature_cols)
# distance_df_customer_product

for i in [5,10,20,30,40,50,80,100]:
    top_n = i
    usID = 8
    recommended_products = recommend_products(distance_df_customer_product, usID, n=top_n)
    result = list(set(recommended_products) & set(df_full_matrix[df_full_matrix['new_id'] == usID]['order_product_id']) )
    print(i,'\t', result)

5 	 []
10 	 []
20 	 []
30 	 []
40 	 [182]
50 	 [52, 182, 46]
80 	 [182, 52, 181, 46]
100 	 [46, 52, 181, 182, 156]
