In [115]:
import pandas as pd
import sweetviz as sv
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import itertools
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

In [116]:
product_cat_df = pd.read_csv('./raw_csv/dtt_product_category.csv')

# Perform one-hot encoding
one_hot_encoded = pd.get_dummies(product_cat_df['category_id'], prefix='cat')

# Group by 'product_id' and sum the one-hot encoded columns
grouped_df = one_hot_encoded.groupby(product_cat_df['product_id']).sum()

# Reset the index to make 'product_id' a column again
grouped_df = grouped_df.reset_index()

# Drop duplicate rows based on 'product_id'
product_cat_df = product_cat_df.drop_duplicates(subset='product_id')

# Merge the grouped DataFrame back to the original DataFrame
product_cat_df = pd.merge(product_cat_df.drop(columns=['category_id']), grouped_df, on='product_id')

In [117]:
df_product_matrix = pd.read_csv('./csv/3_product_matrix.csv')
df_product_matrix.rename(columns={'order_product_id': 'product_id'}, inplace=True)

In [118]:
df_product = pd.read_csv('./raw_csv/dtt_product.csv')
test_product_id = [
    22, 23, 24, 25, 26, 27, 28, 
    30, 31, 32, 36, 37, 
    40, 41, 45, 48, 53, 59,
    61, 62, 63, 66,
    71, 72, 74, 77,
    85, 86, 89, 93, 94, 95, 96, 99,
    100, 101, 104, 106, 107, 108,
    110, 112, 114, 116,
    126, 168, 201]
df_product = df_product[~df_product['product_id'].isin(test_product_id)]

product_ids_not_present = ~df_product['product_id'].isin(df_product_matrix['product_id'])
product_ids_list = df_product.loc[product_ids_not_present, 'product_id'].tolist()

print("Product IDs not present in df_product_matrix:" , len(product_ids_list))
product_ids_list.sort()
print(product_ids_list)

Product IDs not present in df_product_matrix: 36
[44, 57, 64, 67, 80, 88, 92, 98, 109, 111, 113, 136, 140, 143, 144, 153, 154, 155, 161, 165, 169, 174, 176, 179, 183, 186, 190, 191, 192, 194, 196, 203, 204, 205, 206, 207]


In [119]:
df_newID = product_cat_df[product_cat_df['product_id'].isin(product_ids_list)]

In [120]:
def softmax(x, axis = 1):
    return np.exp(x)/np.sum(np.exp(x),axis=axis, keepdims=True)

In [121]:
cat_cols = ['cat_1', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9',
    'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16','cat_17', 'cat_18']

# constant values for replace 0 before softmax
c = 0.0
df_newID.loc[:,cat_cols] = df_newID.loc[:,cat_cols].applymap(lambda x: c if x == 0 else x)
df_newID[cat_cols] = softmax(df_newID[cat_cols].to_numpy())
df_newID.sample(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_newID.loc[:,cat_cols] = df_newID.loc[:,cat_cols].applymap(lambda x: c if x == 0 else x)
  df_newID.loc[:,cat_cols] = df_newID.loc[:,cat_cols].applymap(lambda x: c if x == 0 else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_newID[cat_cols] = softmax(df_newID[cat_cols].to_numpy())


Unnamed: 0,product_id,cat_1,cat_2,cat_3,cat_4,cat_5,cat_6,cat_7,cat_8,cat_9,cat_10,cat_11,cat_12,cat_13,cat_14,cat_15,cat_16,cat_17,cat_18
182,204,0.137856,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714
122,144,0.046649,0.046649,0.126806,0.046649,0.046649,0.126806,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649
121,143,0.126806,0.046649,0.046649,0.046649,0.046649,0.046649,0.126806,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649


In [122]:
def compute_similarity(df1, df2, index_col1, index_col2, data_cols):
    similarity_matrix = cosine_similarity(df1[data_cols], df2[data_cols])
    similarity_df = pd.DataFrame(similarity_matrix, columns=df2[index_col2], index=df1[index_col1])
    return similarity_df

def compute_distance(df1, df2, index_col1, index_col2, data_cols):
    distance_matrix = euclidean_distances(df1[data_cols], df2[data_cols])
    distance_df = pd.DataFrame(distance_matrix, columns=df2[index_col2], index=df1[index_col1])
    return distance_df

def recommend_products(similarity_df, customer_id, n=5):
    customer_scores = similarity_df.loc[customer_id].sort_values(ascending=False)
    top_n_products = customer_scores.index[:n].tolist()
    return top_n_products

In [123]:
df_product_matrix = pd.read_csv('./csv/3_product_matrix.csv')
df_product_matrix.tail()

Unnamed: 0,order_product_id,booked_days,order_price_paid,sum_kids,sum_adults,private,group,family,cat_1,cat_2,...,cat_9,cat_10,cat_11,cat_12,cat_13,cat_14,cat_15,cat_16,cat_17,cat_18
98,197,0.0,0.105,0.125,0.875,0.394029,0.211942,0.394029,0.050714,0.050714,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.137856,0.050714,0.050714,0.050714,0.050714
99,198,2.666667,0.120048,0.02381,0.97619,0.394029,0.333333,0.272637,0.050714,0.050714,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.137856,0.050714,0.050714,0.050714,0.050714
100,199,2.25,0.178,0.0,1.0,0.485073,0.302985,0.211942,0.050714,0.050714,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714
101,200,0.2,0.2276,0.0,1.0,0.357612,0.430447,0.211942,0.050714,0.050714,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714
102,202,7.527273,0.41912,0.165498,0.834502,0.364233,0.271534,0.364233,0.050714,0.050714,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.137856,0.050714,0.050714,0.050714,0.050714


In [124]:
df_item_item_similarity = compute_similarity(df_newID, df_product_matrix, 'product_id', 'order_product_id', cat_cols)
df_item_item_similarity

order_product_id,29,33,34,35,38,39,42,43,46,47,...,187,188,189,193,195,197,198,199,200,202
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
44,0.960803,0.841124,0.761685,0.960803,0.873508,0.960803,0.960803,1.0,0.960803,0.960803,...,0.960803,0.960803,0.826139,0.873508,0.826139,0.826139,0.826139,0.924204,0.924204,0.826139
57,1.0,0.786214,0.786214,1.0,0.808143,1.0,1.0,0.960803,1.0,1.0,...,1.0,1.0,0.84513,0.808143,0.84513,0.84513,0.84513,0.84513,0.84513,0.84513
64,0.808143,0.786214,0.873508,0.808143,0.904072,0.808143,0.808143,0.786214,0.808143,0.808143,...,0.808143,0.808143,0.952893,0.808143,0.84513,0.84513,0.84513,0.84513,0.84513,0.84513
67,0.808143,0.873508,0.873508,0.808143,0.904072,0.808143,0.808143,0.873508,0.808143,0.808143,...,0.808143,0.808143,0.952893,0.904072,0.84513,0.84513,0.84513,0.952893,0.952893,0.84513
80,0.84513,0.924204,0.826139,0.84513,0.84513,0.84513,0.84513,0.826139,0.84513,0.84513,...,0.84513,0.84513,0.878942,0.952893,0.878942,0.878942,0.878942,0.878942,0.878942,0.878942
88,0.960803,0.841124,0.761685,0.960803,0.873508,0.960803,0.960803,1.0,0.960803,0.960803,...,0.960803,0.960803,0.826139,0.873508,0.826139,0.826139,0.826139,0.924204,0.924204,0.826139
92,0.84513,0.826139,0.924204,0.84513,0.84513,0.84513,0.84513,0.826139,0.84513,0.84513,...,0.84513,0.84513,0.878942,0.84513,0.878942,0.878942,0.878942,0.878942,0.878942,0.878942
98,0.808143,0.873508,0.873508,0.808143,0.904072,0.808143,0.808143,0.873508,0.808143,0.808143,...,0.808143,0.808143,0.952893,0.904072,0.84513,0.84513,0.84513,0.952893,0.952893,0.84513
109,0.960803,0.841124,0.761685,0.960803,0.786214,0.960803,0.960803,0.920562,0.960803,0.960803,...,0.960803,0.960803,0.826139,0.786214,0.826139,0.826139,0.826139,0.826139,0.826139,0.826139
111,0.960803,0.841124,0.761685,0.960803,0.873508,0.960803,0.960803,1.0,0.960803,0.960803,...,0.960803,0.960803,0.826139,0.873508,0.826139,0.826139,0.826139,0.924204,0.924204,0.826139


In [125]:
df_item_item_distance = compute_distance(df_newID, df_product_matrix, 'product_id', 'order_product_id', cat_cols)
df_item_item_distance

order_product_id,29,33,34,35,38,39,42,43,46,47,...,187,188,189,193,195,197,198,199,200,202
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
44,0.07322519,0.148417,0.181772,0.07322519,0.131372,0.07322519,0.07322519,0.0,0.07322519,0.07322519,...,0.07322519,0.07322519,0.1519688,0.131372,0.151969,0.151969,0.151969,0.100803,0.100803,0.151969
57,3.72529e-09,0.170749,0.170749,3.72529e-09,0.160313,3.72529e-09,3.72529e-09,0.073225,3.72529e-09,3.72529e-09,...,3.72529e-09,3.72529e-09,0.1419379,0.160313,0.141938,0.141938,0.141938,0.141938,0.141938,0.141938
64,0.1603132,0.170749,0.131372,0.1603132,0.113359,0.1603132,0.1603132,0.170749,0.1603132,0.1603132,...,0.1603132,0.1603132,0.0785902,0.160313,0.141938,0.141938,0.141938,0.141938,0.141938,0.141938
67,0.1603132,0.131372,0.131372,0.1603132,0.113359,0.1603132,0.1603132,0.131372,0.1603132,0.1603132,...,0.1603132,0.1603132,0.0785902,0.113359,0.141938,0.141938,0.141938,0.07859,0.07859,0.141938
80,0.1419379,0.100803,0.151969,0.1419379,0.141938,0.1419379,0.1419379,0.151969,0.1419379,0.1419379,...,0.1419379,0.1419379,0.1232368,0.07859,0.123237,0.123237,0.123237,0.123237,0.123237,0.123237
88,0.07322519,0.148417,0.181772,0.07322519,0.131372,0.07322519,0.07322519,0.0,0.07322519,0.07322519,...,0.07322519,0.07322519,0.1519688,0.131372,0.151969,0.151969,0.151969,0.100803,0.100803,0.151969
92,0.1419379,0.151969,0.100803,0.1419379,0.141938,0.1419379,0.1419379,0.151969,0.1419379,0.1419379,...,0.1419379,0.1419379,0.1232368,0.141938,0.123237,0.123237,0.123237,0.123237,0.123237,0.123237
98,0.1603132,0.131372,0.131372,0.1603132,0.113359,0.1603132,0.1603132,0.131372,0.1603132,0.1603132,...,0.1603132,0.1603132,0.0785902,0.113359,0.141938,0.141938,0.141938,0.07859,0.07859,0.141938
109,0.07322519,0.148417,0.181772,0.07322519,0.170749,0.07322519,0.07322519,0.104946,0.07322519,0.07322519,...,0.07322519,0.07322519,0.1519688,0.170749,0.151969,0.151969,0.151969,0.151969,0.151969,0.151969
111,0.07322519,0.148417,0.181772,0.07322519,0.131372,0.07322519,0.07322519,0.0,0.07322519,0.07322519,...,0.07322519,0.07322519,0.1519688,0.131372,0.151969,0.151969,0.151969,0.100803,0.100803,0.151969


In [126]:
def recommend_top_n(similarity_df, distance_df, customer_id, top_n=10):
    # need load df_full_matrix and len(cleanHistory_list) < 10
    top_50 = list(map(int,recommend_products(similarity_df, customer_id, n=30)))
    # history_list = df_full_matrix[df_full_matrix['new_id']== customer_id]['order_product_id'].tolist()
    # cleanHistory_list = list(itertools.filterfalse(lambda x: x in history_list, top_50))
    cleanHistory_list = top_50

    customer_scores = distance_df.loc[customer_id].sort_values(ascending=False).index.tolist()
    distance_list = list(map(int,customer_scores))
    recommend_ls = list(itertools.filterfalse(lambda x: x not in cleanHistory_list, distance_list))[:top_n]
    return recommend_ls

In [127]:
rec_ls = recommend_top_n(df_item_item_similarity, df_item_item_distance, 161, 10)
rec_ls

[150, 147, 146, 149, 151, 152, 52, 148, 175, 145]

In [128]:
pgf = df_product_matrix[df_product_matrix['order_product_id'].isin(rec_ls)][['booked_days','order_price_paid','sum_kids','sum_adults','private','group','family']].mean()

In [129]:
df_product_matrix[df_product_matrix['order_product_id'].isin(rec_ls)]

Unnamed: 0,order_product_id,booked_days,order_price_paid,sum_kids,sum_adults,private,group,family,cat_1,cat_2,...,cat_9,cat_10,cat_11,cat_12,cat_13,cat_14,cat_15,cat_16,cat_17,cat_18
13,52,3.0,0.257766,0.054048,0.945952,0.482472,0.253562,0.263967,0.043188,0.043188,...,0.117396,0.043188,0.043188,0.043188,0.043188,0.043188,0.043188,0.043188,0.043188,0.043188
63,145,4.333333,0.199111,0.277778,0.722222,0.333333,0.211942,0.454725,0.050714,0.050714,...,0.137856,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714
64,146,2.138889,0.213796,0.033488,0.966512,0.421005,0.333333,0.245661,0.050714,0.050714,...,0.050714,0.137856,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714
65,147,0.05,0.32566,0.02,0.98,0.485073,0.284777,0.23015,0.050714,0.050714,...,0.050714,0.050714,0.050714,0.050714,0.137856,0.050714,0.050714,0.050714,0.050714,0.050714
66,148,5.688889,0.634067,0.0,1.0,0.511375,0.276684,0.211942,0.046649,0.046649,...,0.126806,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649
67,149,6.363636,0.179397,0.194589,0.805411,0.311262,0.245048,0.443689,0.050714,0.050714,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.137856,0.050714,0.050714,0.050714,0.050714
68,150,2.086957,0.293162,0.078261,0.921739,0.457364,0.251526,0.29111,0.050714,0.050714,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714
69,151,3.483871,0.139461,0.070027,0.929973,0.44983,0.27949,0.27068,0.050714,0.050714,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714
70,152,1.7,0.38249,0.070833,0.929167,0.412238,0.284777,0.302985,0.043188,0.043188,...,0.117396,0.043188,0.043188,0.043188,0.043188,0.043188,0.043188,0.043188,0.043188,0.043188
85,175,1.0,0.168,0.428571,0.571429,0.211942,0.211942,0.576117,0.050714,0.050714,...,0.137856,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714


In [149]:
df = pd.DataFrame(columns=['product_id','booked_days','order_price_paid','sum_kids','sum_adults','private','group','family'])
for i in df_newID['product_id']:
    rec_ls = recommend_top_n(df_item_item_similarity, df_item_item_distance, i, 10)
    pgf = df_product_matrix[df_product_matrix['order_product_id'].isin(rec_ls)][['booked_days','order_price_paid','sum_kids','sum_adults','private','group','family']].mean()
    pgf['product_id'] = i
    df = pd.concat([df,pgf.to_frame().T])
df['product_id'] = df['product_id'].astype(int)


In [150]:
df

Unnamed: 0,product_id,booked_days,order_price_paid,sum_kids,sum_adults,private,group,family
0,44,2.09672,0.253376,0.090126,0.909874,0.433946,0.273236,0.292818
0,57,2.984557,0.279291,0.122759,0.877241,0.407589,0.263308,0.329103
0,64,2.063565,0.20854,0.126709,0.873291,0.398666,0.26852,0.332814
0,67,11.286625,0.377218,0.076503,0.923497,0.462776,0.25086,0.286365
0,80,2.809162,0.194125,0.132778,0.867222,0.399821,0.253458,0.346722
0,88,2.09672,0.253376,0.090126,0.909874,0.433946,0.273236,0.292818
0,92,2.694902,0.247748,0.103366,0.896634,0.421737,0.260274,0.317989
0,98,11.286625,0.377218,0.076503,0.923497,0.462776,0.25086,0.286365
0,109,11.216174,0.512694,0.052713,0.947287,0.489077,0.243637,0.267286
0,111,2.09672,0.253376,0.090126,0.909874,0.433946,0.273236,0.292818


In [153]:
df_newID = df.merge(df_newID, how='left', on='product_id')
df_newID

Unnamed: 0,product_id,booked_days,order_price_paid,sum_kids,sum_adults,private,group,family,cat_1,cat_2,...,cat_9,cat_10,cat_11,cat_12,cat_13,cat_14,cat_15,cat_16,cat_17,cat_18
0,44,2.09672,0.253376,0.090126,0.909874,0.433946,0.273236,0.292818,0.043188,0.043188,...,0.117396,0.043188,0.117396,0.043188,0.043188,0.043188,0.043188,0.043188,0.043188,0.043188
1,57,2.984557,0.279291,0.122759,0.877241,0.407589,0.263308,0.329103,0.046649,0.046649,...,0.126806,0.046649,0.126806,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649
2,64,2.063565,0.20854,0.126709,0.873291,0.398666,0.26852,0.332814,0.046649,0.126806,...,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649
3,67,11.286625,0.377218,0.076503,0.923497,0.462776,0.25086,0.286365,0.046649,0.046649,...,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649
4,80,2.809162,0.194125,0.132778,0.867222,0.399821,0.253458,0.346722,0.050714,0.050714,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714
5,88,2.09672,0.253376,0.090126,0.909874,0.433946,0.273236,0.292818,0.043188,0.043188,...,0.117396,0.043188,0.117396,0.043188,0.043188,0.043188,0.043188,0.043188,0.043188,0.043188
6,92,2.694902,0.247748,0.103366,0.896634,0.421737,0.260274,0.317989,0.050714,0.050714,...,0.050714,0.137856,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714
7,98,11.286625,0.377218,0.076503,0.923497,0.462776,0.25086,0.286365,0.046649,0.046649,...,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649,0.046649
8,109,11.216174,0.512694,0.052713,0.947287,0.489077,0.243637,0.267286,0.117396,0.043188,...,0.117396,0.043188,0.117396,0.043188,0.043188,0.043188,0.043188,0.043188,0.043188,0.043188
9,111,2.09672,0.253376,0.090126,0.909874,0.433946,0.273236,0.292818,0.043188,0.043188,...,0.117396,0.043188,0.117396,0.043188,0.043188,0.043188,0.043188,0.043188,0.043188,0.043188
