In [1]:
import pandas as pd
import sweetviz as sv
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import itertools
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

In [17]:
product_cat_df = pd.read_csv('./raw_csv/dtt_product_category.csv')

# Perform one-hot encoding
one_hot_encoded = pd.get_dummies(product_cat_df['category_id'], prefix='cat')

# Group by 'product_id' and sum the one-hot encoded columns
grouped_df = one_hot_encoded.groupby(product_cat_df['product_id']).sum()

# Reset the index to make 'product_id' a column again
grouped_df = grouped_df.reset_index()

# Drop duplicate rows based on 'product_id'
product_cat_df = product_cat_df.drop_duplicates(subset='product_id')

# Merge the grouped DataFrame back to the original DataFrame
product_cat_df = pd.merge(product_cat_df.drop(columns=['category_id']), grouped_df, on='product_id')

In [2]:
df_product_matrix = pd.read_csv('./csv/3_product_matrix.csv')
df_product_matrix.rename(columns={'order_product_id': 'product_id'}, inplace=True)

In [12]:
df_product = pd.read_csv('./raw_csv/dtt_product.csv')
test_product_id = [
    22, 23, 24, 25, 26, 27, 28, 
    30, 31, 32, 36, 37, 
    40, 41, 45, 48, 53, 59,
    61, 62, 63, 66,
    71, 72, 74, 77,
    85, 86, 89, 93, 94, 95, 96, 99,
    100, 101, 104, 106, 107, 108,
    110, 112, 114, 116,
    126, 168, 201]
df_product = df_product[~df_product['product_id'].isin(test_product_id)]

product_ids_not_present = ~df_product['product_id'].isin(df_product_matrix['product_id'])
product_ids_list = df_product.loc[product_ids_not_present, 'product_id'].tolist()

print("Product IDs not present in df_product_matrix:" , len(product_ids_list))
product_ids_list.sort()
print(product_ids_list)

Product IDs not present in df_product_matrix: 36
[44, 57, 64, 67, 80, 88, 92, 98, 109, 111, 113, 136, 140, 143, 144, 153, 154, 155, 161, 165, 169, 174, 176, 179, 183, 186, 190, 191, 192, 194, 196, 203, 204, 205, 206, 207]


In [19]:
df_newID = product_cat_df[product_cat_df['product_id'].isin(product_ids_list)]

In [21]:
def softmax(x, axis = 1):
    return np.exp(x)/np.sum(np.exp(x),axis=axis, keepdims=True)

In [24]:
cat_cols = ['cat_1', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9',
    'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16','cat_17', 'cat_18']

# constant values for replace 0 before softmax
c = 0.0
df_newID.loc[:,cat_cols] = df_newID.loc[:,cat_cols].applymap(lambda x: c if x == 0 else x)
df_newID[cat_cols] = softmax(df_newID[cat_cols].to_numpy())
df_newID.sample(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_newID.loc[:,cat_cols] = df_newID.loc[:,cat_cols].applymap(lambda x: c if x == 0 else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_newID[cat_cols] = softmax(df_newID[cat_cols].to_numpy())


Unnamed: 0,product_id,cat_1,cat_2,cat_3,cat_4,cat_5,cat_6,cat_7,cat_8,cat_9,cat_10,cat_11,cat_12,cat_13,cat_14,cat_15,cat_16,cat_17,cat_18
89,111,0.043642,0.043642,0.043642,0.043642,0.043642,0.043642,0.043642,0.115125,0.115125,0.043642,0.115125,0.043642,0.043642,0.043642,0.043642,0.043642,0.043642,0.043642
161,183,0.04073,0.04073,0.107444,0.04073,0.107444,0.04073,0.04073,0.107444,0.04073,0.04073,0.04073,0.04073,0.04073,0.04073,0.04073,0.04073,0.04073,0.107444
42,64,0.047002,0.123988,0.123988,0.047002,0.047002,0.047002,0.047002,0.047002,0.047002,0.047002,0.047002,0.047002,0.047002,0.047002,0.047002,0.047002,0.047002,0.047002


In [25]:
def compute_similarity(df1, df2, index_col1, index_col2, data_cols):
    similarity_matrix = cosine_similarity(df1[data_cols], df2[data_cols])
    similarity_df = pd.DataFrame(similarity_matrix, columns=df2[index_col2], index=df1[index_col1])
    return similarity_df

def compute_distance(df1, df2, index_col1, index_col2, data_cols):
    distance_matrix = euclidean_distances(df1[data_cols], df2[data_cols])
    distance_df = pd.DataFrame(distance_matrix, columns=df2[index_col2], index=df1[index_col1])
    return distance_df

def recommend_products(similarity_df, customer_id, n=5):
    customer_scores = similarity_df.loc[customer_id].sort_values(ascending=False)
    top_n_products = customer_scores.index[:n].tolist()
    return top_n_products

In [26]:
df_product_matrix = pd.read_csv('./csv/3_product_matrix.csv')
df_product_matrix.tail()

Unnamed: 0,order_product_id,booked_days,order_price_paid,sum_kids,sum_adults,private,group,family,cat_1,cat_2,...,cat_9,cat_10,cat_11,cat_12,cat_13,cat_14,cat_15,cat_16,cat_17,cat_18
98,197,0.0,0.29,0.125,0.875,0.394029,0.211942,0.394029,0.050714,0.050714,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.137856,0.050714,0.050714,0.050714,0.050714
99,198,2.666667,0.389667,0.02381,0.97619,0.394029,0.333333,0.272637,0.050714,0.050714,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.137856,0.050714,0.050714,0.050714,0.050714
100,199,2.25,0.801,0.0,1.0,0.485073,0.302985,0.211942,0.050714,0.050714,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714
101,200,0.2,0.6828,0.0,1.0,0.357612,0.430447,0.211942,0.050714,0.050714,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714
102,202,7.527273,1.34972,0.165498,0.834502,0.364233,0.271534,0.364233,0.050714,0.050714,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.137856,0.050714,0.050714,0.050714,0.050714


In [30]:
df_item_item_similarity = compute_similarity(df_newID, df_product_matrix, 'product_id', 'order_product_id', cat_cols)
df_item_item_similarity

order_product_id,29,33,34,35,38,39,42,43,46,47,...,187,188,189,193,195,197,198,199,200,202
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
44,0.962375,0.845742,0.768668,0.962375,0.877678,0.962375,0.962375,0.999891,0.962375,0.962375,...,0.962375,0.962375,0.832723,0.877678,0.832723,0.832723,0.832723,0.92787,0.92787,0.832723
57,0.999888,0.792623,0.792623,0.999888,0.81439,0.999888,0.999888,0.961427,0.999888,0.999888,...,0.999888,0.999888,0.851251,0.81439,0.851251,0.851251,0.851251,0.851251,0.851251,0.851251
64,0.81439,0.792623,0.877025,0.81439,0.907139,0.81439,0.81439,0.792623,0.81439,0.81439,...,0.81439,0.81439,0.955443,0.81439,0.851251,0.851251,0.851251,0.851251,0.851251,0.851251
67,0.81439,0.877025,0.877025,0.81439,0.907139,0.81439,0.81439,0.877025,0.81439,0.81439,...,0.81439,0.81439,0.955443,0.907139,0.851251,0.851251,0.851251,0.955443,0.955443,0.851251
80,0.849727,0.925148,0.830833,0.849727,0.849727,0.849727,0.849727,0.830833,0.849727,0.849727,...,0.849727,0.849727,0.883476,0.95337,0.883476,0.883476,0.883476,0.883476,0.883476,0.883476
88,0.962375,0.845742,0.768668,0.962375,0.877678,0.962375,0.962375,0.999891,0.962375,0.962375,...,0.962375,0.962375,0.832723,0.877678,0.832723,0.832723,0.832723,0.92787,0.92787,0.832723
92,0.849727,0.830833,0.925148,0.849727,0.849727,0.849727,0.849727,0.830833,0.849727,0.849727,...,0.849727,0.849727,0.883476,0.849727,0.883476,0.883476,0.883476,0.883476,0.883476,0.883476
98,0.81439,0.877025,0.877025,0.81439,0.907139,0.81439,0.81439,0.877025,0.81439,0.81439,...,0.81439,0.81439,0.955443,0.907139,0.851251,0.851251,0.851251,0.955443,0.955443,0.851251
109,0.962375,0.845742,0.768668,0.962375,0.792981,0.962375,0.962375,0.922817,0.962375,0.962375,...,0.962375,0.962375,0.832723,0.792981,0.832723,0.832723,0.832723,0.832723,0.832723,0.832723
111,0.962375,0.845742,0.768668,0.962375,0.877678,0.962375,0.962375,0.999891,0.962375,0.962375,...,0.962375,0.962375,0.832723,0.877678,0.832723,0.832723,0.832723,0.92787,0.92787,0.832723


In [31]:
df_item_item_distance = compute_distance(df_newID, df_product_matrix, 'product_id', 'order_product_id', cat_cols)
df_item_item_distance

order_product_id,29,33,34,35,38,39,42,43,46,47,...,187,188,189,193,195,197,198,199,200,202
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
44,0.071396,0.145729,0.178455,0.071396,0.128674,0.071396,0.071396,0.004309,0.071396,0.071396,...,0.071396,0.071396,0.1484,0.128674,0.1484,0.1484,0.1484,0.097798,0.097798,0.1484
57,0.004228,0.167668,0.167668,0.004228,0.157167,0.004228,0.004228,0.072529,0.004228,0.004228,...,0.004228,0.004228,0.138561,0.157167,0.138561,0.138561,0.138561,0.138561,0.138561,0.138561
64,0.157167,0.167668,0.129176,0.157167,0.111174,0.157167,0.157167,0.167668,0.157167,0.157167,...,0.157167,0.157167,0.076038,0.157167,0.138561,0.138561,0.138561,0.138561,0.138561,0.138561
67,0.157167,0.129176,0.129176,0.157167,0.111174,0.157167,0.157167,0.129176,0.157167,0.157167,...,0.157167,0.157167,0.076038,0.111174,0.138561,0.138561,0.138561,0.076038,0.076038,0.138561
80,0.139565,0.100108,0.149668,0.139565,0.139565,0.139565,0.139565,0.149668,0.139565,0.139565,...,0.139565,0.139565,0.120622,0.078148,0.120622,0.120622,0.120622,0.120622,0.120622,0.120622
88,0.071396,0.145729,0.178455,0.071396,0.128674,0.071396,0.071396,0.004309,0.071396,0.071396,...,0.071396,0.071396,0.1484,0.128674,0.1484,0.1484,0.1484,0.097798,0.097798,0.1484
92,0.139565,0.149668,0.100108,0.139565,0.139565,0.139565,0.139565,0.149668,0.139565,0.139565,...,0.139565,0.139565,0.120622,0.139565,0.120622,0.120622,0.120622,0.120622,0.120622,0.120622
98,0.157167,0.129176,0.129176,0.157167,0.111174,0.157167,0.157167,0.129176,0.157167,0.157167,...,0.157167,0.157167,0.076038,0.111174,0.138561,0.138561,0.138561,0.076038,0.076038,0.138561
109,0.071396,0.145729,0.178455,0.071396,0.167382,0.071396,0.071396,0.103091,0.071396,0.071396,...,0.071396,0.071396,0.1484,0.167382,0.1484,0.1484,0.1484,0.1484,0.1484,0.1484
111,0.071396,0.145729,0.178455,0.071396,0.128674,0.071396,0.071396,0.004309,0.071396,0.071396,...,0.071396,0.071396,0.1484,0.128674,0.1484,0.1484,0.1484,0.097798,0.097798,0.1484


In [37]:
def recommend_top_n(similarity_df, distance_df, customer_id, top_n=10):
    # need load df_full_matrix and len(cleanHistory_list) < 10
    top_50 = list(map(int,recommend_products(similarity_df, customer_id, n=50)))
    # history_list = df_full_matrix[df_full_matrix['new_id']== customer_id]['order_product_id'].tolist()
    # cleanHistory_list = list(itertools.filterfalse(lambda x: x in history_list, top_50))
    cleanHistory_list = top_50

    customer_scores = distance_df.loc[customer_id].sort_values(ascending=False).index.tolist()
    distance_list = list(map(int,customer_scores))
    recommend_ls = list(itertools.filterfalse(lambda x: x not in cleanHistory_list, distance_list))[:top_n]
    return recommend_ls

In [41]:
rec_ls = recommend_top_n(df_item_item_similarity, df_item_item_distance, 207, 5)
rec_ls

[134, 138, 137, 147, 149]

In [42]:
df_product_matrix[df_product_matrix['order_product_id'].isin(rec_ls)]

Unnamed: 0,order_product_id,booked_days,order_price_paid,sum_kids,sum_adults,private,group,family,cat_1,cat_2,...,cat_9,cat_10,cat_11,cat_12,cat_13,cat_14,cat_15,cat_16,cat_17,cat_18
56,134,0.0,0.13,0.111111,0.888889,0.454725,0.211942,0.333333,0.050714,0.050714,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.137856
58,137,11.666667,0.23,0.222222,0.777778,0.333333,0.211942,0.454725,0.050714,0.050714,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.137856
59,138,4.047619,0.820476,0.223356,0.776644,0.29865,0.263967,0.437383,0.050714,0.050714,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.050714,0.137856,0.050714
65,147,0.05,0.77205,0.02,0.98,0.485073,0.284777,0.23015,0.050714,0.050714,...,0.050714,0.050714,0.050714,0.050714,0.137856,0.050714,0.050714,0.050714,0.050714,0.050714
67,149,6.363636,0.6786,0.194589,0.805411,0.311262,0.245048,0.443689,0.050714,0.050714,...,0.050714,0.050714,0.050714,0.050714,0.050714,0.137856,0.050714,0.050714,0.050714,0.050714
