In [1]:
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

customer_path = 'dataset/customer.csv'
sales_path = 'dataset/sales.csv'
goods_path = 'dataset/goods.csv'
category_path = 'dataset/x_tree.csv'
publisher_path = 'dataset/x_tag.csv'

customer_df = pd.read_csv(customer_path)
sales_df = pd.read_csv(sales_path)
goods_df = pd.read_csv(goods_path)
category_df = pd.read_csv(category_path)
publisher_df = pd.read_csv(publisher_path)


catgory_col = 'catgory'
good_id_col = 'good_id'
brand_col = 'brand'
factorize_brand_col = 'f_brand'
tags_col = 'tag'

factorize_gender_col = 'f_gender'
customer_id_col = 'customer_id'


mlb = MultiLabelBinarizer()
category_template = goods_df[[good_id_col, catgory_col]]
category_split = category_template.catgory \
    .str.split('|') \
    .apply(lambda x: list(map(int, x)) if isinstance(x, list) else []).tolist()

catgory_encoded = pd.DataFrame(mlb.fit_transform(category_split), columns=mlb.classes_, index=goods_df.index)
category_feature = pd.concat([goods_df['good_id'], catgory_encoded], axis=1)

auther_feature = goods_df[['good_id','author']]
auther_feature = auther_feature.assign(author=auther_feature['author'].str.split('|').str[0])
author_counts = auther_feature['author'].value_counts()
index_of_author_that_have_more_then_one_book = author_counts[author_counts > 1].index
auther_feature.loc[~auther_feature['author'].isin(index_of_author_that_have_more_then_one_book), 'author'] = 0
mask = auther_feature['author'] == 0
auther_feature.loc[~mask, 'author'] = pd.factorize(auther_feature.loc[~mask, 'author'])[0]+1

brands_template = goods_df[[good_id_col, brand_col]]
brands_template[factorize_brand_col] = pd.factorize(brands_template.brand)[0]
brands_feature = brands_template[[good_id_col, factorize_brand_col]]

tags_template = goods_df[[good_id_col, tags_col]]
tags_split = tags_template[tags_col].str.split('|') \
    .apply(lambda x: list(map(int, x)) if isinstance(x, list) else []).tolist()

tags_encoded = pd.DataFrame(mlb.fit_transform(tags_split), columns=mlb.classes_, index=goods_df.index)
tags_feature = pd.concat([goods_df['good_id'], tags_encoded], axis=1)

good_feature = category_feature.merge(auther_feature, on=good_id_col, how='left')
good_feature = good_feature.merge(brands_feature,on=good_id_col, how='left')
good_feature = good_feature.merge(tags_feature, on=good_id_col)

customer_df[factorize_gender_col] = pd.factorize(customer_df.gender)[0]
customer_feature = customer_df[[customer_id_col, factorize_gender_col]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  brands_template[factorize_brand_col] = pd.factorize(brands_template.brand)[0]


# Sales Operation

### Customer Operation

In [2]:
sales_df.rename(columns={'member_id' : customer_id_col}, inplace=True)

In [3]:
sales_customer = sales_df[customer_id_col].drop_duplicates()

In [4]:
sales_customer_featuer = customer_feature[customer_feature.customer_id.isin(sales_customer)]

In [5]:
factorize_customer_id = 'f_customer_id'

In [6]:
sales_customer_featuer[factorize_customer_id] = pd.factorize(sales_customer_featuer.customer_id)[0]
customerIds_for_merge = sales_customer_featuer[[customer_id_col, factorize_customer_id]]
customerIds_for_merge

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales_customer_featuer[factorize_customer_id] = pd.factorize(sales_customer_featuer.customer_id)[0]


Unnamed: 0,customer_id,f_customer_id
2,3,0
4,5,1
6,7,2
7,8,3
8,9,4
...,...,...
204906,205806,95689
204907,205807,95690
204908,205808,95691
204909,205809,95692


In [7]:
sales_df = sales_df.merge(customerIds_for_merge, on=customer_id_col, how='left')
sales_df

Unnamed: 0,invoice_id,register_date,customer_id,province_id,city_id,good_id,f_customer_id
0,2,2014-05-11 08:31:37,5,4.0,72.0,15347,1.0
1,5,2014-05-12 15:16:06,8,8.0,110.0,2833,3.0
2,5,2014-05-12 15:16:06,8,8.0,110.0,8714,3.0
3,7,2014-05-12 22:23:19,9,27.0,337.0,15672,4.0
4,11,2014-05-16 11:55:43,12,8.0,121.0,15001,5.0
...,...,...,...,...,...,...,...
525083,263306,2023-06-14 17:06:29,205809,4.0,977.0,35191,95692.0
525084,263306,2023-06-14 17:06:29,205809,4.0,977.0,37009,95692.0
525085,263306,2023-06-14 17:06:29,205809,4.0,977.0,77359,95692.0
525086,263307,2023-06-14 17:21:18,183076,10.0,134.0,26552,85153.0


### Good Operation

In [8]:
sales_df

Unnamed: 0,invoice_id,register_date,customer_id,province_id,city_id,good_id,f_customer_id
0,2,2014-05-11 08:31:37,5,4.0,72.0,15347,1.0
1,5,2014-05-12 15:16:06,8,8.0,110.0,2833,3.0
2,5,2014-05-12 15:16:06,8,8.0,110.0,8714,3.0
3,7,2014-05-12 22:23:19,9,27.0,337.0,15672,4.0
4,11,2014-05-16 11:55:43,12,8.0,121.0,15001,5.0
...,...,...,...,...,...,...,...
525083,263306,2023-06-14 17:06:29,205809,4.0,977.0,35191,95692.0
525084,263306,2023-06-14 17:06:29,205809,4.0,977.0,37009,95692.0
525085,263306,2023-06-14 17:06:29,205809,4.0,977.0,77359,95692.0
525086,263307,2023-06-14 17:21:18,183076,10.0,134.0,26552,85153.0


In [9]:
good_feature

Unnamed: 0,good_id,29,31,32,35_x,37,38,40,41,42_x,...,48962,52548,56067,56068,57828,59533,60904,62296,64050,67460
0,162,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,163,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,165,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,166,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,167,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57088,105771,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
57089,105772,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
57090,105773,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57091,105774,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [10]:
good_feature[good_feature.good_id == 15672]

Unnamed: 0,good_id,29,31,32,35_x,37,38,40,41,42_x,...,48962,52548,56067,56068,57828,59533,60904,62296,64050,67460


In [13]:
sales_goods = sales_df[good_id_col].drop_duplicates()
sales_goods

0          15347
1           2833
2           8714
3          15672
4          15001
           ...  
525004    102828
525005    103017
525006    103018
525036     96645
525042    105216
Name: good_id, Length: 45174, dtype: int64

In [14]:
sales_goods[sales_goods == 15672]

3    15672
Name: good_id, dtype: int64

In [15]:
factorize_good_id = 'f_good_id'

In [16]:
sales_good_feature = good_feature[good_feature.good_id.isin(sales_goods)]


In [17]:
sales_good_feature[sales_good_feature.good_id == 15672]

Unnamed: 0,good_id,29,31,32,35_x,37,38,40,41,42_x,...,48962,52548,56067,56068,57828,59533,60904,62296,64050,67460


In [18]:
sales_good_feature[factorize_good_id] = pd.factorize(sales_good_feature.good_id)[0]
goodIds_for_merge = sales_good_feature[[good_id_col, factorize_good_id]]
goodIds_for_merge

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales_good_feature[factorize_good_id] = pd.factorize(sales_good_feature.good_id)[0]


Unnamed: 0,good_id,f_good_id
0,162,0
1,163,1
2,165,2
3,166,3
4,167,4
...,...,...
57025,105674,43475
57028,105677,43476
57030,105679,43477
57034,105688,43478


In [19]:
goodIds_for_merge[goodIds_for_merge.good_id == 15672]

Unnamed: 0,good_id,f_good_id


In [20]:
sales_df = sales_df.merge(goodIds_for_merge , on=good_id_col, how='left')
sales_df

Unnamed: 0,invoice_id,register_date,customer_id,province_id,city_id,good_id,f_customer_id,f_good_id
0,2,2014-05-11 08:31:37,5,4.0,72.0,15347,1.0,5255.0
1,5,2014-05-12 15:16:06,8,8.0,110.0,2833,3.0,1453.0
2,5,2014-05-12 15:16:06,8,8.0,110.0,8714,3.0,3200.0
3,7,2014-05-12 22:23:19,9,27.0,337.0,15672,4.0,
4,11,2014-05-16 11:55:43,12,8.0,121.0,15001,5.0,5136.0
...,...,...,...,...,...,...,...,...
525083,263306,2023-06-14 17:06:29,205809,4.0,977.0,35191,95692.0,13615.0
525084,263306,2023-06-14 17:06:29,205809,4.0,977.0,37009,95692.0,14570.0
525085,263306,2023-06-14 17:06:29,205809,4.0,977.0,77359,95692.0,34477.0
525086,263307,2023-06-14 17:21:18,183076,10.0,134.0,26552,85153.0,9833.0


In [21]:
sales_df.fillna(value={factorize_good_id : 0}, inplace=True)

In [22]:
sales_df

Unnamed: 0,invoice_id,register_date,customer_id,province_id,city_id,good_id,f_customer_id,f_good_id
0,2,2014-05-11 08:31:37,5,4.0,72.0,15347,1.0,5255.0
1,5,2014-05-12 15:16:06,8,8.0,110.0,2833,3.0,1453.0
2,5,2014-05-12 15:16:06,8,8.0,110.0,8714,3.0,3200.0
3,7,2014-05-12 22:23:19,9,27.0,337.0,15672,4.0,0.0
4,11,2014-05-16 11:55:43,12,8.0,121.0,15001,5.0,5136.0
...,...,...,...,...,...,...,...,...
525083,263306,2023-06-14 17:06:29,205809,4.0,977.0,35191,95692.0,13615.0
525084,263306,2023-06-14 17:06:29,205809,4.0,977.0,37009,95692.0,14570.0
525085,263306,2023-06-14 17:06:29,205809,4.0,977.0,77359,95692.0,34477.0
525086,263307,2023-06-14 17:21:18,183076,10.0,134.0,26552,85153.0,9833.0


In [25]:
sales_df[sales_df.f_good_id == 8]

Unnamed: 0,invoice_id,register_date,customer_id,province_id,city_id,good_id,f_customer_id,f_good_id
81785,45507,2018-02-19 09:28:59,40915,27.0,331.0,171,19989.0,8.0
82913,46156,2018-02-26 23:42:32,43343,5.0,97.0,171,21207.0,8.0
84417,46841,2018-03-09 13:09:22,43343,8.0,110.0,171,21207.0,8.0
86727,48022,2018-04-06 19:36:59,41921,8.0,110.0,171,20516.0,8.0
152567,81973,2019-03-31 10:00:17,73299,4.0,55.0,171,35872.0,8.0
219455,113328,2020-02-04 15:32:48,98041,8.0,110.0,171,46589.0,8.0
225508,116552,2020-03-01 11:44:54,100665,8.0,110.0,171,47639.0,8.0
309313,156836,2020-11-22 19:02:01,135183,30.0,369.0,171,63101.0,8.0
315472,159763,2020-12-08 14:55:35,126504,8.0,110.0,171,58782.0,8.0
319932,161658,2020-12-20 12:37:56,65711,8.0,119.0,171,32504.0,8.0
