In [1]:
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd

In [2]:
customer_path = 'dataset/customer.csv'
sales_path = 'dataset/sales.csv'
goods_path = 'dataset/goods.csv'
category_path = 'dataset/x_tree.csv'
publisher_path = 'dataset/x_tag.csv'

customer_df = pd.read_csv(customer_path)
sales_df = pd.read_csv(sales_path)
goods_df = pd.read_csv(goods_path)
category_df = pd.read_csv(category_path)
publisher_df = pd.read_csv(publisher_path)

In [3]:
goods_df.head()

Unnamed: 0,good_id,brand,author,translator,catgory,tag
0,162,96,115,,153,1|1884|1965|3728|48864
1,163,96,115,,66,1|434|438|441|454|518|552|727|1355|1480|1884|1...
2,165,96,1724,,139|141,1|1884|1965|3728|48864
3,166,96,2043,,118,1|1884|1965|3728|48864
4,167,96,1829,,139|140,1|1884|1965|3728|48864


In [14]:
catgory_col = 'catgory'
good_id_col = 'good_id'
brand_col = 'brand'
factorize_brand_col = 'f_brand'
tags_col = 'tag'

### Category

In [None]:
mlb = MultiLabelBinarizer()

In [28]:
category_template = goods_df[[good_id_col, catgory_col]]
category_split = category_template.catgory \
    .str.split('|') \
    .apply(lambda x: list(map(int, x)) if isinstance(x, list) else []).tolist()

catgory_encoded = pd.DataFrame(mlb.fit_transform(category_split), columns=mlb.classes_, index=goods_df.index)
category_feature = pd.concat([goods_df['good_id'], catgory_encoded], axis=1)
category_feature

Unnamed: 0,good_id,29,31,32,35,37,38,40,41,42,...,170,171,173,178,189,190,191,192,193,197
0,162,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,163,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,165,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,166,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,167,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57088,105771,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57089,105772,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57090,105773,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57091,105774,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Author

In [6]:
auther_feature = goods_df[['good_id','author']]
auther_feature = auther_feature.assign(author=auther_feature['author'].str.split('|').str[0])
auther_feature

Unnamed: 0,good_id,author
0,162,115
1,163,115
2,165,1724
3,166,2043
4,167,1829
...,...,...
57088,105771,103
57089,105772,103
57090,105773,103
57091,105774,103


In [7]:
author_counts = auther_feature['author'].value_counts()
author_counts

2664      735
68        297
1777      296
1520      289
2028      219
         ... 
53149       1
53158       1
53160       1
53174       1
105747      1
Name: author, Length: 18583, dtype: int64

In [8]:
index_of_author_that_have_more_then_one_book = author_counts[author_counts > 1].index

In [9]:
auther_feature.loc[~auther_feature['author'].isin(index_of_author_that_have_more_then_one_book), 'author'] = 0

In [10]:
mask = auther_feature['author'] == 0
auther_feature.loc[~mask, 'author'] = pd.factorize(auther_feature.loc[~mask, 'author'])[0]+1

In [11]:
auther_feature

Unnamed: 0,good_id,author
0,162,1
1,163,1
2,165,2
3,166,3
4,167,4
...,...,...
57088,105771,44
57089,105772,44
57090,105773,44
57091,105774,44


### brands

In [12]:
brands_template = goods_df[[good_id_col, brand_col]]
brands_template[factorize_brand_col] = pd.factorize(brands_template.brand)[0]
brands_feature = brands_template[[good_id_col, factorize_brand_col]]
brands_feature

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  brands_template[factorize_brand_col] = pd.factorize(brands_template.brand)[0]


Unnamed: 0,good_id,f_brand
0,162,0
1,163,0
2,165,0
3,166,0
4,167,0
...,...,...
57088,105771,100
57089,105772,100
57090,105773,100
57091,105774,100


### Tags

In [15]:
tags_template = goods_df[[good_id_col, tags_col]]
tags_template

Unnamed: 0,good_id,tag
0,162,1|1884|1965|3728|48864
1,163,1|434|438|441|454|518|552|727|1355|1480|1884|1...
2,165,1|1884|1965|3728|48864
3,166,1|1884|1965|3728|48864
4,167,1|1884|1965|3728|48864
...,...,...
57088,105771,1|1873|1878|1884|1965|1971|3728|46168|48864|64050
57089,105772,1|1873|1876|1878|1884|1965|1971|1984|3728|4616...
57090,105773,1|1873|1876|1878|1884|1965|1971|1984|3728|4616...
57091,105774,1|1873|1876|1878|1884|1965|1971|1984|3728|4616...


In [30]:
tags_split = tags_template[tags_col].str.split('|') \
    .apply(lambda x: list(map(int, x)) if isinstance(x, list) else []).tolist()


tags_encoded = pd.DataFrame(mlb.fit_transform(tags_split), columns=mlb.classes_, index=goods_df.index)
tags_feature = pd.concat([goods_df['good_id'], tags_encoded], axis=1)
tags_feature

Unnamed: 0,good_id,1,2,3,4,19,21,23,24,28,...,48962,52548,56067,56068,57828,59533,60904,62296,64050,67460
0,162,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,163,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,165,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,166,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,167,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57088,105771,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
57089,105772,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
57090,105773,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57091,105774,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


### Concatinate All Features

In [33]:
good_feature = category_feature.merge(auther_feature, on=good_id_col, how='left')

Unnamed: 0,good_id,29,31,32,35,37,38,40,41,42,...,171,173,178,189,190,191,192,193,197,author
0,162,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,163,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,165,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,166,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
4,167,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57088,105771,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,44
57089,105772,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,44
57090,105773,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,44
57091,105774,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,44


In [34]:
good_feature = good_feature.merge(brands_feature,on=good_id_col, how='left')
good_feature

Unnamed: 0,good_id,29,31,32,35,37,38,40,41,42,...,173,178,189,190,191,192,193,197,author,f_brand
0,162,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,163,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,165,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
3,166,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,0
4,167,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57088,105771,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,44,100
57089,105772,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,44,100
57090,105773,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,44,100
57091,105774,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,44,100


In [35]:
good_feature = good_feature.merge(tags_feature, on=good_id_col)
good_feature

Unnamed: 0,good_id,29,31,32,35_x,37,38,40,41,42_x,...,48962,52548,56067,56068,57828,59533,60904,62296,64050,67460
0,162,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,163,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,165,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,166,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,167,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57088,105771,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
57089,105772,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
57090,105773,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57091,105774,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
