In [None]:
import shutil
from pathlib import Path
from typing import Any
import pandas as pd
from logzero import logger
from tqdm.auto import tqdm

# dtype specification handed to pd.read_csv when loading articles.csv.
# Identifier and name columns are read as 'object'; the numeric codes as 'int64'.
ARTICLES_ORIGINAL = dict(
    article_id='object',
    product_code='int64',
    prod_name='object',
    product_type_no='int64',
    product_type_name='object',
    product_group_name='object',
    graphical_appearance_no='int64',
    graphical_appearance_name='object',
    colour_group_code='int64',
    colour_group_name='object',
    perceived_colour_value_id='int64',
    perceived_colour_value_name='object',
    perceived_colour_master_id='int64',
    perceived_colour_master_name='object',
    department_no='int64',
    department_name='object',
    index_code='object',
    index_name='object',
    index_group_no='int64',
    index_group_name='object',
    section_no='int64',
    section_name='object',
    garment_group_no='int64',
    garment_group_name='object',
    detail_desc='object',
)

# dtype specification handed to pd.read_csv when loading customers.csv.
# FN, Active and age are float64 here; FN and Active are converted to int64
# after their missing values have been filled.
CUSTOMERS_ORIGINAL = dict(
    customer_id='object',
    FN='float64',
    Active='float64',
    club_member_status='object',
    fashion_news_frequency='object',
    age='float64',
    postal_code='object',
)

# dtype specification handed to pd.read_csv when loading transactions_train.csv.
# The t_dat column is not listed because it is parsed as a date instead.
TRANSACTIONS_ORIGINAL = dict(
    customer_id='object',
    article_id='object',
    price='float64',
    sales_channel_id='int64',
)

# Directory the CSV inputs are read from and the generated pickles are written to.
data_dir = '/home/workspace/h-and-m-personalized-fashion-recommendations'

def _count_encoding_dict(df, col_name):
    v = df.groupby(col_name).size().reset_index(name='size').sort_values(by='size', ascending=False)[col_name].tolist()
    return {x: i for i, x in enumerate(v)}

def _dict_to_dataframe(mp):
    return pd.DataFrame(mp.items(), columns=['val', 'idx'])

def _add_idx_column(df, col_name_from, col_name_to, mp):
    df[col_name_to] = df[col_name_from].apply(lambda x: mp[x]).astype('int64')

In [None]:
logger.info("start reading dataframes")
articles = pd.read_csv(f'{data_dir}/articles.csv', dtype=ARTICLES_ORIGINAL)
customers = pd.read_csv(f'{data_dir}/customers.csv', dtype=CUSTOMERS_ORIGINAL)
# t_dat is parsed as a datetime column; the remaining columns use the explicit dtypes.
transactions = pd.read_csv(
    f'{data_dir}/transactions_train.csv', dtype=TRANSACTIONS_ORIGINAL, parse_dates=['t_dat']
)

logger.info("start processing customer_id")
# Every customer_id is mapped to its row position in the customers frame,
# and the mapping is saved as a (val, idx) table.
customer_ids = customers['customer_id'].values
mp_customer_id = dict(zip(customer_ids, range(len(customer_ids))))
_dict_to_dataframe(mp_customer_id).to_pickle(f'{data_dir}/mp_customer_id.pkl')

logger.info("start processing article_id")
# Same scheme for articles: article_id -> row position in the articles frame.
article_ids = articles['article_id'].values
mp_article_id = dict(zip(article_ids, range(len(article_ids))))
_dict_to_dataframe(mp_article_id).to_pickle(f'{data_dir}/mp_article_id.pkl')

In [None]:
logger.info("start processing customers")
_add_idx_column(customers, 'customer_id', 'user', mp_customer_id)

# Flag columns: (None, 1) -> (0, 1), stored as int64.
for col_name in ['FN', 'Active']:
    customers[col_name] = customers[col_name].fillna(0).astype('int64')

# Categorical columns: missing values become the literal category 'NULL',
# then each category is replaced by its frequency rank in a new `<name>_idx` column.
for col_name in ['club_member_status', 'fashion_news_frequency']:
    customers[col_name] = customers[col_name].fillna('NULL')
    mp = _count_encoding_dict(customers, col_name)
    _add_idx_column(customers, col_name, f'{col_name}_idx', mp)

customers.to_pickle(f'{data_dir}/users.pkl')



logger.info("start processing articles")
_add_idx_column(articles, 'article_id', 'item', mp_article_id)

# Article attributes that each receive a frequency-rank `<name>_idx` column.
count_encoding_columns = [
    'product_type_no', 'product_group_name', 'graphical_appearance_no',
    'colour_group_code', 'perceived_colour_value_id', 'perceived_colour_master_id',
    'department_no', 'index_code', 'index_group_no',
    'section_no', 'garment_group_no',
]
for col_name in count_encoding_columns:
    # The mapping is rebuilt per column from that column's own value counts.
    mp = _count_encoding_dict(articles, col_name)
    _add_idx_column(articles, col_name, f'{col_name}_idx', mp)

articles.to_pickle(f'{data_dir}/items.pkl')


logger.info("start processing transactions")
# Attach the integer user/item codes built from the customers and articles tables.
_add_idx_column(transactions, 'customer_id', 'user', mp_customer_id)
_add_idx_column(transactions, 'article_id',  'item', mp_article_id)

# Shift the channel id down by one.
# NOTE: the column is overwritten in place, so re-running this cell without
# reloading `transactions` subtracts 1 a second time.
transactions['sales_channel_id'] = transactions['sales_channel_id'] - 1

# Whole days between each transaction and the most recent t_dat in the table.
# Computed once and reused for both derived columns instead of repeating the
# max/subtract/.dt.days pass over the full frame.
_days_before_latest = (transactions['t_dat'].max() - transactions['t_dat']).dt.days
transactions['week'] = _days_before_latest // 7
transactions['day'] = _days_before_latest
del _days_before_latest  # do not keep a full-length temporary alive in the kernel

transactions.to_pickle(f'{data_dir}/transactions_train.pkl')