# H&M Personalized Fashion Recommendations: 1. Data analysis

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from scipy.sparse import csr_matrix

from tqdm import tqdm
from datetime import datetime, timedelta

# Configure the root logger at INFO level and emit one record to confirm that
# log output is visible in the notebook.
import logging
logging.basicConfig(level=logging.INFO)
logging.info("test")

# Add the parent directory to the module search path so the `src` package can
# be imported from this notebook.
import sys
sys.path.append("..")
# NOTE(review): the wildcard import hides which helpers are used; `subtract_days`
# (called further down) presumably comes from here — confirm and import by name.
from src.utils import *

INFO:root:test


In [2]:
# Load the three input tables. `article_id` is read as a string in the tables
# that carry it so that leading zeros in the ids are preserved.
articles = pd.read_csv(
    "../input/articles_proc.csv",
    dtype={"article_id": str},
)
customers = pd.read_csv(
    "../input/customers_proc.csv",
)
transactions = pd.read_csv(
    "../input/transactions_full.csv",
    dtype={"article_id": str},
)

# Популярные по группам (most popular articles per customer group)

In [38]:
# Keep the last 4 weeks (7 * 4 days) of transactions. get_tr_list() further
# down slices `train` into 1-, 2-, 3- and 4-week windows, so a shorter cut-off
# here would make the wider windows identical to each other.
min_date = subtract_days(transactions["t_dat"].max(), 7 * 4)
# Customer attributes whose combination defines a recommendation group.
cust_features = ["age_group", "price_group"]

# Recent transactions joined with the grouping attributes of the buyer.
# The inner join drops transactions whose customer_id is absent from `customers`.
train = (
    transactions[transactions["t_dat"] >= min_date]
        [["t_dat", "customer_id", "article_id"]]
        .merge(customers[["customer_id", *cust_features]],
               on="customer_id", how="inner")
)
train

Unnamed: 0,t_dat,customer_id,article_id,age_group,fashion_news_frequency,price_group
0,2020-08-25,00015c1a121e08bbd2552c15fbbb6e6b19d3bf8f7b6a3d...,0868405004,22-29,,high
1,2020-08-25,00015c1a121e08bbd2552c15fbbb6e6b19d3bf8f7b6a3d...,0918292001,22-29,,high
2,2020-08-25,00026b2e6fa592951b3f1f5b24fea406a356d23e84fc80...,0903062001,54+,,medium
3,2020-08-25,0010e8eb18f131e724d6997909af0808adbba057529edb...,0828067001,22-29,Regularly,high
4,2020-08-25,0010e8eb18f131e724d6997909af0808adbba057529edb...,0883033001,22-29,Regularly,high
...,...,...,...,...,...,...
1085225,2020-09-22,ffb72741f3bc3d98855703b55d34e05bc7893a5d6a99a3...,0914404001,22-29,,medium
1085226,2020-09-22,ffb72741f3bc3d98855703b55d34e05bc7893a5d6a99a3...,0751471041,22-29,,medium
1085227,2020-09-22,ffb72741f3bc3d98855703b55d34e05bc7893a5d6a99a3...,0751471038,22-29,,medium
1085228,2020-09-22,ffc92c3f7b0b302f393c2968b290f6e5c5b5510d1cf1df...,0895730002,45-54,,high


In [39]:
# Number of training transactions per price group.
train["price_group"].value_counts()

high      689148
medium    396082
Name: price_group, dtype: int64

In [40]:
# Enumerate every distinct combination of the grouping attributes observed in
# `train`: i_group_map maps a running integer id to the attribute tuple ...
i_group_map = {
    i: tuple(combo)
    for i, combo in enumerate(train[cust_features].drop_duplicates().to_numpy())
}
# ... and group_i_map is its inverse (attribute tuple -> integer id).
group_i_map = {combo: i for i, combo in i_group_map.items()}
group_i_map

{('22-29', 'None', 'high'): 0,
 ('54+', 'None', 'medium'): 1,
 ('22-29', 'Regularly', 'high'): 2,
 ('16-21', 'None', 'medium'): 3,
 ('54+', 'Regularly', 'medium'): 4,
 ('30-44', 'Regularly', 'high'): 5,
 ('16-21', 'Regularly', 'medium'): 6,
 ('45-54', 'None', 'high'): 7,
 ('22-29', 'None', 'medium'): 8,
 ('22-29', 'Regularly', 'medium'): 9,
 ('54+', 'Regularly', 'high'): 10,
 ('30-44', 'None', 'medium'): 11,
 ('45-54', 'None', 'medium'): 12,
 ('16-21', 'Regularly', 'high'): 13,
 ('45-54', 'Regularly', 'high'): 14,
 ('30-44', 'Regularly', 'medium'): 15,
 ('30-44', 'None', 'high'): 16,
 ('54+', 'None', 'high'): 17,
 ('45-54', 'Regularly', 'medium'): 18,
 ('16-21', 'None', 'high'): 19}

In [41]:
def get_group(line):
    """Return the integer group id of one row.

    The row's values for the columns listed in ``cust_features`` are combined
    into a tuple, which is then looked up in ``group_i_map``.

    Args:
        line: a row (e.g. as passed by ``DataFrame.apply(..., axis=1)``) that
            can be indexed by every name in ``cust_features``.

    Returns:
        The id stored in ``group_i_map`` for the row's attribute combination.

    Raises:
        KeyError: if the combination is not a key of ``group_i_map``, i.e. it
            did not occur in the frame the map was built from.
    """
    group = tuple(line[x] for x in cust_features)
    return group_i_map[group]

# Label every training transaction and every customer with its group id.
train["group"] = train.apply(get_group, axis=1)
customers["group"] = customers.apply(get_group, axis=1)

In [42]:
# Number of training transactions per group id.
train["group"].value_counts()

0     148839
2     106693
16    105636
8      80608
5      79936
7      66912
14     64005
11     52575
9      51495
10     39866
3      39365
12     36773
6      34670
15     33895
17     33407
18     28442
13     24106
19     19748
1      19685
4      18574
Name: group, dtype: int64

In [43]:
def get_tr_list(transactions, weeks: int = 4):
    """Slice ``transactions`` into trailing windows of growing length.

    Window ``k`` (0-based) keeps every row whose ``t_dat`` is greater than or
    equal to ``subtract_days(latest, 7 * (k + 1))``, where ``latest`` is the
    largest ``t_dat`` in the input.

    Args:
        transactions: DataFrame with a ``t_dat`` column.
        weeks: number of windows to build.

    Returns:
        A list of ``weeks`` DataFrames, ordered from the smallest day offset
        to the largest.
    """
    latest = transactions["t_dat"].max()
    # The comparison is inclusive, so rows dated exactly on the cut-off are kept.
    cutoffs = (subtract_days(latest, 7 * week) for week in range(1, weeks + 1))
    return [transactions[transactions["t_dat"] >= cutoff] for cutoff in cutoffs]

# Trailing windows of the training transactions, most recent week first.
train_list = get_tr_list(train, weeks=4)

In [44]:
# The 12 most frequently bought articles in the first (narrowest) window,
# most frequent first.
popular_w1 = train_list[0]["article_id"].value_counts().index[:12].tolist()
popular_w1

['0924243001',
 '0924243002',
 '0923758001',
 '0918522001',
 '0909370001',
 '0866731001',
 '0751471001',
 '0915529003',
 '0915529005',
 '0448509014',
 '0762846027',
 '0714790020']

In [45]:
# For every group id: the 12 most frequently bought articles among that
# group's transactions in the first (narrowest) window, most frequent first.
popular_w1_groups = {
    group_i: (
        train_list[0]
        .loc[train_list[0]["group"] == group_i, "article_id"]
        .value_counts()
        .index[:12]
        .tolist()
    )
    for group_i in group_i_map.values()
}
popular_w1_groups

{0: ['0924243001',
  '0923758001',
  '0924243002',
  '0915529005',
  '0909370001',
  '0866731001',
  '0929275001',
  '0448509014',
  '0918522001',
  '0889550002',
  '0919273002',
  '0915529003'],
 1: ['0930380001',
  '0924243001',
  '0751471043',
  '0751471001',
  '0924243002',
  '0796210001',
  '0865799006',
  '0751471038',
  '0923758001',
  '0910601003',
  '0896169002',
  '0863595006'],
 2: ['0924243001',
  '0909370001',
  '0924243002',
  '0923758001',
  '0889550002',
  '0934835001',
  '0915529005',
  '0918522001',
  '0714790020',
  '0896169005',
  '0866731001',
  '0863583001'],
 3: ['0715624001',
  '0918522001',
  '0685814001',
  '0448509014',
  '0850917001',
  '0866731001',
  '0903420001',
  '0918292001',
  '0788575002',
  '0874110016',
  '0751471001',
  '0911699002'],
 4: ['0930380001',
  '0751471043',
  '0751471001',
  '0924243001',
  '0918522001',
  '0579541001',
  '0863595004',
  '0910601002',
  '0678942054',
  '0678942001',
  '0714790020',
  '0673677002'],
 5: ['0924243001',
 

In [46]:
def get_item_dict(transactions):
    """Count purchases per customer and article.

    Args:
        transactions: mapping-like object (e.g. a DataFrame) whose
            ``'customer_id'`` and ``'article_id'`` entries are equally long
            iterables; position ``k`` of both describes one purchase.

    Returns:
        dict mapping customer id -> dict mapping article id -> number of times
        that customer bought that article. Both levels keep first-seen order.
    """
    counts_by_customer = {}

    for customer, article in zip(transactions['customer_id'], transactions['article_id']):
        # Fetch (or create on first sight) this customer's per-article counter.
        basket = counts_by_customer.setdefault(customer, {})
        basket[article] = basket.get(article, 0) + 1

    return counts_by_customer

# One purchase-count dictionary per trailing window, in the order of train_list.
purchase_list = [get_item_dict(tr_i) for tr_i in train_list]

In [47]:
def get_prediction(customer):
    """Build the space-separated list of up to 12 recommended article ids.

    Candidates are collected in priority order, skipping any article that is
    already in the list:

    1. the customer's own purchases, window by window in the order of
       ``purchase_list``, most frequently bought first within a window;
    2. the popular articles of the customer's group (``popular_w1_groups``);
    3. the overall popular articles (``popular_w1``).

    Args:
        customer: row providing ``"customer_id"`` and ``"group"``.

    Returns:
        str with at most 12 article ids joined by single spaces.
    """
    customer_id = customer["customer_id"]
    group = customer["group"]

    prediction = []

    # Own purchase history first.
    for window_purchases in purchase_list:
        if customer_id not in window_purchases:
            continue
        # Stable sort: articles bought equally often keep their first-seen order.
        by_count = sorted(
            window_purchases[customer_id].items(),
            key=lambda pair: pair[1],
            reverse=True,
        )
        prediction += [article for article, _ in by_count if article not in prediction]

    # Then fall back to group-level and finally global best-sellers.
    for fallback in (popular_w1_groups[group], popular_w1):
        prediction += [article for article in fallback if article not in prediction]

    return " ".join(prediction[:12])

# Compute the recommendation string for every customer.
customers["prediction"] = customers.apply(get_prediction, axis=1)

In [48]:
# Persist each customer id together with its prediction string.
# NOTE(review): the file name mentions "news", but cust_features as defined
# earlier in this notebook lists only age_group and price_group — confirm the
# name still describes what is being written.
customers.to_csv(
    "../output/33.groups_news_age_price.csv",
    columns=["customer_id", "prediction"],
    index=False,
    header=True,
)