# H&M Personalized Fashion Recommendations: 1. Data analysis

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from scipy.sparse import csr_matrix

from tqdm import tqdm
from datetime import datetime, timedelta

import logging
logging.basicConfig(level=logging.INFO)
logging.info("test")

import sys
sys.path.append("..")
from src.utils import *

INFO:root:test


In [2]:
# article_id values carry leading zeros (e.g. "0868405004"), so they must be
# parsed as strings rather than integers.
article_id_as_str = {"article_id": str}

articles = pd.read_csv("../input/articles_proc.csv", dtype=article_id_as_str)
customers = pd.read_csv("../input/customers_proc.csv")
transactions = pd.read_csv("../input/transactions_full.csv", dtype=article_id_as_str)

# Popular articles by customer group

In [3]:
# Training window: transactions dated on or after the cutoff that
# subtract_days() returns for (latest t_dat, 7 * 4), joined with each
# buyer's age_group / price_group.
min_date = subtract_days(transactions["t_dat"].max(), 7 * 4)
recent_rows = transactions["t_dat"] >= min_date
train = transactions.loc[recent_rows, ["t_dat", "customer_id", "article_id"]].merge(
    customers[["customer_id", "age_group", "price_group"]],
    on="customer_id",
    how="inner",  # rows whose customer_id is missing from `customers` are dropped
)
train

Unnamed: 0,t_dat,customer_id,article_id,age_group,price_group
0,2020-08-25,00015c1a121e08bbd2552c15fbbb6e6b19d3bf8f7b6a3d...,0868405004,22-29,high
1,2020-08-25,00015c1a121e08bbd2552c15fbbb6e6b19d3bf8f7b6a3d...,0918292001,22-29,high
2,2020-08-25,00026b2e6fa592951b3f1f5b24fea406a356d23e84fc80...,0903062001,54+,medium
3,2020-08-25,0010e8eb18f131e724d6997909af0808adbba057529edb...,0828067001,22-29,high
4,2020-08-25,0010e8eb18f131e724d6997909af0808adbba057529edb...,0883033001,22-29,high
...,...,...,...,...,...
1085225,2020-09-22,ffb72741f3bc3d98855703b55d34e05bc7893a5d6a99a3...,0914404001,22-29,medium
1085226,2020-09-22,ffb72741f3bc3d98855703b55d34e05bc7893a5d6a99a3...,0751471041,22-29,medium
1085227,2020-09-22,ffb72741f3bc3d98855703b55d34e05bc7893a5d6a99a3...,0751471038,22-29,medium
1085228,2020-09-22,ffc92c3f7b0b302f393c2968b290f6e5c5b5510d1cf1df...,0895730002,45-54,high


In [8]:
# Number of training rows per price_group, most frequent first.
train["price_group"].value_counts()

high      689148
medium    355529
low        40553
Name: price_group, dtype: int64

In [9]:
# Enumerate every distinct (age_group, price_group) pair seen in `train`, in
# order of first appearance, and build both lookup directions.
distinct_pairs = train[["age_group", "price_group"]].drop_duplicates()
i_group_map = {
    i: pair
    for i, pair in enumerate(distinct_pairs.itertuples(index=False, name=None))
}
group_i_map = {pair: i for i, pair in i_group_map.items()}
group_i_map

{('22-29', 'high'): 0,
 ('54+', 'medium'): 1,
 ('16-21', 'medium'): 2,
 ('30-44', 'high'): 3,
 ('45-54', 'high'): 4,
 ('22-29', 'medium'): 5,
 ('54+', 'high'): 6,
 ('30-44', 'medium'): 7,
 ('30-44', 'low'): 8,
 ('45-54', 'medium'): 9,
 ('45-54', 'low'): 10,
 ('16-21', 'high'): 11,
 ('22-29', 'low'): 12,
 ('54+', 'low'): 13,
 ('16-21', 'low'): 14}

In [10]:
def get_group(line):
    """Return the integer group id for a row's (age_group, price_group) pair.

    `line` is any row-like object indexable by "age_group" and "price_group".
    The id comes from the module-level `group_i_map`; a pair that is not a key
    of that mapping raises KeyError.
    """
    return group_i_map[line["age_group"], line["price_group"]]
# Tag every transaction and every customer with its integer group id.
# DataFrame.apply(axis=1) builds a Series object for each row, which is very
# slow on frames with more than a million rows. Zipping the two key columns
# performs the same group_i_map lookup (still a KeyError on an unseen pair)
# without that per-row overhead.
train["group"] = [
    group_i_map[pair]
    for pair in zip(train["age_group"], train["price_group"])
]
customers["group"] = [
    group_i_map[pair]
    for pair in zip(customers["age_group"], customers["price_group"])
]

In [11]:
# Number of training rows per group id, most frequent first.
train["group"].value_counts()

0     255532
3     185572
4     130917
5     121130
7      77004
6      73273
2      63514
9      59573
11     43854
1      34308
12     10973
14     10521
8       9466
10      5642
13      3951
Name: group, dtype: int64

In [12]:
def get_tr_list(transactions, weeks: int = 4):
    """Build look-back slices of `transactions`, one per week count.

    Element ``k`` (0-based) contains the rows whose ``t_dat`` is greater than
    or equal to ``subtract_days(latest, 7 * (k + 1))``, where ``latest`` is the
    maximum ``t_dat`` of the frame. The returned list has `weeks` elements.

    NOTE(review): the lower bound is inclusive. If `subtract_days` steps back
    exactly ``7 * i`` calendar days, slice ``i`` spans ``7 * i + 1`` days —
    confirm that this is intended.
    """
    latest = transactions["t_dat"].max()
    cutoffs = (subtract_days(latest, 7 * week) for week in range(1, weeks + 1))
    return [transactions[transactions["t_dat"] >= cutoff] for cutoff in cutoffs]

# Look-back slices of `train` for 1, 2, 3 and 4 weeks; index 0 is the 1-week slice.
train_list = get_tr_list(train, weeks=4)

In [13]:
# The 12 most frequently purchased article ids in the 1-week slice
# (value_counts orders by descending frequency).
popular_w1 = train_list[0]["article_id"].value_counts().index[:12].tolist()
popular_w1

['0924243001',
 '0924243002',
 '0923758001',
 '0918522001',
 '0909370001',
 '0866731001',
 '0751471001',
 '0915529003',
 '0915529005',
 '0448509014',
 '0762846027',
 '0714790020']

In [14]:
# For every group id, the 12 most frequently purchased article ids among that
# group's rows in the 1-week slice.
last_week = train_list[0]
popular_w1_groups = {}
for group_i in group_i_map.values():
    in_group = last_week["group"] == group_i
    group_counts = last_week.loc[in_group, "article_id"].value_counts()
    popular_w1_groups[group_i] = group_counts.index[:12].tolist()
popular_w1_groups

{0: ['0924243001',
  '0909370001',
  '0923758001',
  '0924243002',
  '0915529005',
  '0889550002',
  '0918522001',
  '0866731001',
  '0929275001',
  '0934835001',
  '0448509014',
  '0896169005'],
 1: ['0930380001',
  '0924243001',
  '0751471043',
  '0751471001',
  '0918522001',
  '0924243002',
  '0865799006',
  '0910601003',
  '0863595004',
  '0923758001',
  '0910601002',
  '0751471038'],
 2: ['0918522001',
  '0685813003',
  '0685814001',
  '0448509014',
  '0715624001',
  '0866731001',
  '0850917001',
  '0918292001',
  '0911699002',
  '0751471001',
  '0685814003',
  '0706016001'],
 3: ['0924243001',
  '0909370001',
  '0923758001',
  '0889550002',
  '0915529003',
  '0919273002',
  '0918525001',
  '0906352001',
  '0935541001',
  '0933032002',
  '0923340001',
  '0896169005'],
 4: ['0924243002',
  '0924243001',
  '0928206001',
  '0923758001',
  '0919273002',
  '0910601003',
  '0915529003',
  '0806131012',
  '0894780001',
  '0751471001',
  '0762846027',
  '0918890002'],
 5: ['0866731001',
 

In [15]:
def get_item_dict(transactions):
    """Count purchases per customer and article.

    Walks the "customer_id" and "article_id" columns in parallel and returns
    ``{customer_id: {article_id: number_of_rows}}``. Both levels are plain
    dicts whose keys appear in first-seen order.
    """
    counts_by_customer = {}
    for customer, article in zip(transactions["customer_id"], transactions["article_id"]):
        basket = counts_by_customer.setdefault(customer, {})
        basket[article] = basket.get(article, 0) + 1
    return counts_by_customer

# One purchase-count dictionary per look-back slice, in train_list order.
purchase_list = [get_item_dict(week_slice) for week_slice in train_list]

In [16]:
def get_prediction(customer, purchase_history=None, group_popular=None,
                   overall_popular=None, top_n: int = 12):
    """Build the space-separated recommendation string for one customer.

    Candidates are collected in priority order, skipping anything already
    chosen:

    1. the customer's own purchases, one look-back slice at a time in the
       order of `purchase_history`, most-bought first within a slice;
    2. the popular articles of the customer's group;
    3. the overall popular articles.

    Parameters
    ----------
    customer : row-like
        Must be indexable by "customer_id" and "group".
    purchase_history : list of dict, optional
        ``{customer_id: {article_id: count}}`` per slice. Defaults to the
        module-level `purchase_list`.
    group_popular : dict, optional
        ``{group: [article_id, ...]}``. Defaults to `popular_w1_groups`.
    overall_popular : list, optional
        Fallback article ids. Defaults to `popular_w1`.
    top_n : int
        Maximum number of article ids in the result (12 by default).

    Returns
    -------
    str
        At most `top_n` article ids joined by single spaces.

    Raises
    ------
    KeyError
        If the customer's group is not a key of `group_popular`.
    """
    # Fall back to the notebook-level tables so that
    # `customers.apply(get_prediction, axis=1)` keeps working unchanged.
    if purchase_history is None:
        purchase_history = purchase_list
    if group_popular is None:
        group_popular = popular_w1_groups
    if overall_popular is None:
        overall_popular = popular_w1

    customer_id = customer["customer_id"]
    group = customer["group"]

    prediction = []
    seen = set()  # O(1) membership test instead of scanning `prediction`

    def add_new(candidates):
        # Append candidates in order, skipping ids that were already chosen.
        for article_id in candidates:
            if article_id not in seen:
                seen.add(article_id)
                prediction.append(article_id)

    for purchase_w in purchase_history:
        if customer_id in purchase_w:
            # Stable sort: ties keep the dict's insertion order.
            ranked = sorted(purchase_w[customer_id].items(), key=lambda kv: -kv[1])
            add_new(article_id for article_id, _ in ranked)

    add_new(group_popular[group])
    add_new(overall_popular)

    return " ".join(prediction[:top_n])
        
# Build the recommendation string for every customer.
# get_prediction only reads "customer_id" and "group", so pass just those two
# values instead of using DataFrame.apply(axis=1), which constructs a full
# Series for each of the ~1.37M wide rows and is far slower.
customers["prediction"] = [
    get_prediction({"customer_id": customer_id, "group": group})
    for customer_id, group in zip(customers["customer_id"], customers["group"])
]

In [17]:
# Display the customers frame, which now carries the "group" and "prediction" columns.
customers

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,age_group,price_min,price_max,...,Baby/Children_count,Menswear_count,Sport_count,Divided_count,common_group,sex,has_children,price_group,group,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,1.0,1.0,ACTIVE,NONE,49.0,Other,45-54,0.010153,0.054220,...,5.0,1.0,0.0,2.0,Lady,Woman,1,medium,9,0568601043 0751471001 0448509014 0751471038 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,1.0,1.0,ACTIVE,NONE,25.0,Other,22-29,0.006763,0.084729,...,4.0,0.0,1.0,19.0,Lady,Woman,1,high,0,0924243001 0909370001 0923758001 0924243002 09...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,1.0,1.0,ACTIVE,NONE,24.0,Other,22-29,0.013542,0.067780,...,0.0,4.0,1.0,1.0,Lady,Woman,0,high,0,0794321007 0924243001 0909370001 0923758001 09...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,1.0,1.0,ACTIVE,NONE,54.0,Other,45-54,0.030492,0.030492,...,0.0,0.0,2.0,0.0,Divided,Unknown,0,low,10,0889456001 0850917003 0781758003 0889456002 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,Other,45-54,0.016932,0.059305,...,0.0,0.0,2.0,0.0,Lady,Woman,0,medium,9,0751471001 0448509014 0751471038 0751471043 07...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,1.0,1.0,ACTIVE,NONE,24.0,Other,22-29,0.005068,0.067780,...,0.0,1.0,4.0,0.0,Lady,Woman,0,high,0,0713997002 0720125039 0740922009 0791587007 08...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,1.0,1.0,ACTIVE,NONE,21.0,Other,16-21,0.005068,0.076254,...,7.0,2.0,0.0,48.0,Divided,Woman,1,high,11,0918522001 0929275001 0898692006 0865799006 04...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,ACTIVE,Regularly,21.0,Other,16-21,0.004559,0.042356,...,0.0,5.0,3.0,18.0,Divided,Woman,0,medium,2,0689365050 0884081001 0794819001 0762846027 09...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,1.0,1.0,ACTIVE,Regularly,18.0,Other,16-21,0.013542,0.050831,...,0.0,0.0,0.0,4.0,Divided,Woman,0,medium,2,0918522001 0685813003 0685814001 0448509014 07...


In [18]:
# Export only the customer id and its prediction string, with a header row
# and without the DataFrame index.
submission = customers[["customer_id", "prediction"]]
submission.to_csv(
    "../output/31.group_time_bench_age_price_max.csv",
    index=False,
    header=True,
)