<a href="https://colab.research.google.com/github/QSLV/personalized_fashion_recs/blob/main/sample_construct.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 目的： 负样本构建测试， 使用简单LGBMRanker做快速validation

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install kaggle
# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && cp /content/drive/MyDrive/kaggle/kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json
!wget https://raw.githubusercontent.com/benhamner/Metrics/master/Python/ml_metrics/average_precision.py

--2022-04-21 11:18:29--  https://raw.githubusercontent.com/benhamner/Metrics/master/Python/ml_metrics/average_precision.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1654 (1.6K) [text/plain]
Saving to: ‘average_precision.py’


2022-04-21 11:18:29 (16.2 MB/s) - ‘average_precision.py’ saved [1654/1654]



In [67]:
# Load Packages
import pandas as pd
import numpy as np
from lightgbm.sklearn import LGBMRanker
from average_precision import apk, mapk
import random
from sklearn.base import BaseEstimator, TransformerMixin

In [8]:
# Helper funcitons

def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

def article_id_str_to_int(series):
    return series.astype('int32')

def article_id_int_to_str(series):
    return '0' + series.astype('str')

class Categorize(BaseEstimator, TransformerMixin):
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []
        
    def fit(self, X):
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            self.categories.append(vc[vc > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)

In [4]:
%%time
transactions = pd.read_parquet('/content/drive/MyDrive/H&M/data/transactions_train.parquet')
customers = pd.read_parquet('/content/drive/MyDrive/H&M/data/customers.parquet')
articles = pd.read_parquet('/content/drive/MyDrive/H&M/data/articles.parquet')

CPU times: user 2.27 s, sys: 2.65 s, total: 4.92 s
Wall time: 19.8 s


In [178]:
# Separate train / validation 
test_week = transactions.week.max()
valid = transactions[transactions.week == test_week].copy()
train = transactions[(transactions.week < test_week) & (transactions.week >= transactions.week.max() - 10)].copy()

In [134]:
# 构建label (默认正样本)
train['purchased'] = 1
valid['purchased'] = 1

In [14]:
train.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased
28777300,2020-07-08,857913002275398,599580068,0.008458,1,94,1
28777301,2020-07-08,857913002275398,776237011,0.025407,1,94,1
28777302,2020-07-08,857913002275398,844294001,0.011847,1,94,1
28787123,2020-07-08,1658289241058394,877773001,0.00761,1,94,1
28788562,2020-07-08,3828854365940846,507883009,0.013542,1,94,1


In [17]:
len(train)

2809233

In [25]:
# 每周有多少笔实际发生的交易
transactions_by_weeks = {}
for week in train.week.unique():
  transactions_by_weeks[week] = len(train[train['week'] == week])
avg_transactions_by_weeks = sum(transactions_by_weeks.values()) / len(transactions_by_weeks.values())
print("Average transactions number per week is {}".format(avg_transactions_by_weeks))

Average transactions number per week is 280923.3


In [31]:
# 每周平均有多少用户发生购买
avg_customers_by_weeks = train.groupby(['week', 'customer_id'])['customer_id'].value_counts().groupby('week').size().mean()
print("Average customers number per week is {}".format(avg_customers_by_weeks))

Average customers number per week is 76272.2


平均每周每个用户发生3个购买行为（正样本）， 考虑每个用户构建1-3个负样本。

# 负样本构建 (get_neg_samples)

## 方法1: 从每周的top12商品中，若用户没有购买，则考虑作为负样本

In [123]:
# 获取每周top12排名
sales = transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank').astype('int8')

In [116]:
# 每周top12的dict
weeks_candidates = {}
for item in list(sales.index):
  # candidate = '0' + str(item[1])
  candidate = item[1]
  if item[0] not in weeks_candidates.keys():
    weeks_candidates[item[0]] = [candidate]
  else:
    weeks_candidates[item[0]].append(candidate)

In [127]:
def generate_neg_samples(existed_purchased: str, candidates: list, num = 1):
  purchased = existed_purchased.strip().split()
  cnt = 0
  res = []
  while cnt < num:
    idx = random.randint(0, 11)
    selected_candidate = str(candidates[idx])
    if selected_candidate in purchased:
      continue
    else:
      cnt = cnt + 1
      res.append(selected_candidate)
  return ' '.join(res)

In [149]:
train.article_id = ' ' + train.article_id.astype('str')
train_neg_samples = pd.DataFrame(train.groupby(['week', 'customer_id']).article_id.sum().reset_index())
train_neg_samples

Unnamed: 0,week,customer_id,article_id
0,94,28847241659200,730683036 851094001 757303012
1,94,208119717816961,572797049 540334001 572797002 859105007
2,94,857913002275398,599580068 776237011 844294001
3,94,1037449031262554,801938001 801938001
4,94,1520973890714130,836286001 828933003 840607001 750422023 ...
...,...,...,...
762717,103,18445340048433064259,906612002
762718,103,18445412194736247951,913688003 751471001 865929003
762719,103,18446250046654386343,869872006
762720,103,18446420423308293068,579541001 854193004 577512001 577512001 ...


In [150]:
%%time
train_neg_samples['neg_samples'] = train_neg_samples.apply(lambda x: generate_neg_samples(x.article_id, weeks_candidates[x.week], num=2), axis=1)

CPU times: user 20.8 s, sys: 301 ms, total: 21.1 s
Wall time: 21.1 s


In [153]:
train_neg_samples

Unnamed: 0,week,customer_id,article_id,neg_samples
0,94,28847241659200,730683036 851094001 757303012,866383006 372860002
1,94,208119717816961,572797049 540334001 572797002 859105007,730683021 778064028
2,94,857913002275398,599580068 776237011 844294001,781613013 781613013
3,94,1037449031262554,801938001 801938001,372860001 805308002
4,94,1520973890714130,836286001 828933003 840607001 750422023 ...,778064028 806388001
...,...,...,...,...
762717,103,18445340048433064259,906612002,673677002 923758001
762718,103,18445412194736247951,913688003 751471001 865929003,909370001 673677002
762719,103,18446250046654386343,869872006,809238005 809238001
762720,103,18446420423308293068,579541001 854193004 577512001 577512001 ...,923758001 448509014


In [174]:
# 生成每一行都是负样本
neg_samples = train_neg_samples['neg_samples'].str.split(' ', expand=True)
neg_samples = neg_samples.stack().reset_index(level=1, drop=True)
neg_samples.name = 'neg'
neg_samples = train_neg_samples.drop(['neg_samples', 'article_id'], axis=1).join(neg_samples)
neg_samples.rename(columns={'neg':'article_id'}, inplace=True)
neg_samples.article_id = neg_samples.article_id.astype('int32')
neg_samples

Unnamed: 0,week,customer_id,article_id
0,94,28847241659200,866383006
0,94,28847241659200,372860002
1,94,208119717816961,730683021
1,94,208119717816961,778064028
2,94,857913002275398,781613013
...,...,...,...
762719,103,18446250046654386343,809238001
762720,103,18446420423308293068,923758001
762720,103,18446420423308293068,448509014
762721,103,18446630855572834764,448509014


In [169]:
# 获取商品价格信息
mean_price = transactions \
    .groupby(['week', 'article_id'])['price'].mean()
mean_price.head()

week  article_id
0     108775015     0.008373
      108775044     0.008374
      108775051     0.005023
      110065001     0.024983
      110065002     0.024650
Name: price, dtype: float32

In [170]:
# 获取用户购买信息（购买渠道）
unique_transactions = transactions \
    .groupby(['week', 'customer_id']) \
    .head(1) \
    .drop(columns=['article_id', 'price', 't_dat']) \
    .copy()
unique_transactions.head()

Unnamed: 0,customer_id,sales_channel_id,week
25784,1728846800780188,2,0
5389,2076973761519164,2,0
47429,2918879973994241,1,0
14650,4233235614030232,2,0
47464,7002608101212285,2,0


In [175]:
# 填充负样本的特征信息
neg_samples = pd.merge(neg_samples, mean_price, on=['week', 'article_id'])
neg_samples = pd.merge(neg_samples, unique_transactions, on=['week', 'customer_id'])
neg_samples['purchased'] = 0
neg_samples

Unnamed: 0,week,customer_id,article_id,price,sales_channel_id,purchased
0,94,28847241659200,866383006,0.024971,1,0
1,94,28847241659200,372860002,0.013233,1,0
2,94,2416804118419582,866383006,0.024971,1,0
3,94,2416804118419582,730683050,0.041446,1,0
4,94,4181258700444159,866383006,0.024971,1,0
...,...,...,...,...,...,...
1525439,103,18395225685106483763,918522001,0.041416,2,0
1525440,103,18405720660127377943,918522001,0.041416,1,0
1525441,103,18405720660127377943,918522001,0.041416,1,0
1525442,103,18412798578613536065,918522001,0.041416,1,0


In [176]:
# 正样本
pos_samples = train[['week', 'customer_id', 'article_id', 'price', 'sales_channel_id']].reset_index(drop=True)
pos_samples['purchased'] = 1
pos_samples

Unnamed: 0,week,customer_id,article_id,price,sales_channel_id,purchased
0,94,857913002275398,599580068,0.008458,1,1
1,94,857913002275398,776237011,0.025407,1,1
2,94,857913002275398,844294001,0.011847,1,1
3,94,1658289241058394,877773001,0.007610,1,1
4,94,3828854365940846,507883009,0.013542,1,1
...,...,...,...,...,...,...
2809228,103,18446630855572834764,568601045,0.050831,2,1
2809229,103,18446630855572834764,568601045,0.050831,2,1
2809230,103,18446630855572834764,898713001,0.067780,2,1
2809231,103,18446630855572834764,898713001,0.067780,2,1


### 整合上述流程

In [193]:
# 整合-生成负样本
def get_neg_samples(inputX):
  # 先获取用户已购商品信息
  inputX.article_id = ' ' + inputX.article_id.astype('str')
  train_neg_samples = pd.DataFrame(inputX.groupby(['week', 'customer_id']).article_id.sum().reset_index())

  # 负样本采样
  train_neg_samples['neg_samples'] = train_neg_samples.apply(lambda x: generate_neg_samples(x.article_id, weeks_candidates[x.week], num=2), axis=1)
  
  # 采样后，由一行展开成多行
  neg_samples = train_neg_samples['neg_samples'].str.split(' ', expand=True)
  neg_samples = neg_samples.stack().reset_index(level=1, drop=True)
  neg_samples.name = 'neg'
  neg_samples = train_neg_samples.drop(['neg_samples', 'article_id'], axis=1).join(neg_samples)
  neg_samples.rename(columns={'neg':'article_id'}, inplace=True)
  neg_samples.article_id = neg_samples.article_id.astype('int32')
  
  # 获取其他特征信息，如价格和购买渠道
  neg_samples = pd.merge(neg_samples, mean_price, on=['week', 'article_id'])
  neg_samples = pd.merge(neg_samples, unique_transactions, on=['week', 'customer_id'])

  # 打标签
  neg_samples['purchased'] = 0
  
  return neg_samples

# 整合-生成正样本
def get_pos_samples(inputX):
  pos_samples = inputX[['week', 'customer_id', 'article_id', 'price', 'sales_channel_id']].reset_index(drop=True)
  pos_samples['purchased'] = 1
  pos_samples.article_id = pos_samples.article_id.astype('int32')
  return pos_samples

In [180]:
train_neg_samples = get_neg_samples(train)
train_neg_samples

Unnamed: 0,week,customer_id,article_id,price,sales_channel_id,purchased
0,94,28847241659200,806388001,0.013301,1,0
1,94,28847241659200,806388001,0.013301,1,0
2,94,1037449031262554,806388001,0.013301,2,0
3,94,1037449031262554,778064028,0.006937,2,0
4,94,1520973890714130,806388001,0.013301,1,0
...,...,...,...,...,...,...
1525439,103,18268074439206914618,918292001,0.041424,2,0
1525440,103,18325073139841775546,918292001,0.041424,2,0
1525441,103,18325073139841775546,918292001,0.041424,2,0
1525442,103,18425562864985812341,918292001,0.041424,2,0


In [194]:
train_pos_samples = get_pos_samples(train)
train_pos_samples

Unnamed: 0,week,customer_id,article_id,price,sales_channel_id,purchased
0,94,857913002275398,599580068,0.008458,1,1
1,94,857913002275398,776237011,0.025407,1,1
2,94,857913002275398,844294001,0.011847,1,1
3,94,1658289241058394,877773001,0.007610,1,1
4,94,3828854365940846,507883009,0.013542,1,1
...,...,...,...,...,...,...
2809228,103,18446630855572834764,568601045,0.050831,2,1
2809229,103,18446630855572834764,568601045,0.050831,2,1
2809230,103,18446630855572834764,898713001,0.067780,2,1
2809231,103,18446630855572834764,898713001,0.067780,2,1


## 合并正负初始样本， 与customer和article进行merge，获取静态特征

In [199]:
train_samples = pd.concat([train_neg_samples, train_pos_samples]).drop_duplicates(['customer_id', 'article_id', 'week'])

In [196]:
train_samples.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3991128 entries, 0 to 2809232
Data columns (total 6 columns):
 #   Column            Dtype  
---  ------            -----  
 0   week              int64  
 1   customer_id       uint64 
 2   article_id        int32  
 3   price             float32
 4   sales_channel_id  int8   
 5   purchased         int64  
dtypes: float32(1), int32(1), int64(2), int8(1), uint64(1)
memory usage: 156.1 MB


In [200]:
train_samples = pd.merge(train_samples, articles, on='article_id', how='left')
train_samples = pd.merge(train_samples, customers, on='customer_id', how='left')


In [201]:
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code',]

In [203]:
train_x = train_samples[columns_to_use]
train_y = train_samples['purchased']

# 模型构建： LGBMRanker

In [206]:
train_baskets = train_samples.groupby(['week', 'customer_id'])['article_id'].count().values

In [207]:
ranker = LGBMRanker(
    objective = 'lambdarank',
    metric = 'ndcg',
    boosting_type = 'dart',
    n_estimators = 3,
    importance_type = 'gain',
    verbose = 10
)

In [208]:
ranker = ranker.fit(
    train_x,
    train_y,
    group=train_baskets,
)

In [209]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

article_id 0.9999999999999993
age 3.4315462166872547e-16
postal_code 2.94132307118057e-16
club_member_status 0.0
Active 0.0
FN 0.0
garment_group_no 0.0
section_no 0.0
fashion_news_frequency 0.0
index_code 0.0
department_no 0.0
perceived_colour_master_id 0.0
perceived_colour_value_id 0.0
colour_group_code 0.0
graphical_appearance_no 0.0
product_type_no 0.0
index_group_no 0.0
