In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import gc
import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


# About my pipeline and notebook

**General idea about my pipeline:**
1) Preprocessing input tables in pyspark

2) Build features related to last week of transactions in a rolling way, based on the idea that my ranker should be reactive: consider trends, top sold items, other ideas derivable from Transactions table

3) For customers and items tables I will build simple numerical and categorical features (some features could be one hot encoded)

4) Generate negative observations: the strategy used to generate them is the most important step of the challenge. At the moment I rely on the analysis I performed on the transactions table. The final train table will contain *x customers times n candidate rows*, so the dataset quickly becomes heavy. The same applies to the application table: with 20 candidates per user, the table grows to around 25 million rows.

5) At the end of the pipeline I save the train dataset and the application dataset to disk in order to leave as much RAM as possible free for model development. I do this with the pyspark CSV writer because calling toPandas() on such big tables is not feasible.

6) My model will be a LightGBM ranker.

**Current attention points**

1) Need to generate new strategies for candidate items
2) Although the pipeline runs, RAM usage is becoming a problem again

# This section contains prints of descriptive information about the input datasets

In [None]:
# articles: (105542, 25)
#######################################
# unique for each col: article_id                      105542
# product_code                     47224
# prod_name                        45875
# product_type_no                    132
# product_type_name                  131
# product_group_name                  19
# graphical_appearance_no             30
# graphical_appearance_name           30
# colour_group_code                   50
# colour_group_name                   50
# perceived_colour_value_id            8
# perceived_colour_value_name          8
# perceived_colour_master_id          20
# perceived_colour_master_name        20
# department_no                      299
# department_name                    250
# index_code                          10
# index_name                          10
# index_group_no                       5
#index_group_name                     5
# section_no                          57
# section_name                        56
# garment_group_no                    21
# garment_group_name                  21
# detail_desc                      43404
# dtype: int64
#######################################
# null count: article_id                        0
# detail_desc                     416
# dtype: int64
#######################################

In [None]:
# customers: (1371980, 7)
#######################################
# unique for each col: customer_id               1371980
# FN                              1
# Active                          1
# club_member_status              3
# fashion_news_frequency          4
# age                            84
# postal_code                352899
#dtype: int64
#######################################
# null count: customer_id                    0
# FN                        895050
# Active                    907576
# club_member_status          6062
# fashion_news_frequency     16009
# age                        15861
# postal_code                    0
# dtype: int64
#######################################


In [None]:
# transactions: (31788324, 5)
#######################################
# unique for each col: t_dat                   734
# customer_id         1362281
# article_id           104547
# price                  9857
# sales_channel_id          2
# dtype: int64
#######################################
# null count: t_dat               0
#######################################


In [None]:
#application: (1371980, 2)
#######################################
# unique for each col: customer_id    1371980
# prediction           1
# dtype: int64
#######################################
# null count: customer_id    0
# prediction     0
# dtype: int64
#######################################


# Pyspark

pyspark will be used for feature engineering and preprocessing

In [None]:
!pip install pyspark -q
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, ArrayType, DoubleType, BooleanType

# Build (or reuse) the Spark session. `getOrCreate()` already returns a
# SparkSession, so it is used directly instead of being wrapped in another
# `SparkSession(...)` call: that constructor expects a SparkContext and only
# appears to work with a session object through duck typing.
# spark.sql.files.maxPartitionBytes is set to 5,000,000 bytes per input partition.
spark = (
    SparkSession.builder
    .appName("Recommendations")
    .config("spark.sql.files.maxPartitionBytes", 5000000)
    .getOrCreate()
)
# `sc` previously referred to the session object; kept as an alias so any
# existing use of that name keeps working.
sc = spark

In [None]:
# Load the three competition tables; every column is read as a string
# because no schema inference is requested.
articles = (
    spark.read
    .option("header", True)
    .csv("../input/h-and-m-personalized-fashion-recommendations/articles.csv")
)
customers = (
    spark.read
    .option("header", True)
    .csv("../input/h-and-m-personalized-fashion-recommendations/customers.csv")
)
transactions = (
    spark.read
    .option("header", True)
    .csv("../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv")
)

Articles table, simplified

In [None]:
# Keep only the numeric identifier/code columns of the articles table,
# each cast to int, and remove duplicated rows.
article_int_casts = [
    'cast (article_id as int) article_id',
    'cast (product_type_no as int) product_type_no',
    'cast (graphical_appearance_no as int) graphical_appearance_no',
    'cast (colour_group_code as int) colour_group_code ',
    'cast (perceived_colour_value_id as int) perceived_colour_value_id',
    'cast (department_no as int) department_no',
    'cast (index_group_no as int) index_group_no',
    'cast (section_no as int) section_no',
    'cast (garment_group_no as int) garment_group_no',
]
articles = articles.selectExpr(*article_int_casts).dropDuplicates()

articles.show(10)

Customer table

In [None]:
# Reduce the customers table to customer_id and age.
# Missing ages are filled with the string '25' (the column was read as text).
customers = (
    customers
    .fillna({'age': '25'})
    .drop('FN', 'Active', 'club_member_status', 'fashion_news_frequency', 'postal_code')
)


# Weeks preprocessing

In [None]:
# First transaction date kept in the training window (inclusive).
start_date = '2020-07-22'

# Inclusive range of week indices assigned to the transactions.
min_week = 1
max_week = 9
# Week index used for the rows that will be scored for the submission.
application_week = max_week + 1
#week: changed to tuesday
print(application_week)

In [None]:
# Assign every transaction to a 7-day bin counted from `start_date`.
# The index is derived from the date difference instead of a hand-written
# list of date ranges, so it stays consistent when `start_date`, `min_week`
# or `max_week` are changed. Dates outside [min_week, max_week] get the
# sentinel 999, as before.
week_number = (
    F.floor(
        F.datediff(F.to_date(F.col('t_dat')), F.to_date(F.lit(start_date))) / 7
    ).cast('int')
    + 1
)

transactions = (
    transactions
    .withColumn('article_id', F.col('article_id').cast(IntegerType()))
    .filter(F.col('t_dat') >= start_date)
    .withColumn(
        'week',
        F.when(week_number.between(min_week, max_week), week_number).otherwise(999),
    )
    .drop('t_dat', 'price', 'sales_channel_id')
    .orderBy(['week', 'customer_id'], ascending=True)
)

transactions.show(10)
# code to generate unique transaction id:
# .withColumn('t_id', F.concat_ws('_',transactions.t_dat, transactions.customer_id))\

# A bit of useful statistics

In [None]:
# Row count of the filtered transactions for each week index.
transactions_per_week = (
    transactions
    .groupBy('week')
    .count()
    .orderBy('week', ascending=True)
)

transactions_per_week.show(10)
transactions_per_week.unpersist()

# check transactions loaded
# remove data from memory

In [None]:
from pyspark.sql.functions import countDistinct

# Single-row frame holding the number of distinct customers in the window.
unique_customers = transactions.select(countDistinct('customer_id'))

n_unique_customers = unique_customers.first()[0]
print(f"customer_id in current perimeter : {n_unique_customers}")
unique_customers.unpersist()

In [None]:
# Transaction rows per (customer, week), re-labelled with week + 1 so that
# the count can be joined as a "previous week" feature.
customers_orders_lw = (
    transactions
    .groupBy('customer_id', 'week')
    .count()
    .orderBy('count', ascending=False)
    .withColumnRenamed('count', 'lw_orders_count')
    .withColumn('week', F.col('week') + 1)
)

customers_orders_lw.show(10)

Generate list of most sold items and top 12 rank for each week

In [None]:
# Rank articles by number of transaction rows within each week and keep the
# 12 best sellers per week.

articles_weekly_sales = (
    transactions
    .groupBy('article_id', 'week')
    .count()
    .withColumnRenamed('count', 'articles_order_count')
)

# article_id is a secondary sort key: without it, articles with equal counts
# could receive different row numbers each time this lazy plan is
# re-evaluated, and the train and application tables could disagree.
w_articles = Window.partitionBy('week').orderBy(
    F.col('articles_order_count').desc(),
    F.col('article_id').asc(),
)

articles_rank = (
    articles_weekly_sales
    .withColumn('rank', F.row_number().over(w_articles))
    .filter(F.col('rank') <= 12)
    .drop('articles_order_count')
    .orderBy(['rank', 'week'])
)

# The top-12 table is exactly the rank table (already filtered to rank <= 12);
# the name is kept because later cells refer to it.
articles_top12 = articles_rank

articles_rank.show(20)
articles_top12.show(25)

In [None]:
# The row counts above are already defined on the un-deduplicated frame,
# so from here on one row per (customer, article, week) is enough.
transactions = transactions.dropDuplicates()

transactions.show(10)

Previously bought items

In [None]:
# Goal: move each customer's basket to the NEXT week in which that customer
# bought something, to obtain candidate (negative) rows for that later week.
# Customers may skip weeks, so purchase weeks are numbered per customer and
# the numbering (not the calendar week) is shifted by one.

# One row per (customer, week with at least one purchase).
rn_transactions = transactions.select('customer_id', 'week').dropDuplicates()

w_transactions = Window.partitionBy('customer_id').orderBy(F.col('week').asc())

# Sequential number of each purchase week within a customer.
rn_transactions = (
    rn_transactions
    .withColumn('week_rn', F.row_number().over(w_transactions))
    .select('customer_id', 'week', 'week_rn')
    .orderBy(['customer_id', 'week'], ascending=True)
)

rn_transactions.show(10)

# Same table with the week exposed as `new_week` (the destination week).
rn_transactions0 = (
    rn_transactions
    .drop('article_id')
    .withColumnRenamed('week', 'new_week')
    .dropDuplicates()
)

# Shift the purchase number by one so that purchase k lines up with purchase
# k + 1, then attach the articles bought in the original (earlier) week.
shifted_rn = rn_transactions.withColumn('week_rn', F.col('week_rn') + 1)
last_purchase = (
    shifted_rn
    .join(rn_transactions0, ['customer_id', 'week_rn'], 'inner')
    .join(transactions, ['customer_id', 'week'], 'left')
)

last_purchase.show(10)

# Replace the source week with the destination week.
last_purchase = (
    last_purchase
    .drop('week')
    .withColumnRenamed('new_week', 'week')
    .orderBy(['customer_id', 'week'], ascending=True)
    .select('customer_id', 'article_id', 'week')
)

last_purchase.show(20)

lp_per_week = (
    last_purchase
    .groupBy('week')
    .count()
    .orderBy('week', ascending=True)
)

lp_per_week.show(10)
lp_per_week.unpersist()
rn_transactions0.unpersist()

Fix the uncertain Y

In [None]:
# Observed purchases, labelled y = 1.
transactions_pos = (
    transactions
    .select('customer_id', 'article_id', 'week')
    .withColumn('y', F.lit(1))
)

In [None]:
# Keep only true negatives: drop carried-over items that the customer did
# buy again in the destination week (those match a y = 1 row).
last_purchase = (
    last_purchase
    .join(transactions_pos, ['customer_id', 'article_id', 'week'], 'left')
    .fillna({'y': 0})
    .filter(F.col('y') == 0)
    .select('customer_id', 'article_id', 'week', 'y')
)

last_purchase.show(10)

Add top 12 items for each week

In [None]:
# Negative observations from the best sellers: the top-12 list of a week is
# re-labelled with week + 1 and offered to every customer active that week.

articles_top12_pw = articles_top12.withColumn('week', F.col('week') + 1)

listona = (
    transactions
    .select('customer_id', 'week')
    .dropDuplicates()
    .join(articles_top12_pw, ['week'], 'left')
    .join(transactions_pos, ['customer_id', 'article_id', 'week'], 'left')
    .fillna({'y': 0})
    .filter(F.col('y') == 0)
    .select('customer_id', 'article_id', 'week', 'y')
    .orderBy('customer_id', 'week')
)

listona.show(10)
articles_top12_pw.unpersist()
transactions_pos.unpersist()

Put everything together and join features to create train df

In [None]:
# Assemble the training table: positives + top-12 negatives + last-basket
# negatives, then join customer, article and rolling features.
# The earliest week is excluded because no candidates are generated for it.

# `articles_rank` is keyed by the week in which the sales happened. Joining
# it on the target week would leak the label, since the rank would include
# the purchases being predicted. It would also disagree with the application
# table, where the rank comes from the previous week. Shift it by one week
# so train and application use the same "last week rank" feature.
articles_rank_lw = articles_rank.withColumn('week', F.col('week') + 1)

train = (
    transactions
    .select('customer_id', 'article_id', 'week')
    .withColumn('y', F.lit(1))
    .unionByName(listona)
    .unionByName(last_purchase)
    .join(customers, 'customer_id', 'left')
    .join(articles_rank_lw, ['article_id', 'week'], 'left')
    .join(articles, 'article_id', 'left')
    .join(customers_orders_lw, ['customer_id', 'week'], 'left')
    .filter(~F.col('week').isin(min_week))
    # articles outside last week's top 12 get rank 999; customers without
    # purchases in the previous week get a count of 0
    .fillna({'rank': 999, 'lw_orders_count': 0})
    .orderBy(['week', 'customer_id'], ascending=True)
)

train.show(10)

Replicate the same schema on Application table

Note that exploding this table always yields roughly 1.3 million customers times the number of candidate items per customer.

In [None]:
# The sample submission provides the list of customers to score.
application = (
    spark.read
    .option("header", True)
    .csv("../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv")
)

# Candidate observations for Application

In [None]:
# Best sellers of the last observed week, re-labelled as candidates for the
# application week.
articles_top12_app = (
    articles_top12
    .filter(F.col('week') == max_week)
    .drop('week')
    .withColumn('week', F.lit(application_week))
    .dropDuplicates()
)

In [None]:
# Last week's best sellers as a plain Python list, best seller first.
# The Spark frame has no guaranteed row order after dropDuplicates(), so the
# list is sorted by rank explicitly. It is later used as an ordered filler
# for the submission, where position matters.

top_12 = articles_top12_app.toPandas()
top_12_lw = top_12.sort_values('rank')['article_id'].tolist()

print(top_12_lw)

In [None]:
# Rank feature for the application rows: ranks of the last observed week,
# re-labelled with the application week.
last_week_rank = (
    articles_rank
    .filter(F.col('week') == max_week)
    .drop('week')
    .withColumn('week', F.lit(application_week))
    .dropDuplicates()
)

In [None]:
# Last purchased basket of every customer, offered as candidates for the
# application week.

w_lp = Window.partitionBy('customer_id').orderBy(F.col('week').desc())

last_purchased_app = (
    transactions
    .select('customer_id', 'week')
    .dropDuplicates()
    .withColumn('rn', F.row_number().over(w_lp))
    .filter(F.col('rn') == 1)          # most recent purchase week per customer
    .drop('rn')
    .join(transactions, ['customer_id', 'week'], 'left')
    # Re-label the rows with the application week, as is done for the top-12
    # candidates. Without this the rows keep their historical week, and the
    # later join on (customer_id, week) picks up the order count of the week
    # before that old purchase instead of last week's.
    .withColumn('week', F.lit(application_week))
)

In [None]:
# Build the application (scoring) table with the same feature columns as
# the training table: candidates = last week's top 12 + last basket.
application = (
    application
    .select('customer_id')
    .dropDuplicates()
    .withColumn('week', F.lit(application_week))
    .join(articles_top12_app, 'week', 'left')
    .select('customer_id', 'article_id', 'week')
    .unionByName(last_purchased_app)
    .dropDuplicates()
    .join(customers, 'customer_id', 'left')
    # `week` is dropped from the rank table before joining on article_id;
    # otherwise the result carries two columns named `week`.
    .join(last_week_rank.drop('week'), 'article_id', 'left')
    .join(articles, 'article_id', 'left')
    .join(customers_orders_lw, ['customer_id', 'week'], 'left')
    .fillna({'rank': 999, 'lw_orders_count': 0})
    .drop('week')
    .dropDuplicates()
)

application.show(10)

Clear memory as much as possible before writing csv

In [None]:
import gc
# Ask Spark to drop any cached data for the intermediate frames, then run
# the Python garbage collector.
for _frame in (
    last_week_rank,
    articles_top12_app,
    last_purchased_app,
    transactions,
    customers,
    articles,
):
    _frame.unpersist()

gc.collect()

# Save datasets from pyspark to csv

In [None]:
import pandas as pd
import gc 

# Write the training table as a single CSV part file.
# mode('overwrite') makes the cell re-runnable: with the default mode the
# write fails when /kaggle/working/train_df already exists.
train.repartition(1).write.mode('overwrite').csv('/kaggle/working/train_df', header=True)

train.unpersist()
gc.collect()

Locate the CSV part file that Spark wrote for the train dataset and store its path

In [None]:
# Show the content of the parent directory and of the train output folder
# (paths are relative to the current working directory).
print(os.listdir("../"))
print(os.listdir("../working/train_df"))

In [None]:
# Resolve the path of the CSV part file inside the train output folder.
# Exactly one file starting with 'part' is expected. Concatenating several
# names (or none) would silently build a path that does not exist, so that
# situation is reported explicitly.
path_t = os.listdir("../working/train_df")
trim_t = [x for x in path_t if x.startswith('part')]
if len(trim_t) != 1:
    raise FileNotFoundError(
        f"expected exactly one part file in ../working/train_df, found {trim_t}"
    )
stringa_t = trim_t[0]
train_path = '../working/train_df/'+stringa_t

print(train_path)

In [None]:
# Save the application table as a single CSV part file.
# mode('overwrite') makes the cell re-runnable: with the default mode the
# write fails when /kaggle/working/application already exists.
application.repartition(1).write.mode('overwrite').csv('/kaggle/working/application', header=True)

application.unpersist()

# Model



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from lightgbm.sklearn import LGBMRanker

# Load the saved training table into pandas, ordered by (week, customer_id)
# with a fresh 0..n-1 index.
train = (
    pd.read_csv(train_path)
    .sort_values(['week', 'customer_id'])
    .reset_index(drop=True)
)
print('train:', train.shape)

In [None]:
import re
#train = train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
# columns renamed because for some reason one hot encoding creates invalid characters
# Column names of the training table, for a quick visual check.
train_cols = train.columns.tolist()
print(train_cols)

The model requires 3 inputs: the query group sizes (qids), X, and y

In [None]:
# Number of rows in each (week, customer_id) query group, in the same sorted
# order as the rows of `train`.
# size() counts every row of the group; count() on 'article_id' would skip
# rows whose article_id is missing and the group sizes would no longer add
# up to the number of rows.
qids_train = train.groupby(['week', 'customer_id']).size().to_numpy()
# The ranker needs the group sizes to cover every training row.
assert qids_train.sum() == len(train), "query group sizes do not cover all training rows"

# Features: everything except the label and the two grouping keys.
X_train = train.drop(columns=['y', 'customer_id', 'week'])
y_train = train['y']

In [None]:
# Quick look at the per-query group sizes passed to the ranker.
print(qids_train)

Basic model

In [None]:
# Ranker configuration; n_estimators is left at 100, the value the
# documentation recommends as default.
ranker_params = dict(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=100,
    importance_type='gain',
    verbose=10,
    random_state=17,
)

model = LGBMRanker(**ranker_params)

# `group` carries the number of consecutive rows belonging to each query.
model.fit(X=X_train, y=y_train, group=qids_train)

In [None]:
# Print each feature with its share of the total importance, largest first.
x_train_cols = list(X_train.columns)

importances = model.feature_importances_
total_importance = importances.sum()

for i in importances.argsort()[::-1]:
    print(x_train_cols[i], importances[i] / total_importance)

Application

In [None]:
# Show the content of the parent directory and of the application output
# folder (paths are relative to the current working directory).
print(os.listdir("../"))
print(os.listdir("../working/application"))

Before loading the application table I store its path in the `app_path` variable, because pyspark writes the part file with a different name on every run inside the kaggle/working/application folder specified earlier

In [None]:
# Resolve the path of the CSV part file inside the application output folder.
# Exactly one file starting with 'part' is expected. Concatenating several
# names (or none) would silently build a path that does not exist, so that
# situation is reported explicitly.
path = os.listdir("../working/application")
trim = [x for x in path if x.startswith('part')]
if len(trim) != 1:
    raise FileNotFoundError(
        f"expected exactly one part file in ../working/application, found {trim}"
    )
stringa = trim[0]
app_path = '../working/application/'+stringa

print(app_path)

In [None]:
# Load the saved application table, ordered by customer with a fresh index.
application = (
    pd.read_csv(app_path)
    .sort_values('customer_id')
    .reset_index(drop=True)
)

# Feature matrix for scoring: everything except the customer identifier.
application_x = application.drop(columns='customer_id')
print('application_x:', application_x.shape)
app_cols = application_x.columns.tolist()
print(app_cols)

In [None]:
# Score every candidate row. The columns are selected in the training order
# so each feature reaches the model in the position it was trained on. A
# missing feature raises a KeyError instead of being silently misaligned.
application['prediction'] = model.predict(application_x[X_train.columns])

In [None]:
# customer_id -> list of candidate article_ids, highest score first.
ranked_candidates = application.sort_values(
    ['customer_id', 'prediction'], ascending=False
)
pred_dict = (
    ranked_candidates
    .groupby('customer_id')['article_id']
    .apply(list)
    .to_dict()
)

# Submission

In [None]:
# The sample submission fixes the customers and the row order expected in
# the output file.
sub = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

In [None]:
# For every customer: ranked candidates first, then last week's best sellers
# as filler, cut to 12 items.
preds = []
for c_id in sub.customer_id:
    ranked = pred_dict.get(c_id, []) + top_12_lw
    # Remove repeated articles while keeping the first (best) position, so a
    # duplicate does not use up one of the 12 slots.
    pred = list(dict.fromkeys(ranked))[:12]
    preds.append(pred)

In [None]:
# Article ids were cast to integers earlier, which removed their leading
# zero. Restore the 10-character zero-padded form used in the submission.
# int() also normalises ids that pandas may have loaded as floats.
preds = [' '.join(str(int(p)).zfill(10) for p in ps) for ps in preds]
sub['prediction'] = preds

In [None]:
# Write the submission file (without the index column) to the current
# working directory.
sub_name = 'submission'
sub.to_csv(f'{sub_name}.csv', index=False)

Empty for comments
* keep same features on application once finished -> **done**
* out of memory on application set -> **solved**
* to be tested: customers that buy the same items each week / month ?

# Credits:
* Ideas for preprocessing and ALS model https://www.kaggle.com/code/nadianizam/h-m-fashion-recommendation-with-pyspark
* Ranker model: https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/307288
* Submission functions: https://www.kaggle.com/code/marcogorelli/radek-s-lgbmranker-starter-pack/notebook
* Huge thanks to: Radek, Paweł Jankiewicz, Hao, everyone who took time to comment

# Changelog

**Changelog**

First release (0,005): 
* basic pipeline, added comments to the workbook 

Ver 33 (0,0083): 
* Updated strategy. Now the workbook covers two weeks and looks at previously bought items in week -3. 
* I generate 17 negative observations based on top sold items,and 4 negative based on previously bought items from the customer in week -3.

Ver 47: 
* Added some week -4 observations to generate candidates. 
* Added combo items (testing): who bought this also bought that. 
* Fixed typo in model setup: qids_train = train_df.groupby("customer_id")['article_id'].count().to_numpy() used to be *qids_train = train_df.groupby("customer_id")["customer_id"].count().to_numpy()*

Ver 48 (0,0098): 
* Reduced top sold items to a list of 12 instead of 17 based on the assumptions that this is my least customized strategy and acts as a filler

Ver 54 (0,011):
* Removed combo items because the strategy is not solid enough, I only found a few 100s of purchased together
* Basic fine tuning of model params, n_estimators = 100

Ver 55 (0,0106):
* Major notebook rework. Now the model works on week-rolling style.
* Candidates are generated in a more coherent way with respect of the given problem, since they are created by looking at previous week

Ver 68 (0,0167):
* To avoid entropy I now keep only Rank feature up to 12 and fill na with 999
* Reworked weeks to better match the periods of observation, now I consider 7 days rather than calendar week
* Added to final prediction function a fill part to make sure I submit 12 elements of the list. I fill by appending the top 12 most sold items from last week