<a id="contents"></a>
# Contents
1. [Import libraries](#libs)
2. [Load data](#df)

3. [Preprocess](#preprocess)

    3.1. [Embed texts into vectors using BERT](#etb)

4. [Retrieval](#retrieval)
    
    4.1. [BERT + Neareast Neighbors](#bert_NN)
    
    4.2. [Recommend Items Repurchased](#rec_repur)
    
    4.3. [Trending Products Weekly](#trend)
    
    4.4. [SVD](#SVD)
    
5. [Rerank](#rerank)

<a id="libs"></a>
# 1. Import libraries

In [3]:
import os
import re
from typing import List, Union, Any
from dataclasses import dataclass
from datetime import datetime
import numpy as np
import pandas as pd
from torch import nn
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from transformers import AutoTokenizer, AutoModel
import warnings
warnings.filterwarnings("ignore")
import glob
import reco
from tqdm import tqdm
import datetime
from collections import Counter



<a id = "df"></a>
# 2. Load data

In [2]:
# Defining the base paths.
BASE_IN_PATH = "/kaggle/input/h-and-m-personalized-fashion-recommendations"
BASE_OUT_PATH = "/kaggle/working"

In [None]:
articles_df = pd.read_csv(os.path.join(BASE_IN_PATH, "articles.csv"))
customers_df = pd.read_csv(os.path.join(BASE_IN_PATH, "customers.csv"))
transactions_df = pd.read_csv(os.path.join(BASE_IN_PATH, "transactions_train.csv"), dtype={"article_id": str, "customer_id": str}, parse_dates=["t_dat"])
submission_df = pd.read_csv(os.path.join(BASE_IN_PATH, "sample_submission.csv"))

<a id = "preprocess"></a>
# 3. Preprocessing

<a id = "etb"></a>
## 3.1. Embed texts into vectors using BERT

The preprocessing pipeline will include:

- Fill in the NaN values of `detail_desc` with an empty string 
- Merge *the chosen* textual columns into one called `text`
- Lowercase the new column `text`
- Using *BERT*, embed the values in `text` into vectors
- Left join the transactions dataframe with the articles dataframe (aquire the new vector column)
- Calculate the average transaction vector for each customer and add it to `customers_df` (or create a new DataFrame)
- Add a new column called `bought_articles` in `customers_df`, in which all article IDs of the bough articles for each customer will be saved

In [None]:
# Let's start by filling the NaN values in `detail_desc` with an empty string.
articles_df["detail_desc"] = articles_df["detail_desc"].fillna("")

In [None]:
# These were the selected textual columns to be merged.
text_cols = [
    "prod_name",
    "product_type_name",
    "product_group_name",
    "graphical_appearance_name",
    "colour_group_name",
    "department_name",
    "index_name",
    "detail_desc",
]

def merge_text_columns(row, columns):
    texts = []
    
    # Looping through the columns except for `detail_desc`.
    # It will be appended with a '-' separator.
    for col in columns[:-1]:
        texts.append(row[col])
        
    texts = ", ".join(texts)
    texts = " - ".join([texts, row[columns[-1]]])
    
    return texts

articles_df["text"] = articles_df.apply(lambda row: merge_text_columns(row, text_cols), axis=1)
articles_df["text"].head()

In [None]:
# Lowercase the newly created `text` column.
articles_df["text"] = articles_df["text"].apply(lambda text: text.lower())

<a id = "retrieval"></a>
# 4. Retrieval


<a id = "bert_NN"></a>
## 4.1. BERT + Neareast Neighbors

In [None]:
# The fraction of the articles that we are going to embed. I use a subset of the whole dataset
# because I want to speed up the whole process. A larger subset might also be used, but the preprocessing
# will take a lot more time.
EMBED_FRAC = 0.1
# If this is set to True, the `EMBED_FRAC` fraction of the dataset will be shuffled randomly.
RANDOMNESS = False

# Maximum length of a tokenized sequence. I chose these values based on the histograms above.
# BERT uses a subword tokenizer, but still, a lot of samples have much less than 60 words.
MAX_LEN = 60

In [None]:
class BertVectorizer:
    
    def __init__(self):
        self._model_id = "bert-base-uncased"
        self._tokenizer = AutoTokenizer.from_pretrained(self._model_id)
        self._base_model = AutoModel.from_pretrained(self._model_id)
        
    def embed(self, texts: List[str], max_length=60) -> np.ndarray:
        """Embed `text` into a vector of size 768.
        Args:
            text (List[str]): Input text.
            max_length (int): The maximum length of a text in `texts`. Defaults to 60.
        
        Returns:
            numpy.ndarray: The vector representation of `text`.
        """
        # Since the input size vary, I pad or truncate, based on the lengths.
        inputs = self._tokenizer(
            texts, 
            max_length=max_length, 
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        # Getting the output tensor of the model. It is of shape (batch_size, seq_len, embedding_size).
        # Then I get only the vectors for each [CLS] corresponding to each input in `text`.
        embedding = self._base_model(**inputs).last_hidden_state[:, 0, :].detach()
        # `output` shape: (batch_size, embedding_size)
        
        return embedding.numpy()


bert = BertVectorizer()
embedding = bert.embed(articles_df["text"][:10].tolist(), max_length=MAX_LEN)
print(f"Vector shape: {embedding.shape}")

In [None]:
# I could use multithreading here. This method is too slow.
def create_embeddings(dataframe: pd.DataFrame, vectorizer: nn.Module, batch_size=5) -> pd.DataFrame:
    vectors = []
    
    for i in tqdm(range(0, len(dataframe), batch_size)):
        curr_df = dataframe.iloc[i:i + batch_size]
        vectors.extend(vectorizer.embed(curr_df["text"].tolist()))

    dataframe["embedding"] = vectors
        
    return dataframe


if RANDOMNESS:
    print("Shuffling the articles dataframe...")
    articles_sample = articles_df.sample(frac=EMBED_FRAC, random_state=1)
else:
    articles_sample = articles_df.iloc[:int(EMBED_FRAC * len(articles_df))]
        
embedded_articles = create_embeddings(articles_sample, vectorizer=bert, batch_size=100)
embedded_articles["embedding"]

### Add the vectors into the transactions

In [None]:
# Getting the subset of `transactions_train.csv` which has these particular article IDs.
sample_transaction_df = transactions_df[
    transactions_df["article_id"].isin(
        embedded_articles["article_id"].tolist()
    )
]
print("Num. of transactions with these article IDs:", len(sample_transaction_df))
sample_transaction_df.head()

In [None]:
# Executing a left join on the transactions with the embedded articles.
# This maps the embedding vectors to each transaction.
embedded_transactions = sample_transaction_df.merge(
    embedded_articles, 
    how="left", left_on="article_id", right_on="article_id"
)[[
    "customer_id",
    "article_id",
    "price",
    "embedding"
]]
embedded_transactions.head()

### Calculate the average transaction vector for each customer

In [None]:
embedded_customer_ids = embedded_transactions["customer_id"].unique().tolist()
print(f"Num. Customers in the embedded transactions: {len(embedded_customer_ids)}")

In [None]:
customer_embeddings = embedded_transactions.groupby(["customer_id"])["embedding"].apply(
    lambda emb: emb.mean()
).reset_index()
customer_embeddings.head()

### Add new column for bought articles

In [None]:
# Group the article IDs based on the customer ID.
embedded_transactions["article_id"] = embedded_transactions["article_id"].astype(str)
bought_articles = embedded_transactions.groupby(["customer_id"]).agg({
    "article_id": ",".join
})
bought_articles.head()

In [None]:
# Add the list of bought articles into the `customer_embeddings` DataFrame.
# Here, it doesn't matter if it is inner, left or right, since the customer IDs are the same.
customer_embeddings= customer_embeddings.merge(bought_articles, how="left", left_on="customer_id", right_on="customer_id")
customer_embeddings.head()

### Modeling

In [None]:
@dataclass
class SystemMetadata:
    articles_metadata: pd.DataFrame
    customers_metadata: pd.DataFrame


class ArticleRecommender:
    """Recommendation system for H&M products. Based on previous purchases it
    suggests new products that the customers might like.
    
    Args:
        metadata (SystemMetadata): Dataclass consisting of vectors describing each article and each customer.   
    """
    
    def __init__(self, metadata: SystemMetadata):
        self._customers_metadata = metadata.customers_metadata
        self._articles_metadata = metadata.articles_metadata
        self._articles_metadata["article_id"] = self._articles_metadata["article_id"].astype(str)
        
        self._model = NearestNeighbors(n_neighbors=12)
    
    def recommend(self, customer_id: str, topk: int = 12) -> List[str]:
        """Recommends `topk` articles based on `customer_id`'s previous purchases.
        
        Args:
            customer_id (str): ID of the customer to which you want to recommend new products.
            topk (int): Denotes how many suggestions to make. They are ordered (top K) suggestions. Defaults to 12.
        
        Returns:
            List[str]: List of article IDs.
        """
        
        # Creating deep copies, since I don't want to alter the original DataFrames.
        # Also, when we call `recommend()` multiple times, each time we want to have
        # all the metadata.
        articles_metadata = self._articles_metadata.copy(deep=True)
        customers_metadata = self._customers_metadata.copy(deep=True)
        
        # Getting the already purchased articles. We want to suggest new things to our Customers, right?
        customer_purchases = self._get_customer_field_value(
            customer_id=customer_id,
            field_name="article_id"
        ).split(",")
        
        # Get the DataFrame IDs of the articles that were already purchased by this customer.
        # Then, remove these entries from the DataFrames.
        article_df_ids = self._articles_metadata[
            self._articles_metadata["article_id"].isin(customer_purchases)
        ].index.tolist()
        articles_metadata.drop(article_df_ids, inplace=True)
        customers_metadata.drop(article_df_ids, inplace=True)
        
        train_embeddings = self._col2numpy(
            column=articles_metadata["embedding"].tolist()
        )
        
        # Fitting the model on the article vectors.
        self._model.fit(train_embeddings)
        
        # Getting the vector of the Customer with ID `customer_id`.
        customer_embedding = self._get_customer_field_value(
            customer_id, field_name="embedding"
        )
        customer_embedding = np.expand_dims(customer_embedding, 0)
        # Here `customer_embedding` is a NumPy array with shape (1, 768).
        
        # Making a prediction.
        predictions = self._model.kneighbors(
            customer_embedding, 
            n_neighbors=topk,
            return_distance=False
        )[0]
        
        # Returning the respective article IDs, based on the predicted indices.
        return articles_metadata.iloc[
            predictions.tolist()
        ]["article_id"].tolist()
        
    def _col2numpy(self, column: List[np.ndarray]) -> np.ndarray:
        # Stacking the list of NumPy arrays on the row axis.
        array = np.stack(column, axis=0)
        
        return array
    
    def _get_customer_field_value(self, customer_id: str, field_name: str) -> Any:
        return self._customers_metadata[
             self._customers_metadata["customer_id"] == customer_id
        ][field_name].tolist()[0]


# Selecting an arbitrary customer.
customer_id = customer_embeddings["customer_id"][42]
    
metadata = SystemMetadata(
    articles_metadata=embedded_articles,
    customers_metadata=customer_embeddings
)
# Making a recommendation
article_recommender = ArticleRecommender(metadata)
recommended_articles = article_recommender.recommend(
    customer_id=customer_id
)
print(f"Recommended articles for customer with ID '{customer_id}':\n{recommended_articles}")

In [None]:
def generate_submission(system: ArticleRecommender, customer_ids: List[str]) -> pd.DataFrame:
    recommendations = []
    progressbar = tqdm(customer_ids)
    
    for i, customer_id in enumerate(progressbar):
        progressbar.set_description(f"Customer {i + 1}/{len(customer_ids)}")
        current_recommendations = system.recommend(
            customer_id=customer_id
        )
        recommendations.append(" ".join(current_recommendations))
        
    return pd.DataFrame.from_dict({
        "customer_id": customer_ids,
        "prediction": recommendations,
    })


# Generating a submission for a small subset of all Customers, just as an example.
submission_df = generate_submission(
    system=article_recommender,
    customer_ids=customer_embeddings["customer_id"].tolist()
)
submission_df.head()

In [None]:
submission_df.to_csv('submission_1.csv', index=False)

<a id = "rec_repur"></a>
## 4.2. Recommend Items Repurchased

In [None]:
transactions_df['t_dat'] = pd.to_datetime(transactions_df['t_dat'])
transactions_df['month'] = transactions_df['t_dat'].dt.month

In [None]:
#Considering the predicted purchase time is Sep, Only take 6,7,8,9,10,11,12 month into consideration. 
#Using last week of final purchase as validation.
original_transactions = transactions_df.loc[transactions_df['month'] >= 6]
transactions_df = original_transactions.loc[transactions_df['t_dat']<datetime(2020, 9, 7)]
#valid_transactions = original_data.loc[transactions['t_dat']>=datetime(2020, 9, 7)]

In [None]:
#Pursue a dict the key is customer_id the value is also a dict of the article and corresponding purchase times.
def create_dict(transactions_df,purchase_dict):
    for i,x in enumerate(zip(transactions_df['customer_id'], transactions_df['article_id'])):
        cust_id, art_id = x
        if cust_id not in purchase_dict:
            purchase_dict[cust_id] = {}
    
        if art_id not in purchase_dict[cust_id]:
            purchase_dict[cust_id][art_id] = 0
    
        purchase_dict[cust_id][art_id] += 1

In [None]:
#Avoid Out of Memory
n_split_prediction = 1000
purchase_dict = {}
n_chunk = (len(transactions_df) + n_split_prediction - 1)// n_split_prediction
for i in range(0, len(transactions_df), n_chunk):
    #print(f"chunk: {i}")
    
    target_transactions = transactions_df.iloc[i:i+n_chunk]
    create_dict(target_transactions,purchase_dict)
    

In [None]:
#Caculate top12 items to make up for the absent position of previous purchase
top12 = list(transactions_df["article_id"].value_counts().index[:12])

In [None]:
sub = submission_df[["customer_id"]]
string_top12 = ' '.join(map(str, top12))

def generate_prediction(submission_df, purchase_dict, prediction_list, top12, string_top12):
    for i, cust_id in enumerate(submission_df['customer_id'].values):
        if cust_id in purchase_dict:
            l = sorted(purchase_dict[cust_id].items(), key=lambda x: x[1], reverse=True)
            l = [str(y[0]) for y in l]  # Convert integers to strings
            if len(l) > 12:
                s = ' '.join(l[:12])
            else:
                s = ' '.join(l + list(map(str, top12[:(12 - len(l))])))
        else:
            s = string_top12
        prediction_list.append(s)
    return prediction_list

sub["prediction"] = generate_prediction(submission_df, purchase_dict, [], top12, string_top12)
print(sub.head())


In [None]:
sub.to_csv('submission_2.csv', index=False)

<a id = "trend"></a>
## 4.3. Trending Products Weekly

In [None]:
from tqdm import tqdm
tqdm.pandas()
N = 12

In [None]:
transactions_df['article_id'] = transactions_df['article_id'].astype(str)

transactions_df['t_dat'] = pd.to_datetime(transactions_df['t_dat'])
last_ts = transactions_df['t_dat'].max()

In [None]:
transactions_df['ldbw'] = transactions_df['t_dat'].progress_apply(lambda d: last_ts - (last_ts - d).floor('7D'))

In [None]:
weekly_sales = transactions_df.drop('customer_id', axis=1).groupby(['ldbw', 'article_id']).count()
weekly_sales = weekly_sales.rename(columns={'t_dat': 'count'})

In [None]:
selected_columns = ['t_dat', 'customer_id', 'article_id', 'ldbw']
transactions_df = transactions_df[selected_columns]

In [None]:
transactions_df = transactions_df.join(weekly_sales, on=['ldbw', 'article_id'])

In [None]:
weekly_sales = weekly_sales.reset_index().set_index('article_id')
last_day = last_ts.strftime('%Y-%m-%d')

transactions_df = transactions_df.join(
    weekly_sales.loc[weekly_sales['ldbw']==last_day, ['count']],
    on='article_id', rsuffix="_targ")

transactions_df['count_targ'].fillna(0, inplace=True)
del weekly_sales

In [None]:
transactions_df['quotient'] = transactions_df['count_targ'] / transactions_df['count']

In [None]:
target_sales = transactions_df.drop('customer_id', axis=1).groupby('article_id')['quotient'].sum()
general_pred = target_sales.nlargest(N).index.tolist()
del target_sales

In [None]:
purchase_dict = {}

for i in tqdm(transactions_df.index):
    cust_id = transactions_df.at[i, 'customer_id']
    art_id = transactions_df.at[i, 'article_id']
    t_dat = transactions_df.at[i, 't_dat']

    if cust_id not in purchase_dict:
        purchase_dict[cust_id] = {}

    if art_id not in purchase_dict[cust_id]:
        purchase_dict[cust_id][art_id] = 0
    
    x = max(1, (last_ts - t_dat).days)

    a, b, c, d = 2.5e4, 1.5e5, 2e-1, 1e3
    y = a / np.sqrt(x) + b * np.exp(-c*x) - d

    value = transactions_df.at[i, 'quotient'] * max(0, y)
    purchase_dict[cust_id][art_id] += value

In [None]:
pred_list = []
for cust_id in tqdm(sub['customer_id']):
    if cust_id in purchase_dict:
        series = pd.Series(purchase_dict[cust_id])
        series = series[series > 0]
        l = series.nlargest(N).index.tolist()
        if len(l) < N:
            l = l + general_pred[:(N-len(l))]
    else:
        l = general_pred
    pred_list.append(' '.join(l))

submission_df['prediction'] = pred_list
submission_df.to_csv('submission_3.csv', index=None)

In [None]:
import gc
sub0 = pd.read_csv('/kaggle/working/submission_1.csv').sort_values('customer_id').reset_index(drop=True)                                             # 0.0231
sub1 = pd.read_csv('/kaggle/working/submission_2.csv').sort_values('customer_id').reset_index(drop=True)                # 0.0225
sub2 = pd.read_csv('/kaggle/working/submission_3.csv').sort_values('customer_id').reset_index(drop=True)   

sub0.columns = ['customer_id', 'prediction0']
sub0['prediction1'] = sub1['prediction']
sub0['prediction2'] = sub2['prediction']

del sub1, sub2
gc.collect()
sub0.head()

In [None]:
def cust_blend(dt, W = [1,1,1]):
    #Global ensemble weights
    #W = []

    #Create a list of all model predictions
    REC = []

    # Second Try
    REC.append(dt['prediction0'].split())
    REC.append(dt['prediction1'].split())
    REC.append(dt['prediction2'].split())
    #Create a dictionary of items recommended.
    #Assign a weight according the order of appearance and multiply by global weights
    res = {}
    for M in range(len(REC)):
        for n, v in enumerate(REC[M]):
            if v in res:
                res[v] += (W[M]/(n+1))
            else:
                res[v] = (W[M]/(n+1))

    # Sort dictionary by item weights
    res = list(dict(sorted(res.items(), key=lambda item: -item[1])).keys())

    # Return the top 12 items only
    return ' '.join(res[:12])

sub0['prediction'] = sub0.apply(cust_blend, W = [1.05,0.78,1.17], axis=1)
sub0.head()

In [None]:
del sub0['prediction0']
del sub0['prediction1']
del sub0['prediction2']
gc.collect()


sub0.to_csv('submission.csv', index=False)

### SVD

In [1]:
!pip install git+https://github.com/mayukh18/reco

Collecting git+https://github.com/mayukh18/reco
  Cloning https://github.com/mayukh18/reco to /tmp/pip-req-build-dgilc6t7
  Running command git clone --filter=blob:none --quiet https://github.com/mayukh18/reco /tmp/pip-req-build-dgilc6t7
  Resolved https://github.com/mayukh18/reco to commit 3a61898cd5bb7c980732090a10e49e9d8c786a99
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: reco
  Building wheel for reco (setup.py) ... [?25ldone
[?25h  Created wheel for reco: filename=reco-0.2.1-cp310-cp310-linux_x86_64.whl size=9383703 sha256=b9c80e263f269c9cf354cbb47e3d549b6d55eaec599657f4785171be1703b9f9
  Stored in directory: /tmp/pip-ephem-wheel-cache-s5_c348n/wheels/08/84/1f/4f54fb9df6f7483c6d24d46bca75446401623e035556af4c70
Successfully built reco
Installing collected packages: reco
Successfully installed reco-0.2.1


In [4]:
data = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv", dtype={'article_id':str})
data["t_dat"] = pd.to_datetime(data["t_dat"])
data.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [5]:
print("All Transactions Date Range: {} to {}".format(data['t_dat'].min(), data['t_dat'].max()))

data["t_dat"] = pd.to_datetime(data["t_dat"])
train1 = data.loc[(data["t_dat"] >= datetime.datetime(2020,9,8)) & (data['t_dat'] < datetime.datetime(2020,9,16))]
train2 = data.loc[(data["t_dat"] >= datetime.datetime(2020,9,1)) & (data['t_dat'] < datetime.datetime(2020,9,8))]
train3 = data.loc[(data["t_dat"] >= datetime.datetime(2020,8,23)) & (data['t_dat'] < datetime.datetime(2020,9,1))]
train4 = data.loc[(data["t_dat"] >= datetime.datetime(2020,8,15)) & (data['t_dat'] < datetime.datetime(2020,8,23))]

val = data.loc[data["t_dat"] >= datetime.datetime(2020,9,16)]

All Transactions Date Range: 2018-09-20 00:00:00 to 2020-09-22 00:00:00


In [6]:
# List of all purchases per user (has repetitions)
positive_items_per_user1 = train1.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user2 = train2.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user3 = train3.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user4 = train4.groupby(['customer_id'])['article_id'].apply(list)

In [8]:
train = pd.concat([train1, train2, train3, train4], axis=0)

#time decay popularity of each article
train['pop_factor'] = train['t_dat'].apply(lambda x: 1/(datetime.datetime(2020,9,16) - x).days**2)
popular_items_group = train.groupby(['article_id'])['pop_factor'].sum()

# purchase count of each article
items_total_count = train.groupby(['article_id'])['article_id'].count()
# purchase count of each user
users_total_count = train.groupby(['customer_id'])['customer_id'].count()


train['feedback'] = 1
train = train.groupby(['customer_id', 'article_id']).sum().reset_index()
train['feedback'] = train.apply(lambda row: row['feedback']/popular_items_group[row['article_id']], axis=1)

train['feedback'] = train['feedback'].apply(lambda x: 5.0 if x>5.0 else x)
train.drop(['price', 'sales_channel_id'], axis=1, inplace=True)

# shuffling
train = train.sample(frac=1).reset_index(drop=True)
train['feedback'].describe()

TypeError: datetime64 type does not support sum operations

In [None]:
train_pop = data.loc[(data["t_dat"] >= datetime.datetime(2020,9,1)) & (data['t_dat'] < datetime.datetime(2020,9,16))]
train_pop['pop_factor'] = train_pop['t_dat'].apply(lambda x: 1/(datetime.datetime(2020,9,16) - x).days)
popular_items_group = train_pop.groupby(['article_id'])['pop_factor'].sum()

_, popular_items = zip(*sorted(zip(popular_items_group, popular_items_group.keys()))[::-1])

train_pop['pop_factor'].describe()

In [None]:
def get_most_freq_next_item(user_group):
    next_items = {}
    for user in tqdm(user_group.keys()):
        items = user_group[user]
        for i,item in enumerate(items[:-1]):
            if item not in next_items:
                next_items[item] = []
            if item != items[i+1]:
                next_items[item].append(items[i+1])

    pred_next = {}
    for item in next_items:
        if len(next_items[item]) >= 5:
            most_common = Counter(next_items[item]).most_common()
            ratio = most_common[0][1]/len(next_items[item])
            if ratio >= 0.1:
                pred_next[item] = most_common[0][0]
            
    return pred_next

user_group = train.groupby(['customer_id'])['article_id'].apply(list)
pred_next = get_most_freq_next_item(user_group)

In [None]:
from reco.recommender import FunkSVD
from reco.metrics import rmse

# k = number of dimensions of the latent embedding. formatizer dict takes in names of the columns
# for user, item and values/feedback/ratings respectively.

svd = FunkSVD(k=8, learning_rate=0.008, regularizer = .01, iterations = 80, method = 'stochastic', bias=True)
svd.fit(X=train, formatizer={'user':'customer_id', 'item':'article_id', 'value':'feedback'},verbose=True)

In [None]:
def apk(actual, predicted, k=12):
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=12):
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [None]:
positive_items_val = val.groupby(['customer_id'])['article_id'].apply(list)
val_users = positive_items_val.keys()
val_items = []

for i,user in tqdm(enumerate(val_users)):
    val_items.append(positive_items_val[user])
    
print("Total users in validation:", len(val_users))

In [None]:
from collections import Counter
outputs = []
cnt = 0

popular_items = list(popular_items)

for user in tqdm(val_users):
    user_output = []
    if user in positive_items_per_user1.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user1[user]).most_common()}
        user_output += list(most_common_items_of_user.keys())[:12]
    if user in positive_items_per_user2.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user2[user]).most_common()}
        user_output += list(most_common_items_of_user.keys())[:12]
    if user in positive_items_per_user3.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user3[user]).most_common()}
        user_output += list(most_common_items_of_user.keys())[:12]
    if user in positive_items_per_user4.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user4[user]).most_common()}
        user_output += list(most_common_items_of_user.keys())[:12]
    
    user_output += [pred_next[item] for item in user_output if item in pred_next and pred_next[item] not in user_output]      
    
    user_output += list(popular_items[:12 - len(user_output)])
    outputs.append(user_output)
    
print("mAP Score on Validation set:", mapk(val_items, outputs))

In [None]:
from collections import Counter
outputs = []
cnt = 0

popular_items = list(popular_items)
userindexes = {svd.users[i]:i for i in range(len(svd.users))}

for user in tqdm(val_users):
    user_output = []
    if user in positive_items_per_user1.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user1[user]).most_common()}
        user_index = userindexes[user]
        new_order = {}
        for k in list(most_common_items_of_user.keys())[:20]:
            try:
                itemindex = svd.items.index(k)
                pred_value = np.dot(svd.userfeatures[user_index], svd.itemfeatures[itemindex].T) + svd.item_bias[0, itemindex]
            except:
                pred_value = most_common_items_of_user[k]
            new_order[k] = pred_value
        user_output += [k for k, v in sorted(new_order.items(), key=lambda item: item[1])][:12]
        
    if user in positive_items_per_user2.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user2[user]).most_common()}
        user_index = userindexes[user]
        new_order = {}
        for k in list(most_common_items_of_user.keys())[:20]:
            try:
                itemindex = svd.items.index(k)
                pred_value = np.dot(svd.userfeatures[user_index], svd.itemfeatures[itemindex].T) + svd.item_bias[0, itemindex]
            except:
                pred_value = most_common_items_of_user[k]
            new_order[k] = pred_value
        user_output += [k for k, v in sorted(new_order.items(), key=lambda item: item[1])][:12]
        
    if user in positive_items_per_user3.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user3[user]).most_common()}
        user_index = userindexes[user]
        new_order = {}
        for k in list(most_common_items_of_user.keys())[:20]:
            try:
                itemindex = svd.items.index(k)
                pred_value = np.dot(svd.userfeatures[user_index], svd.itemfeatures[itemindex].T) + svd.item_bias[0, itemindex]
            except:
                pred_value = most_common_items_of_user[k]
            new_order[k] = pred_value
        user_output += [k for k, v in sorted(new_order.items(), key=lambda item: item[1])][:12]
        
    if user in positive_items_per_user4.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user4[user]).most_common()}
        user_index = userindexes[user]
        new_order = {}
        for k in list(most_common_items_of_user.keys())[:20]:
            try:
                itemindex = svd.items.index(k)
                pred_value = np.dot(svd.userfeatures[user_index], svd.itemfeatures[itemindex].T) + svd.item_bias[0, itemindex]
            except:
                pred_value = most_common_items_of_user[k]
            new_order[k] = pred_value
        user_output += [k for k, v in sorted(new_order.items(), key=lambda item: item[1])][:12]
        
    user_output += [pred_next[item] for item in user_output if item in pred_next and pred_next[item] not in user_output]      
    
    user_output += list(popular_items[:12 - len(user_output)])
    outputs.append(user_output)
    
print("mAP Score on Validation set:", mapk(val_items, outputs))

In [None]:
outputs = []
cnt = 0

popular_items = list(popular_items)
userindexes = {f.users[i]:i for i in range(len(f.users))}

for user in tqdm(submission['customer_id']):
    user_output = []
    if user in positive_items_per_user1.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user1[user]).most_common()}
        
        user_index = userindexes[user]
        new_order = {}
        for k in list(most_common_items_of_user.keys())[:20]:
            try:
                itemindex = f.items.index(k)
                pred_value = np.dot(f.userfeatures[user_index], f.itemfeatures[itemindex].T) + f.item_bias[0, itemindex]
            except:
                pred_value = most_common_items_of_user[k]
            new_order[k] = pred_value
        user_output += [k for k, v in sorted(new_order.items(), key=lambda item: item[1])][:12]
        
    if user in positive_items_per_user2.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user2[user]).most_common()}
        
        user_index = userindexes[user]
        new_order = {}
        for k in list(most_common_items_of_user.keys())[:20]:
            try:
                itemindex = f.items.index(k)
                pred_value = np.dot(f.userfeatures[user_index], f.itemfeatures[itemindex].T) + f.item_bias[0, itemindex]
            except:
                pred_value = most_common_items_of_user[k]
            new_order[k] = pred_value
        user_output += [k for k, v in sorted(new_order.items(), key=lambda item: item[1])][:12]
        
    if user in positive_items_per_user3.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user3[user]).most_common()}
        
        user_index = userindexes[user]
        new_order = {}
        for k in list(most_common_items_of_user.keys())[:20]:
            try:
                itemindex = f.items.index(k)
                pred_value = np.dot(f.userfeatures[user_index], f.itemfeatures[itemindex].T) + f.item_bias[0, itemindex]
            except:
                pred_value = most_common_items_of_user[k]
            new_order[k] = pred_value
        user_output += [k for k, v in sorted(new_order.items(), key=lambda item: item[1])][:12]
        
    if user in positive_items_per_user4.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user4[user]).most_common()}
        
        user_index = userindexes[user]
        new_order = {}
        for k in list(most_common_items_of_user.keys())[:20]:
            try:
                itemindex = f.items.index(k)
                pred_value = np.dot(f.userfeatures[user_index], f.itemfeatures[itemindex].T) + f.item_bias[0, itemindex]
            except:
                pred_value = most_common_items_of_user[k]
            new_order[k] = pred_value
        user_output += [k for k, v in sorted(new_order.items(), key=lambda item: item[1])][:12]
        
    user_output += [pred_next[item] for item in user_output if item in pred_next and pred_next[item] not in user_output]      
    
    user_output += list(popular_items[:12 - len(user_output)])
    outputs.append(user_output)
    
str_outputs = []
for output in outputs:
    str_outputs.append(" ".join([str(x) for x in output]))

In [None]:
train1 = data.loc[(data["t_dat"] >= datetime.datetime(2020,9,16)) & (data['t_dat'] < datetime.datetime(2020,9,23))]
train2 = data.loc[(data["t_dat"] >= datetime.datetime(2020,9,8)) & (data['t_dat'] < datetime.datetime(2020,9,16))]
train3 = data.loc[(data["t_dat"] >= datetime.datetime(2020,8,31)) & (data['t_dat'] < datetime.datetime(2020,9,8))]
train4 = data.loc[(data["t_dat"] >= datetime.datetime(2020,8,23)) & (data['t_dat'] < datetime.datetime(2020,8,31))]

positive_items_per_user1 = train1.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user2 = train2.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user3 = train3.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user4 = train4.groupby(['customer_id'])['article_id'].apply(list)

train = pd.concat([train1, train2], axis=0)
train['pop_factor'] = train['t_dat'].apply(lambda x: 1/(datetime.datetime(2020,9,23) - x).days)
popular_items_group = train.groupby(['article_id'])['pop_factor'].sum()

_, popular_items = zip(*sorted(zip(popular_items_group, popular_items_group.keys()))[::-1])

user_group = pd.concat([train1, train2, train3, train4], axis=0).groupby(['customer_id'])['article_id'].apply(list)

In [None]:
submission['prediction'] = str_outputs
submission.to_csv("submissions.csv", index=False)

<a id = "rerank"></a>
# 5. Rerank

In [12]:
from lightgbm.sklearn import LGBMRanker
from datetime import timedelta
from pathlib import Path
import pandas as pd
import numpy as np
import os
import re
from typing import List, Union, Any
from dataclasses import dataclass
from datetime import datetime
import numpy as np
import pandas as pd
from torch import nn
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from transformers import AutoTokenizer, AutoModel


In [13]:
user_features = pd.read_parquet('../input/ranking-features/user_features.parquet')
item_features = pd.read_parquet('../input/ranking-features/item_features.parquet')
transactions_df = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv')
transactions_df.t_dat = pd.to_datetime( transactions_df.t_dat )

In [14]:
df_4w = transactions_df[transactions_df['t_dat'] >= pd.to_datetime('2020-08-24')].copy()
df_3w = transactions_df[transactions_df['t_dat'] >= pd.to_datetime('2020-08-31')].copy()
df_2w = transactions_df[transactions_df['t_dat'] >= pd.to_datetime('2020-09-07')].copy()
df_1w = transactions_df[transactions_df['t_dat'] >= pd.to_datetime('2020-09-15')].copy()

In [15]:
user_features[['club_member_status', 'fashion_news_frequency']] = (
                   user_features[['club_member_status', 'fashion_news_frequency']]
                   .apply(lambda x: pd.factorize(x)[0])
).astype('int8')

In [16]:
transactions_df = (
    transactions_df
    .merge(user_features, on = ('customer_id'))
    .merge(item_features, on = ('article_id'))
)
transactions_df.sort_values(['t_dat', 'customer_id'], inplace=True)

In [17]:
#for simplicity let's take only 1M rows
N_ROWS = 1_000_000

train = transactions_df.loc[ transactions_df.t_dat <= pd.to_datetime('2020-09-15') ].iloc[:N_ROWS]
valid = transactions_df.loc[ transactions_df.t_dat >= pd.to_datetime('2020-09-16') ]

In [18]:
#delete transactions to save memory
del transactions_df

In [19]:
train.shape, valid.shape

((1000000, 88), (240311, 88))

In [20]:
purchase_dict_4w = {}

for i,x in enumerate(zip(df_4w['customer_id'], df_4w['article_id'])):
    cust_id, art_id = x
    if cust_id not in purchase_dict_4w:
        purchase_dict_4w[cust_id] = {}
    
    if art_id not in purchase_dict_4w[cust_id]:
        purchase_dict_4w[cust_id][art_id] = 0
    
    purchase_dict_4w[cust_id][art_id] += 1

dummy_list_4w = list((df_4w['article_id'].value_counts()).index)[:12]

In [21]:
purchase_dict_3w = {}

for i,x in enumerate(zip(df_3w['customer_id'], df_3w['article_id'])):
    cust_id, art_id = x
    if cust_id not in purchase_dict_3w:
        purchase_dict_3w[cust_id] = {}
    
    if art_id not in purchase_dict_3w[cust_id]:
        purchase_dict_3w[cust_id][art_id] = 0
    
    purchase_dict_3w[cust_id][art_id] += 1

dummy_list_3w = list((df_3w['article_id'].value_counts()).index)[:12]

In [22]:
purchase_dict_2w = {}

for i,x in enumerate(zip(df_2w['customer_id'], df_2w['article_id'])):
    cust_id, art_id = x
    if cust_id not in purchase_dict_2w:
        purchase_dict_2w[cust_id] = {}
    
    if art_id not in purchase_dict_2w[cust_id]:
        purchase_dict_2w[cust_id][art_id] = 0
    
    purchase_dict_2w[cust_id][art_id] += 1

dummy_list_2w = list((df_2w['article_id'].value_counts()).index)[:12]

In [23]:
purchase_dict_1w = {}

for i,x in enumerate(zip(df_1w['customer_id'], df_1w['article_id'])):
    cust_id, art_id = x
    if cust_id not in purchase_dict_1w:
        purchase_dict_1w[cust_id] = {}
    
    if art_id not in purchase_dict_1w[cust_id]:
        purchase_dict_1w[cust_id][art_id] = 0
    
    purchase_dict_1w[cust_id][art_id] += 1

dummy_list_1w = list((df_1w['article_id'].value_counts()).index)[:12]

In [24]:
def prepare_candidates(customers_id, n_candidates = 12):
  """
  df - basically, dataframe with customers(customers should be unique)
  """
  prediction_dict = {}
  dummy_list = list((df_2w['article_id'].value_counts()).index)[:n_candidates]

  for i, cust_id in tqdm(enumerate(customers_id)):
    # comment this for validation
    if cust_id in purchase_dict_1w:
        l = sorted((purchase_dict_1w[cust_id]).items(), key=lambda x: x[1], reverse=True)
        l = [y[0] for y in l]
        if len(l)>n_candidates:
            s = l[:n_candidates]
        else:
            s = l+dummy_list_1w[:(n_candidates-len(l))]
    elif cust_id in purchase_dict_2w:
        l = sorted((purchase_dict_2w[cust_id]).items(), key=lambda x: x[1], reverse=True)
        l = [y[0] for y in l]
        if len(l)>n_candidates:
            s = l[:n_candidates]
        else:
            s = l+dummy_list_2w[:(n_candidates-len(l))]
    elif cust_id in purchase_dict_3w:
        l = sorted((purchase_dict_3w[cust_id]).items(), key=lambda x: x[1], reverse=True)
        l = [y[0] for y in l]
        if len(l)>n_candidates:
            s = l[:n_candidates]
        else:
            s = l+dummy_list_3w[:(n_candidates-len(l))]
    elif cust_id in purchase_dict_4w:
        l = sorted((purchase_dict_4w[cust_id]).items(), key=lambda x: x[1], reverse=True)
        l = [y[0] for y in l]
        if len(l)>n_candidates:
            s = l[:n_candidates]
        else:
            s = l+dummy_list_4w[:(n_candidates-len(l))]
    else:
        s = dummy_list
    prediction_dict[cust_id] = s

  k = list(map(lambda x: x[0], prediction_dict.items()))
  v = list(map(lambda x: x[1], prediction_dict.items()))
  negatives_df = pd.DataFrame({'customer_id': k, 'negatives': v})
  negatives_df = (
      negatives_df
      .explode('negatives')
      .rename(columns = {'negatives': 'article_id'})
  )
  return negatives_df

In [25]:
#take only last 15 transactions
train['rank'] = range(len(train))
train = (
    train
    .assign(
        rn = train.groupby(['customer_id'])['rank']
                  .rank(method='first', ascending=False))
    .query("rn <= 15")
    .drop(columns = ['price', 'sales_channel_id'])
    .sort_values(['t_dat', 'customer_id'])
)
train['label'] = 1

del train['rank']
del train['rn']

valid.sort_values(['t_dat', 'customer_id'], inplace = True)

In [26]:
last_dates = (
    train
    .groupby('customer_id')['t_dat']
    .max()
    .to_dict()
)

negatives = prepare_candidates(train['customer_id'].unique(), 15)
negatives['t_dat'] = negatives['customer_id'].map(last_dates)


0it [00:00, ?it/s]

In [27]:
# negatives = negatives.drop('t_dat', axis=1)
negatives

Unnamed: 0,customer_id,article_id,t_dat
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,794321007,2018-09-24
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,924243001,2018-09-24
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,924243002,2018-09-24
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,923758001,2018-09-24
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,918522001,2018-09-24
...,...,...,...
214935,461802adf572dddc90bf68e441db0039809338e20768d7...,751471043,2018-10-11
214935,461802adf572dddc90bf68e441db0039809338e20768d7...,924243002,2018-10-11
214935,461802adf572dddc90bf68e441db0039809338e20768d7...,706016001,2018-10-11
214935,461802adf572dddc90bf68e441db0039809338e20768d7...,850917001,2018-10-11


In [None]:
submission_1 = pd.read_csv('/kaggle/input/submit/submission_1.csv')
submission_2 = pd.read_csv('/kaggle/input/submit/submission_2 (1).csv')
submission_3 = pd.read_csv('/kaggle/input/submit/submission_3 (1).csv')
# submission_4 = pd.read_csv('/kaggle/input/submit/')
merged_sub = pd.merge(pd.merge(submission_1, submission_2, how='outer', on='customer_id'), submission_3, how='outer', on='customer_id')


In [None]:
def combine_predictions(row):
    return f"{row['prediction_x']} {row['prediction_y']} {row['prediction']}"

# Áp dụng hàm cho mỗi hàng của DataFrame
merged_sub['combined_prediction'] = merged_sub.apply(combine_predictions, axis=1)

# Convert combined_prediction column to a list of strings
merged_sub['combined_prediction'] = merged_sub['combined_prediction'].apply(lambda x: x.split())

# Explode the list of strings in combined_prediction column
merged_sub = merged_sub.explode('combined_prediction')

del merged_sub['prediction_x']
del merged_sub['prediction_y']
del merged_sub['prediction']

In [None]:
merged_sub = merged_sub.iloc[:100000, :]
merged_sub

In [None]:
customer_list = merged_sub['customer_id'].unique()
actual_purchases = train[train['customer_id'].isin(customer_list)][['customer_id', 'article_id', 'label']]
actual_purchases = actual_purchases.groupby('customer_id')['article_id'].apply(list).reset_index(name='actual_purchases')

merged_sub['label'] = 0

# Duyệt qua từng dòng trong actual_purchases
for index, row in actual_purchases.iterrows():
    # Lấy danh sách các mua hàng thực tế của khách hàng hiện tại
    purchases = row['actual_purchases']
    
    # Lọc các dòng trong merged_sub có customer_id trùng khớp và combined_prediction thuộc purchases
    mask = (merged_sub['customer_id'] == row['customer_id']) & (merged_sub['combined_prediction'].isin(purchases))
    
    # Gán nhãn 1 cho các dòng thỏa mãn điều kiện
    merged_sub.loc[mask, 'label'] = 1

# Hiển thị DataFrame kết quả
print(merged_sub)

In [None]:
merged_sub = merged_sub.rename(columns={'combined_prediction': 'article_id'})

# Giả sử negatives là DataFrame chứa thông tin về các mặt hàng có label = 0
# Tạo DataFrame chứa các hàng có label = 0 từ merged_sub
negatives_from_merged_sub = merged_sub[merged_sub['label'] == 0]

# Concatenate DataFrame negatives_from_merged_sub với DataFrame negatives
negatives_combined = pd.concat([negatives, negatives_from_merged_sub[['customer_id', 'article_id']]], ignore_index=True)

# Hiển thị DataFrame kết quả
print(negatives_combined)

In [None]:
negatives_combined = (
    negatives_combined
    .merge(user_features, on = ('customer_id'))
    .merge(item_features, on = ('article_id'))
)
negatives_combined['label'] = 0

In [None]:
train = pd.concat([train, negatives_combined])
train.sort_values(['customer_id', 't_dat'], inplace = True)
train_baskets = train.groupby(['customer_id'])['article_id'].count().values

In [None]:
train.to_parquet('/kaggle/working/train_label.parquet')

In [9]:
train = pd.read_parquet('/kaggle/input/train-label/train_label.parquet')

In [10]:
train

Unnamed: 0,t_dat,customer_id,article_id,mean_transactions,max_transactions,min_transactions,median_transactions,sum_transactions,max_minus_min_transactions,n_transactions,...,graphical_appearance_name_3,colour_group_name_3,perceived_colour_value_name_3,perceived_colour_master_name_3,department_name_3,index_name_3,index_group_name_3,section_name_3,garment_group_name_3,label
5785873,2018-09-21,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,640244003,0.030255,0.084729,0.006763,0.025407,2.601932,0.077966,86,...,1,1,1,1,0,1,1,1,0,1
11011079,2018-09-21,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,639677008,0.030255,0.084729,0.006763,0.025407,2.601932,0.077966,86,...,1,0,1,1,0,1,1,1,0,1
14930470,2018-09-21,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,583558001,0.030255,0.084729,0.006763,0.025407,2.601932,0.077966,86,...,0,0,0,1,0,1,1,1,0,1
1316293,2018-09-25,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,666448006,0.030255,0.084729,0.006763,0.025407,2.601932,0.077966,86,...,1,0,0,1,1,1,1,1,0,1
18915918,2018-09-25,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,521269001,0.030255,0.084729,0.006763,0.025407,2.601932,0.077966,86,...,1,1,1,1,0,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2051405,2018-09-20,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,918292001,0.017532,0.042356,0.004559,0.015237,0.788932,0.037797,45,...,1,1,1,1,0,0,0,0,1,0
2243670,2018-09-20,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,751471043,0.017532,0.042356,0.004559,0.015237,0.788932,0.037797,45,...,0,1,1,1,1,1,1,1,0,0
3116205,2018-09-20,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,794819001,0.017532,0.042356,0.004559,0.015237,0.788932,0.037797,45,...,1,1,1,1,1,1,1,0,0,0
3125001,2018-09-20,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,884081001,0.017532,0.042356,0.004559,0.015237,0.788932,0.037797,45,...,0,1,1,1,1,1,1,0,1,0


In [11]:
print("Sum of query counts:", sum(train_baskets))
print("Number of data points in the training set:", len(train))

NameError: name 'train_baskets' is not defined

In [None]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    num_leaves = 20,
    boosting_type="dart",
    max_depth=15,
    n_estimators=500,
    importance_type='gain',
    verbose=10
)

In [None]:
ranker = ranker.fit(
    train.drop(columns = ['t_dat', 'customer_id', 'article_id', 'label']),
    train.pop('label'),
    group=train_baskets,
)

In [None]:
import os
submission_df = pd.read_csv(os.path.join(BASE_IN_PATH, "sample_submission.csv"))
candidates = prepare_candidates(submission_df.customer_id.unique(), 12)
candidates = (
    candidates
    .merge(user_features, on = ('customer_id'))
    .merge(item_features, on = ('article_id'))
)

In [None]:
preds = []
batch_size = 1_000_000
for bucket in tqdm(range(0, len(candidates), batch_size)):
  outputs = ranker.predict(
      candidates.iloc[bucket: bucket+batch_size]
      .drop(columns = ['customer_id', 'article_id'])
      )
  preds.append(outputs)

In [None]:
preds = np.concatenate(preds)
candidates['preds'] = preds
preds = candidates[['customer_id', 'article_id', 'preds']]
preds.sort_values(['customer_id', 'preds'], ascending=False, inplace = True)
preds = (
    preds
    .groupby('customer_id')[['article_id']]
    .aggregate(lambda x: x.tolist())
)
preds['article_id'] = preds['article_id'].apply(lambda x: ' '.join(['0'+str(k) for k in x]))

In [None]:
preds = submission_df[['customer_id']].merge(
    preds
    .reset_index()
    .rename(columns = {'article_id': 'prediction'}), how = 'left')
preds['prediction'].fillna(' '.join(['0'+str(art) for art in dummy_list_2w]), inplace = True)
preds.to_csv('submisssion_lightgbm_21_500.csv', index = False)

In [None]:
preds

In [None]:
import pandas as pd
sub0 = pd.read_csv('/kaggle/input/sub-ensemble/submission (1).csv').sort_values('customer_id').reset_index(drop=True)
sub1 = pd.read_csv('/kaggle/input/sub-ensemble/submission (2).csv').sort_values('customer_id').reset_index(drop=True)
sub2 = pd.read_csv('/kaggle/input/sub-ensemble/submission.csv').sort_values('customer_id').reset_index(drop=True)
sub3 = pd.read_csv('/kaggle/input/sub-ensemble/submissions.csv').sort_values('customer_id').reset_index(drop=True)
sub4 = pd.read_csv('/kaggle/input/sub-gbm/submisssion_lightgbm_15_400.csv').sort_values('customer_id').reset_index(drop=True)
sub5 = pd.read_csv('/kaggle/input/sub-gbm/submisssion_lightgbm_5_200.csv').sort_values('customer_id').reset_index(drop=True)

In [None]:
sub0.columns = ['customer_id', 'prediction0']
sub0['prediction1'] = sub1['prediction']
sub0['prediction2'] = sub2['prediction']
sub0['prediction3'] = sub3['prediction']
sub0['prediction4'] = sub4['prediction']
sub0['prediction5'] = sub5['prediction']
del sub1, sub2, sub3, sub4, sub5
sub0.head()

In [None]:
def cust_blend(dt, W = [1,1,1,1,1,1]):
    #Global ensemble weights
    #W = [1.15,0.95,0.85]
    
    #Create a list of all model predictions
    REC = []
    REC.append(dt['prediction0'].split())
    REC.append(dt['prediction1'].split())
    REC.append(dt['prediction2'].split())
    REC.append(dt['prediction3'].split())
    REC.append(dt['prediction4'].split())
    REC.append(dt['prediction5'].split())
    
    #Create a dictionary of items recommended. 
    #Assign a weight according the order of appearance and multiply by global weights
    res = {}
    for M in range(len(REC)):
        for n, v in enumerate(REC[M]):
            if v in res:
                res[v] += (W[M]/(n+1))
            else:
                res[v] = (W[M]/(n+1))
    
    # Sort dictionary by item weights
    res = list(dict(sorted(res.items(), key=lambda item: -item[1])).keys())
    
    # Return the top 12 itens only
    return ' '.join(res[:12])

sub0['prediction'] = sub0.apply(cust_blend, W = [1.05,1.00,0.9,1.05,1.00, 1.00], axis=1)
sub0.head()

In [None]:
del sub0['prediction0']
del sub0['prediction1']
del sub0['prediction2']
del sub0['prediction3']
del sub0['prediction4']
del sub0['prediction5']
sub0.to_csv('submission-blend.csv', index=False)