
<font size="5">My solution is based on generating the top N purchases for several customer groups and using all previous orders of each customer. 
The candidates are then filtered with a heuristic and ranked with LGBMRanker.</font>

In [None]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
tqdm.pandas()
import gc
import seaborn as sns
import matplotlib.pyplot as plt
import cudf
import cupy
import lightgbm as lgb
from sklearn.base import BaseEstimator, TransformerMixin

# Notebook-wide settings: wide pandas display, default seaborn figure size,
# and the "managed" cuDF allocator.
pd.set_option("display.max_columns", 500)
sns.set(rc={"figure.figsize": (11.7, 8.27)})
cudf.set_allocator("managed")

<font size="5">Preprocessing</font>

In [None]:
# Read the customer table with pandas so the id <-> row-index lookups can be
# built as plain dicts before the frame is moved to the GPU.
customers = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/customers.csv")

# Two-way mapping between the original customer_id value and its row index.
id_to_index_dict = {
    cust_id: row_idx
    for cust_id, row_idx in zip(customers["customer_id"], customers.index)
}
index_to_id_dict = {
    row_idx: cust_id
    for row_idx, cust_id in zip(customers.index, customers["customer_id"])
}

# customer_id2 is the compact integer id used in place of customer_id later on.
customers = cudf.DataFrame(
    customers.assign(customer_id2=customers["customer_id"].map(id_to_index_dict))
)

In [None]:
class Categorize(BaseEstimator, TransformerMixin):
    """Encode every column of a DataFrame as integer category codes.

    For each column, ``fit`` keeps the values that occur more than
    ``min_examples`` times, in ``value_counts`` order (most frequent first).
    ``transform`` replaces each value by its position in that list; values
    that were not kept (and missing values) get code -1.
    """

    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []

    def fit(self, X, y=None):
        """Learn the kept categories of each column of ``X``; returns ``self``.

        ``y`` is accepted (and ignored) so the class follows the usual
        transformer signature.
        """
        # Rebuild the list on every fit. Appending to the existing list would
        # leave the categories of a previous fit at the front, and transform()
        # indexes the list by column position.
        self.categories = []
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            self.categories.append(vc[vc > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        """Return a new DataFrame (default index) holding the codes of ``X``."""
        data = {
            X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes
            for i in range(X.shape[1])
        }
        return pd.DataFrame(data=data)

In [None]:
# Every fashion_news_frequency value other than 'Regularly'/'Monthly'
# (i.e. everything isin() does not match) is replaced by the literal string 'nan'.
to_nan_fashion = ~customers['fashion_news_frequency'].isin(['Regularly', 'Monthly'])
customers.loc[to_nan_fashion, 'fashion_news_frequency'] = 'nan'

# Downcast the numeric columns to int8. Missing FN/Active values become -1;
# 'age' is cast without filling (missing ages are imputed in a later cell).
for col in ['FN', 'Active', 'age']:
    if col != 'age':
        customers[col].fillna(-1, inplace=True)
    customers[col] = customers[col].astype('int8')
    
# Replace the string columns by integer codes produced by Categorize
# (the encoder works on pandas, hence the round trip through to_pandas()).
for col in ['club_member_status', 'postal_code','fashion_news_frequency']:
    customers_pd = customers[[col]].copy().to_pandas()
    customers[col] = Categorize().fit_transform(customers_pd[[col]])[col]
    
# Free the temporary pandas copy.
del(customers_pd)
gc.collect()

In [None]:
# Load the transactions and join the customer table onto them so the original
# customer_id can be replaced by the compact integer id (customer_id2).
df = cudf.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv'
).merge(customers, on='customer_id')
df['customer_id'] = df['customer_id2']
df = df.drop(columns=['customer_id2'])
df['t_dat'] = cudf.to_datetime(df['t_dat'])

# From here on the customer table is keyed by the integer id as well.
customers = customers.drop(columns='customer_id')
customers = customers.rename(columns={'customer_id2': 'customer_id'})

In [None]:
# Number the distinct transaction dates (dense rank, starting at 1) and derive
# a week number from that day number; the week replaces the raw date.
tmp = cudf.DataFrame(df.t_dat.unique())
tmp['day'] = tmp.t_dat.rank(method='dense')
tmp['week'] = tmp['day'] // 7 + 1

df = df.merge(tmp, on='t_dat')
df['week'] = df['week'].astype('int8')
df = df.drop(columns=['t_dat', 'day'])

del(tmp)
gc.collect()

In [None]:
# Collapse the transactions to one row per (customer, article, week):
# total amount paid, lowest price, number of purchases and mean sales channel.
weekly_keys = ['customer_id', 'article_id', 'week']
df = df.groupby(weekly_keys).agg({'price': ['sum', 'min', 'count'],
                                  'sales_channel_id': ['mean']})
df = df.reset_index()

# Flatten the two-level column index produced by the multi-aggregation.
df.columns = weekly_keys + ['ord_amt', 'price', 'qty', 'sales_channel_id']

df['qty'] = df['qty'].astype('int16')
df['sales_channel_id'] = df['sales_channel_id'].astype('float32')
del(weekly_keys)
gc.collect()

In [None]:
# Age -> age bucket lookup table (ages 16..99):
#   * 16-29: two-year buckets   (age_id 0..6)
#   * 30-59: five-year buckets  (age_id 7..12)
#   * 60-99: one shared bucket  (age_id 13)
# The table is built once from plain lists instead of growing a cuDF frame
# with repeated DataFrame.append calls.
age_buckets = (
    [(first_age, 2) for first_age in range(16, 30, 2)]
    + [(first_age, 5) for first_age in range(30, 60, 5)]
    + [(60, 40)]
)

bucket_ages = []
bucket_ids = []
for bucket_id, (first_age, width) in enumerate(age_buckets):
    bucket_ages.extend(range(first_age, first_age + width))
    bucket_ids.extend([bucket_id] * width)

age_group = cudf.DataFrame({"age": bucket_ages, "age_id": bucket_ids})

# Customers whose age is missing or outside 16..99 get no age_id here.
customers = cudf.merge(customers, age_group, on="age", how="left")

del(age_buckets, bucket_ages, bucket_ids)
gc.collect()

In [None]:
# Article attributes that are loaded, with the compact dtype used for each.
dtypes = {
    'article_id': 'int32',
    'product_type_no': 'int16',
    'department_no': 'int16',
    'graphical_appearance_no': 'int8',
    'garment_group_no': 'int8',
    'colour_group_code': 'int8',
    'perceived_colour_value_id': 'int8',
    'perceived_colour_master_id': 'int8',
    'index_group_no': 'int8',
    'section_no': 'int8',
}

# Only the columns listed above are read from the articles file.
articles = cudf.read_csv(
    "../input/h-and-m-personalized-fashion-recommendations/articles.csv",
    usecols=list(dtypes),
    dtype=dtypes,
)


In [None]:
def add_sex(add_to, week: int, df=df):
    """Attach a 'sex' column holding each customer's dominant index group.

    For the purchases made up to and including ``week``, quantities are
    summed per (customer, index_group_no); the group with the largest total
    is stored as 'sex' (ties go to the larger index_group_no). Customers
    without purchases in that period get 1.

    Parameters
    ----------
    add_to : cuDF frame with a 'customer_id' column.
    week   : last week of purchase history to use.
    df     : weekly purchase table (customer_id, article_id, week, qty, ...).

    Returns
    -------
    ``add_to`` left-joined with an int8 'sex' column.
    """
    history = df[df.week <= week]
    history = history.merge(articles[['article_id', 'index_group_no']],
                            on="article_id", how="left")
    qty_per_group = history.groupby(["customer_id", "index_group_no"]).qty.sum()
    cust_sex = qty_per_group.reset_index()

    # Ascending sort followed by tail(1) keeps the largest-quantity group of
    # every customer.
    ordered = cust_sex.sort_values(['qty', "index_group_no"]).to_pandas()
    dominant = ordered.groupby('customer_id').tail(1)

    sex = cudf.DataFrame(dominant[['customer_id', 'index_group_no']])
    sex.columns = ['customer_id', 'sex']

    add_to = add_to.merge(sex, on='customer_id', how='left')
    add_to["sex"] = add_to["sex"].fillna(1).astype('int8')

    del(history, qty_per_group, cust_sex, ordered, dominant, sex)
    gc.collect()
    return add_to

In [None]:
# Attach the dominant index group ('sex') computed from purchases up to week 105.
customers = add_sex(customers, 105) 

# Median age per 'sex' value (the frame is called age_mean, but the
# statistic used is the median).
age_mean = customers.groupby('sex') \
                    .age \
                    .median() \
                    .reset_index()

age_mean.columns = ['sex', "statistic_age"]

# Fill missing ages with the median age of the customer's 'sex' group.
customers = customers.merge(age_mean, on='sex', how="left")
customers.loc[customers["age"].isnull(), "age"] = customers["statistic_age"]
# Drop the helper column and the age_id computed before the ages were filled.
customers.drop(columns=["statistic_age", "age_id"], inplace=True)

# Look the age bucket up again so that the filled-in ages get an age_id too.
customers = customers.merge(age_group, on="age", how="left")
customers[['age', 'sex', 'age_id']] = customers[['age', 'sex', 'age_id']].astype('int8')

del(age_mean, age_group)
gc.collect()

In [None]:
def _quantile_bucket(values, quantile):
    """Return 1-based quantile bucket numbers for a cuDF series (0 = missing value)."""
    # labels=False makes qcut return the position of the bucket in ascending
    # value order, so a larger number always means a larger value and the
    # numbering is the same for every week. pd.factorize would number the
    # buckets by order of first appearance in the rows instead.
    buckets = pd.qcut(values.to_pandas(), quantile, labels=False)
    return buckets.fillna(-1).astype('int64').to_numpy() + 1


def add_rich(add_to, week: int, df=df, quantile=7):
    """Attach two spending-level features computed from purchases up to ``week``.

    * ``avg_art_price``    - quantile bucket of the customer's average price
      per purchased item (total amount / total quantity).
    * ``avg_week_ord_amt`` - quantile bucket of the customer's average amount
      spent per active week.

    Buckets are numbered 1..``quantile`` in ascending order of the underlying
    value. Customers without purchases in the period get 0 in both columns.

    Parameters
    ----------
    add_to   : cuDF frame with a 'customer_id' column.
    week     : last week of purchase history to use.
    df       : weekly purchase table with 'customer_id', 'week', 'ord_amt', 'qty'.
    quantile : number of quantile buckets passed to ``pd.qcut``.

    Returns
    -------
    ``add_to`` left-joined with the two int8 feature columns.
    """
    df = df[df.week <= week]

    # Average price paid per item over the whole period.
    cust_boughts = df.groupby('customer_id') \
                     .agg({'ord_amt': 'sum', 'qty': 'sum'}) \
                     .reset_index()
    cust_boughts['avg_art_price'] = _quantile_bucket(cust_boughts.ord_amt / cust_boughts.qty,
                                                     quantile)
    cust_boughts = cust_boughts.drop(columns=['ord_amt', 'qty'])

    # Average amount spent per week in which the customer bought something.
    cust_week_boughts_by_week = df.groupby(['customer_id', 'week']) \
                                  .ord_amt \
                                  .sum() \
                                  .reset_index()
    cust_week_boughts = cust_week_boughts_by_week.groupby('customer_id') \
                                                 .ord_amt \
                                                 .mean() \
                                                 .reset_index()
    cust_week_boughts.columns = ['customer_id', 'avg_week_ord_amt']
    cust_week_boughts['avg_week_ord_amt'] = _quantile_bucket(cust_week_boughts.avg_week_ord_amt,
                                                             quantile)

    add_to = add_to.merge(cust_week_boughts, on='customer_id', how='left') \
                   .merge(cust_boughts, on='customer_id', how='left')

    # Customers with no purchase history fall into bucket 0.
    add_to[["avg_art_price", 'avg_week_ord_amt']] = add_to[["avg_art_price", 'avg_week_ord_amt']] \
                                                          .fillna(0) \
                                                          .astype('int8')

    del(cust_week_boughts, cust_week_boughts_by_week, cust_boughts)
    gc.collect()
    return add_to

def add_sales_chanel(add_to, week: int, df=df):
    """Attach each customer's preferred sales channel based on purchases up to ``week``.

    The mean ``sales_channel_id`` per customer is snapped to 1 (mean <= 1.33),
    2 (mean >= 1.66) or 1.5 (anything in between). Customers without purchases
    get 2. The column is returned as int8.

    NOTE(review): the final int8 cast turns the intermediate value 1.5 into 1,
    so the "in between" customers end up with the same code as channel 1.
    Confirm whether a separate code was intended.
    """
    history = df[df.week <= week]
    channel = history.groupby('customer_id').sales_channel_id.mean().reset_index()

    mostly_first = channel['sales_channel_id'] <= 1.33
    channel.loc[mostly_first, 'sales_channel_id'] = 1
    mostly_second = channel['sales_channel_id'] >= 1.66
    channel.loc[mostly_second, 'sales_channel_id'] = 2
    in_between = ~channel['sales_channel_id'].isin([1, 2])
    channel.loc[in_between, 'sales_channel_id'] = 1.5

    add_to = add_to.merge(channel, on='customer_id', how='left')
    add_to['sales_channel_id'] = add_to['sales_channel_id'].fillna(2).astype('int8')

    gc.collect()
    return add_to

In [None]:
# Mean article price per week, for weeks 103..105.
weekly_prices = [cudf.DataFrame()]
for week in range(103, 106):
    week_prices = df[df.week == week].groupby('article_id').price.mean().reset_index()
    week_prices['week'] = np.int8(week)
    weekly_prices.append(week_prices)

prices = cudf.concat(weekly_prices, ignore_index=True)
prices['price'] = prices['price'].astype('float32')
del(weekly_prices, week_prices)
gc.collect()

# Customer price-level bucket as of each of the same weeks (see add_rich).
weekly_richs = [cudf.DataFrame()]
for week in range(103, 106):
    week_richs = add_rich(customers[['customer_id']], week)
    week_richs['week'] = np.int8(week)
    weekly_richs.append(week_richs)

richs = cudf.concat(weekly_richs, ignore_index=True)
richs = richs[['customer_id', 'avg_art_price', 'week']]
del(weekly_richs, week_richs)
gc.collect()

<font size="5">Candidate-generating functions</font>


In [None]:
def best_sellers_by_group(data, groups: list, top_n: int, 
                          target_value='qty', target_column=['article_id']):
    """Return the ``top_n`` best-selling targets within every group.

    ``target_value`` is summed per (groups + target_column); within each
    combination of ``groups`` the ``top_n`` rows with the largest sum are
    kept. With ``groups == []`` the top is taken over the whole of ``data``
    and a constant int8 'key' column (value 1) is added so the result can be
    cross-joined to customers.

    The summed column is renamed to
    ``top_{top_n}_best_sell_{'_'.join(groups)}_{target_value}`` and stored in
    the smallest of int8/int16/int32 chosen from its maximum.

    Returns a cuDF DataFrame.
    """
    totals = data.groupby(groups + target_column)[target_value].sum().reset_index()
    # Ascending order, so the best sellers of every group are its last rows.
    totals = totals.sort_values(groups + [target_value]).to_pandas()

    if groups == []:
        top = totals.tail(top_n)
        top['key'] = np.int8(1)
    else:
        top = totals.groupby(groups, as_index=False).tail(top_n)

    largest = top[target_value].max()
    if largest >= 32767:
        compact_dtype = 'int32'
    elif largest >= 126:
        compact_dtype = 'int16'
    else:
        compact_dtype = 'int8'
    top[target_value] = top[target_value].astype(compact_dtype)

    feature_name = f'top_{top_n}_best_sell_' + '_'.join(groups) + f'_{target_value}'
    top = top.rename(columns={target_value: feature_name})

    gc.collect()
    return cudf.DataFrame(top)

In [None]:
def last_weeks_purchase_by_group(data, group_by: list, 
                                 last_weeks_range: list, target_col=['article_id']):
    """Count recent purchases per (group_by + target_col) over several look-back windows.

    For every ``w`` in ``last_weeks_range`` the quantity bought in weeks
    ``>= data.week.max() - w`` is summed and stored in a column named
    ``{'_'.join(group_by + target_col)}_last_weeks_{w}``. The windows are
    outer-joined, missing counts become 0, and the count columns are stored
    as int8 (int16 when the maximum is 126 or more). 'article_id' is cast to
    int32.

    Returns a cuDF DataFrame with the key columns followed by one count
    column per window.
    """
    keys = group_by + target_col
    feature_prefix = '_'.join(keys)
    combined = cudf.DataFrame(columns=keys)

    for window in last_weeks_range:
        first_week = data.week.max() - window
        window_qty = data[data.week >= first_week].groupby(keys).qty.sum().reset_index()
        window_qty.columns = keys + [f'{feature_prefix}_last_weeks_{window}']

        # The first non-empty window seeds the result; later ones are joined on.
        if combined.shape[0] == 0:
            combined = window_qty.copy()
        else:
            combined = combined.merge(window_qty, on=keys, how='outer')
        del(window_qty)
        gc.collect()

    not_counts = keys + ['week']
    count_columns = [col for col in combined.columns if col not in not_counts]

    for col in count_columns:
        compact_dtype = 'int16' if combined[col].max() >= 126 else 'int8'
        combined[col] = combined[col].fillna(0).astype(compact_dtype)

    combined['article_id'] = combined['article_id'].astype('int32')
    gc.collect()
    return cudf.DataFrame(combined)

In [None]:
def eval_train_candidates(train, test, method=None):
    """Print and return the precision/recall of a candidate set.

    Candidates generated for week ``w`` are compared with the purchases of
    week ``w + 1``: the candidates' week is shifted by one and joined to
    ``test`` on (week, customer_id, article_id).

    Parameters
    ----------
    train  : candidate rows with 'week', 'customer_id', 'article_id'.
    test   : actual purchases with the same key columns.
    method : optional label printed in front of the numbers.

    Returns
    -------
    (number of candidates, precision in %, recall in %). Precision is 0.0
    when there are no candidates and recall is 0.0 when there are no
    purchases, instead of raising ZeroDivisionError.
    """
    # Work on a copy so the caller's week column is not shifted.
    train = cudf.DataFrame(train).copy()

    test_len = test.shape[0]
    train_len = train.shape[0]
    train.week = train.week + 1
    true_preds_len = test.merge(train, on=['week', 'customer_id', 'article_id']).shape[0]

    train_recall = round(true_preds_len / test_len * 100, 2) if test_len else 0.0
    train_precision = round(true_preds_len / train_len * 100, 2) if train_len else 0.0

    print(method,  train_len, 'pr: ', train_precision, 'rec: ', train_recall)
    return train_len, train_precision, train_recall

<font size="5">Create train-test sets and train ranker</font>

In [None]:
def create_train_set(df, test_week, train_weeks):
    """Build the labelled candidate set for the weeks preceding ``test_week``.

    For every ``week`` in ``range(test_week - train_weeks, test_week)``:

    1. the purchases of ``week + 1`` become the positive labels (``y = 1``);
    2. candidates are generated from the history up to ``week``: the
       customer's own earlier purchases (counted over several look-back
       windows) plus the best sellers of ``week`` per (sex, age_id) group
       and overall;
    3. candidates for articles considered "stopped" are removed;
    4. labels are attached; candidates without a matching purchase get 0.

    Returns the concatenation over all weeks, with a 'week' column holding
    the week the candidates were generated from.
    """
    train_set = cudf.DataFrame()
    for week in range(test_week-train_weeks, test_week):
        
        train_set_temp = cudf.DataFrame()
        
        # Positive labels: distinct (customer, article) pairs bought in the following week.
        test = df[df.week == week + 1][['customer_id','article_id']] \
                 .drop_duplicates(['customer_id', 'article_id']) \
                 .merge(customers[['customer_id', 'age_id']], on='customer_id')
        test['y'] = np.int8(1)
        
        # Purchase history available when predicting week + 1.
        train = df[df.week <= week].merge(customers[['customer_id','age_id']], on='customer_id') \
                                   .drop(columns='sales_channel_id')
                
                
        # Articles treated as no longer on sale: for each article take the first
        # and last week with sales and the number of weeks with sales.
        # NOTE(review): for an article sold in every week of its span,
        # count == max - min + 1, so the test `max - min == count` below looks
        # off by one relative to "sold every week and then stopped" - confirm the intent.
        stoped = train.groupby(['week','article_id']) \
                      .agg({'qty':'sum'}) \
                      .reset_index() 
        stoped = stoped.groupby('article_id') \
                       .week \
                       .agg(['max','min', 'count']) \
                       .reset_index()
        stop_factor_1 = (stoped['max'] - stoped['min'] == stoped['count']) & (stoped['max'] <= week-1)
        # Articles whose last sale is more than one week before the current week.
        stop_factor_2 = (stoped['max'] < week-1)
        stoped = stoped.loc[stop_factor_1 | stop_factor_2, 'article_id']
      
            
        # Look-back windows (in weeks) for the per-customer purchase counts.
        week_range = [0, 2, 4, 105]          
        last_group = ['customer_id']

        # When labels exist, only customers that appear in them need candidates.
        # NOTE(review): 105 is hard-coded here - presumably the last week present
        # in df, for which no following-week purchases exist; confirm.
        if week != 105:
            last_purchase_by_cust = last_weeks_purchase_by_group(train[train.customer_id.isin(test.customer_id)], 
                                                                 last_group, week_range)
        else:
            last_purchase_by_cust = last_weeks_purchase_by_group(train, last_group, week_range)
            
        last_purchase_by_cust.customer_id = last_purchase_by_cust.customer_id.astype('int64')
        last_purchase_by_cust.article_id = last_purchase_by_cust.article_id.astype('int32') 

        # train_set_temp was created empty above, so the first branch is the one taken here.
        if train_set_temp.shape[0] == 0:
            train_set_temp = last_purchase_by_cust.copy()
        else:
            train_set_temp = train_set_temp.merge(last_purchase_by_cust, 
                                                  on=['customer_id', 'article_id'], how='outer')
            
        del(last_purchase_by_cust)
        gc.collect()
        
        # Best-seller candidates: top 40 per (sex, age_id) group and top 15 overall.
        for best_sell_group, top_n in [(['sex', 'age_id'], 40), 
                                       ([], 15)
                                      ]:

            # From here on `train` holds the current week only
            # (the filter changes nothing on the second pass).
            train = train[train.week == week]

            if 'sex' in best_sell_group:
                train = add_sex(train, week, df)
                test = add_sex(test, week, df)
            
            best_sellers = best_sellers_by_group(train, groups=best_sell_group, 
                                                 top_n=top_n, target_value='qty')
            # Without labels, candidates are generated for every customer.
            if test.shape[0] == 0: 
                best_sellers_by = customers[['customer_id'] + best_sell_group] 
            else:
                best_sellers_by = test.drop_duplicates('customer_id')[['customer_id'] + best_sell_group]
                
            if best_sell_group == []:
                # Cross join through the constant 'key' column: every customer gets every overall best seller.
                best_sellers_by['key'] = np.int8(1)
                best_sellers_by = best_sellers_by.merge(best_sellers, on='key')
                best_sellers_by.drop(columns='key', inplace=True)    
                
            else:
                best_sellers_by = best_sellers_by.merge(best_sellers, on=best_sell_group)
                
            train_set_temp = train_set_temp.merge(best_sellers_by, 
                                                  on=['customer_id', 'article_id'], how='outer')

            del(best_sellers_by)
            gc.collect()
            
        # Remove the candidates for "stopped" articles and report how many were dropped.
        train_set_temp_0 = train_set_temp.shape[0]
        train_set_temp = train_set_temp[~train_set_temp['article_id'].isin(stoped)]
        print(f'delected {train_set_temp_0 - train_set_temp.shape[0]}')
        del(stoped)
        gc.collect()
        
        # Attach the labels when there are any; when there are none, no 'y' column is added here.
        if test.shape[0] > 0:
            train_set_temp = train_set_temp.merge(test[['customer_id','article_id', 'y']], 
                                                  on=['customer_id', 'article_id'], how='left')

        train_set_temp['week'] = np.int8(week)
        # Missing values left by the outer/left merges (feature counts, and y for
        # candidates that were not bought) become 0.
        train_set_temp.fillna(0, inplace=True)
        train_set = cudf.concat([train_set, train_set_temp], ignore_index=True)
        
        del(train, test, train_set_temp)
        gc.collect() 

    return train_set

In [None]:
def add_cust_and_art_features(to_add, df=df, prices=prices, richs=richs):
    """Join article, customer, price and spending-level features onto candidate rows.

    * Article attributes and customer attributes ('sex', 'age_id') are
      inner-joined, but only the columns ``to_add`` does not already have.
    * The weekly mean price and the customer's price-level bucket are
      left-joined on (article_id, week) and (customer_id, week).
    * Columns whose name contains 'qty_' have missing values set to 0; a
      missing price becomes -1 (float32).

    Returns the enriched frame.
    """
    wanted_art_columns = ['product_type_no', 'colour_group_code', 'department_no', 
                          'section_no', 'garment_group_no']
    wanted_cust_columns = ['sex', 'age_id']

    present = to_add.columns
    missing_art = [col for col in wanted_art_columns if col not in present]
    missing_cust = [col for col in wanted_cust_columns if col not in present]

    if missing_art:
        to_add = to_add.merge(articles[['article_id'] + missing_art], on='article_id')
    if missing_cust:
        to_add = to_add.merge(customers[['customer_id'] + missing_cust], on='customer_id')

    to_add = to_add.merge(prices, on=['article_id', 'week'], how='left')
    to_add = to_add.merge(richs, on=['customer_id', 'week'], how='left')

    col_to_fill = [col for col in to_add.columns if 'qty_' in col]
    to_add[col_to_fill] = to_add[col_to_fill].fillna(0)
    to_add.price = to_add.price.fillna(-1).astype('float32')

    return to_add

In [None]:
def create_train_test(train_set, test_week):
    """Split the candidate set into ranker training rows and rows to score.

    Candidates from weeks up to ``test_week - 2`` are used for training and
    those from week ``test_week - 1`` are the rows to rank. Candidate
    precision/recall of the training part is printed. Training rows are kept
    only for (week, customer) pairs that have at least one positive label.
    Both parts then get article/customer features attached.

    Returns ``(train, test)``.
    """
    last_train_week = test_week-2
    train = train_set[train_set.week <= last_train_week]
    test = train_set[(train_set.week == test_week-1)]
    del(train_set)
    gc.collect()

    # Actual purchases in the weeks that follow each training week.
    first_label_week = train.week.min()+1
    last_label_week = train.week.max()+1
    in_label_weeks = (df.week >= first_label_week) & (df.week <= last_label_week)
    val_data = df[in_label_weeks].drop_duplicates(['customer_id', 'article_id', 'week'])
    eval_train_candidates(train, val_data)

    # (week, customer) pairs that contain at least one bought candidate.
    positives = train[train.y == 1]
    have_purc = positives.groupby(['week', 'customer_id']).y.max().reset_index()
    have_purc = have_purc.drop(columns=['y'])

    train = train.merge(have_purc, on=['week', 'customer_id'])
    del(have_purc)
    gc.collect()

    train = add_cust_and_art_features(train)
    test = add_cust_and_art_features(test)

    return train, test

In [None]:
def create_x_y_for_train(train, test, test_week):
    """Turn the candidate frames into the arrays expected by the ranker.

    Rows are ordered by (week, customer_id, article_id); one ranking group
    is one (week, customer_id) pair. The group sizes are computed with
    ``sort=True`` so that they are listed in the same key order as the rows -
    cuDF's ``groupby`` does not sort its result by default.

    Parameters
    ----------
    train, test : cuDF candidate frames with 'week', 'customer_id',
                  'article_id' and feature columns ('y' is needed in
                  ``train``, and in ``test`` unless ``test_week`` is 106).
    test_week   : 106 means "no labels for test" (submission run).

    Returns
    -------
    ``X, Y, train_baskets, x`` for ``test_week == 106``; otherwise
    ``X, Y, train_baskets, x, y, eval_baskets``. X/x are pandas feature
    frames (all columns except 'customer_id', 'y', 'week'), Y/y the labels
    and ``*_baskets`` the group sizes as numpy arrays.
    """
    sort_keys = ['week', 'customer_id', 'article_id']
    group_keys = ['week', 'customer_id']

    # Sort once and take features and labels from the same frame, so X and Y
    # always share one row order.
    train = train.sort_values(sort_keys)
    train_baskets = cupy.asnumpy(train.groupby(group_keys, sort=True) \
                                      .article_id \
                                      .count() \
                                      .values)
    columns_to_use = [x for x in train.columns if x not in ['customer_id', 'y', 'week']]
    X = train[columns_to_use].to_pandas()
    Y = train['y'].to_pandas()
    del(train)
    gc.collect()

    test = test.sort_values(sort_keys)
    if test_week != 106:
        eval_baskets = cupy.asnumpy(test.groupby(group_keys, sort=True) \
                                        .article_id \
                                        .count() \
                                        .values)
        y = test['y'].to_pandas()

    x = test[columns_to_use].to_pandas()
    del(test)
    gc.collect()

    if test_week != 106:
        return X, Y, train_baskets, x, y, eval_baskets
    else: 
        return X, Y, train_baskets, x
    
    

In [None]:
def train_ranker(X, Y, train_baskets, x=None, y=None, eval_baskets=None, 
                 n_estimators=100 ,learning_rate=0.1, max_depth=10, num_leaves=31, 
                 boosting_type='gbdt', reg_lambda=0, verbose=10):
    """Fit a LightGBM lambdarank model on the candidate rows.

    Parameters
    ----------
    X, Y          : training features and labels, ordered group by group.
    train_baskets : size of each consecutive ranking group in ``X``.
    x, y, eval_baskets : optional validation features, labels and group
                    sizes. The validation set is used only when ``y`` is given.
    n_estimators, learning_rate, max_depth, num_leaves, boosting_type,
    reg_lambda, verbose : passed on to ``lgb.LGBMRanker``.

    Returns
    -------
    ``(ranker, eval_result)`` where ``eval_result`` is the recorded
    validation history (keyed by the eval name "y"), or ``None`` when no
    validation labels were supplied.
    """
    eval_result = None
    # boosting_type is forwarded from the argument (it used to be fixed to
    # "gbdt", so the parameter had no effect).
    ranker = lgb.LGBMRanker(boosting_type=boosting_type,
                            learning_rate=learning_rate,
                            max_depth=max_depth,
                            objective="lambdarank",
                            metric="ndcg",
                            num_leaves=num_leaves,
                            n_estimators=n_estimators,
                            importance_type="gain",
                            reg_lambda=reg_lambda,
                            device="gpu",
                            max_bin=63,
                            gpu_use_dp=False,
                            random_seed=0,
                            verbose=verbose
                            )
    if y is not None:
        eval_result = {}
        callbacks = [lgb.record_evaluation(eval_result)]

        ranker = ranker.fit(X, Y, group=train_baskets,
                            eval_metric="ndcg",
                            eval_group=[eval_baskets],
                            eval_set=[(x, y)],
                            eval_names="y",
                            callbacks=callbacks,
                            verbose=verbose)
    else:
        ranker = ranker.fit(X, Y, group=train_baskets)
        gc.collect()

    return ranker, eval_result

In [None]:
def plot_eval_results(eval_result) -> None:
    """Plot the validation NDCG@1..5 curves recorded during training.

    ``eval_result`` is the history dict filled by the evaluation callback;
    the curves are read from ``eval_result['y']['ndcg@1']`` ...
    ``['ndcg@5']``. One line is drawn per cut-off (shown as 'fold' in the
    legend) with the boosting iteration on the x axis.
    """
    history = eval_result['y']
    eval_results = pd.DataFrame()

    for cutoff in range(1, 6):
        curve = history[f'ndcg@{cutoff}']
        n_iterations = len(curve)
        curve_frame = pd.DataFrame({'fold': [cutoff] * n_iterations,
                                    'iteration': range(1, n_iterations + 1),
                                    'val': curve})
        eval_results = pd.concat([eval_results, curve_frame], ignore_index=True)

    sns.lineplot(data=eval_results, x='iteration', y='val', hue='fold')
    plt.show()
    return

<font size="5">Evaluate predictions</font>


In [None]:
def apk(actual: list, predicted: list, k=12) -> float:
    """
    Average precision at k of one prediction list.

    Only the first ``k`` predictions count. A prediction is a hit when it is
    in ``actual`` and has not already appeared earlier in the list. The sum
    of precision-at-hit values is divided by ``min(len(actual), k)``; an
    empty ``actual`` gives 0.0.
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0
    for rank, item in enumerate(top_k, start=1):
        if item in actual and item not in top_k[:rank - 1]:
            hits += 1.0
            precision_sum += hits / float(rank)

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)

def _same_kind_of_customer_key(predictions, val_set) -> bool:
    """True when both frames carry a 'customer_id' column of comparable kind."""
    if 'customer_id' not in predictions.columns or 'customer_id' not in val_set.columns:
        return False
    is_numeric = pd.api.types.is_numeric_dtype
    return is_numeric(predictions['customer_id']) == is_numeric(val_set['customer_id'])


def eval_sub(predictions, val_set, k=12) -> float:
    """
    Computes the MAP@k between two DataFrames.

    Both frames hold one row per customer with a space-separated string of
    article ids in ``article_id``. When both also have a comparable
    ``customer_id`` column, each row of ``val_set`` is scored against the
    predictions of the same customer, and a customer without predictions
    scores 0. Otherwise the rows are paired by position, which requires both
    frames to list the same customers in the same order.
    """
    actual_lists = val_set.article_id.str.split()
    predicted_lists = predictions.article_id.str.split()

    if _same_kind_of_customer_key(predictions, val_set):
        # Pair by customer so differing row order or missing customers cannot
        # silently shift the comparison.
        predicted_by_customer = dict(zip(predictions['customer_id'], predicted_lists))
        pairs = [(actual, predicted_by_customer.get(customer, []))
                 for customer, actual in zip(val_set['customer_id'], actual_lists)]
    else:
        pairs = list(zip(actual_lists, predicted_lists))

    apks = [apk(actual, pred, k=k) for actual, pred in pairs]

    return np.mean(apks)

def evaluate(predictions, test_week, val_data=df) -> float:
    """Score ranked candidates against the purchases of ``test_week`` (MAP@12).

    The ground truth is read from '../input/hmmmm/val_data.csv' when that
    file exists; otherwise it is built from ``val_data`` by collecting the
    articles each customer bought in ``test_week``. For every customer the 12
    candidates with the highest 'preds' are kept. Article ids are written as
    '0'-prefixed strings joined by spaces before scoring.

    The score is printed with a decimal comma and returned.
    """
    def as_submission_string(articles):
        return ' '.join('0' + str(x) for x in articles)

    cached_truth = '../input/hmmmm/val_data.csv'
    if os.path.exists(cached_truth):
        val_data = pd.read_csv(cached_truth)
        val_data = val_data[val_data.week == test_week]
    else:
        week_purchases = val_data[val_data.week == test_week].to_pandas()
        val_data = week_purchases.groupby('customer_id').article_id.apply(list).reset_index()
        val_data.article_id = val_data.article_id.apply(as_submission_string)

    # Best-scored candidates first within every customer, then keep 12 of them.
    ranked = predictions.sort_values(['customer_id', 'preds'], ascending=False).to_pandas()
    top12 = ranked.groupby('customer_id').head(12)
    predictions = top12.groupby('customer_id').article_id.apply(list).reset_index()
    predictions.article_id = predictions.article_id.apply(as_submission_string)

    score = eval_sub(predictions, val_data)
    print(str(score).replace('.', ','))

    del(predictions, val_data, ranked, top12)
    gc.collect()
    return score

<font size="5">Get predictions</font>

In [None]:
def get_preds(test_week, ranker, test, x, batch_size=5000000):
    """Score the candidate rows with the trained ranker.

    ``test`` is sorted by (week, customer_id, article_id); ``x`` must hold
    the features of those rows in that same order.

    * ``test_week == 106`` (submission run): predictions are computed in
      batches of ``batch_size`` rows and ``test`` is returned with a 'preds'
      column.
    * otherwise: predictions are computed in one call, scored with
      ``evaluate`` and ``(test, score)`` is returned.

    Note: a 'preds' column is also added to the frame passed in as ``test``.
    """
    test['preds'] = 0.
    score = 0.

    test = test.sort_values(['week', 'customer_id','article_id'])

    if test_week == 106:
        preds_len = x.shape[0]
        # Look the column position up rather than assuming 'preds' is the
        # fourth column, so frames with extra columns are handled correctly.
        preds_col = list(test.columns).index('preds')

        for batch_start in tqdm(range(0, preds_len, batch_size)):
            batch_end = min(batch_start + batch_size, preds_len)
            test.iloc[batch_start:batch_end, preds_col] = ranker.predict(x.iloc[batch_start:batch_end])

        del(x)
        gc.collect()
        return test
    else:
        test['preds'] = ranker.predict(x) 
        score = evaluate(test, test_week)
        gc.collect()
        return test, score

<font size="5">Create sets, train and evaluate predictions</font>

In [None]:
def print_feat_importance(ranker, columns_to_use):
    """Print every feature with its share of the total importance, largest first.

    ``columns_to_use`` must list the feature names in the order of
    ``ranker.feature_importances_``.
    """
    importances = ranker.feature_importances_
    total = importances.sum()
    for position in importances.argsort()[::-1]:
        print(columns_to_use[position], importances[position] / total)


In [None]:
# To produce a submission, test_weeks must be [106] (the integer, not the string '106'):
# the main loop and the helper functions compare test_week with the number 106.
test_weeks = [106]
train_weeks = 2
# One extra week is generated because the candidates of week test_week - 1
# are the rows to score, not rows to train on (see create_train_test).
train_weeks += 1
# Number of times the ranker is trained per test week in the validation branch.
inerations = 1

    
# NOTE(review): plot_eval, plot_feat and model are not read anywhere in the visible code.
plot_eval=True
plot_feat = True
model='lightgbm'
# LightGBM ranker hyperparameters.
learning_rate = 0.1
verbose = -1
max_depth = -1
n_estimators = 100
num_leaves = 32
reg_lambda = 1
boosting_type= 'gbdt'


In [None]:
%%time
scores = cudf.DataFrame()

# Hyperparameters from the configuration cell. They are passed to the ranker
# in BOTH branches below, so the submission model is trained with the same
# settings that were validated (the submission branch used to fall back to
# train_ranker's own defaults).
ranker_params = dict(learning_rate=learning_rate,
                     max_depth=max_depth,
                     num_leaves=num_leaves,
                     n_estimators=n_estimators,
                     boosting_type=boosting_type,
                     reg_lambda=reg_lambda,
                     verbose=verbose)

for test_week in tqdm(test_weeks):
    print(test_week)
    # Candidates and labels for the weeks preceding test_week.
    train_set = create_train_set(df, test_week=test_week, train_weeks=train_weeks)

    gc.collect()

    train, test = create_train_test(train_set, test_week=test_week)
    print('train_test - ready')
    
    if test_weeks == [106]:
        del(train_set)
    gc.collect()
    
    if test_week < 106:
        # Validation run: labels for test_week exist, so the ranker can be evaluated.
        X, Y, train_baskets, x, y, eval_baskets = create_x_y_for_train(train, test, test_week)
        
        for iteration in tqdm(range(inerations)):
            ranker, eval_result = train_ranker(X, Y, train_baskets, 
                                               x, y, eval_baskets, 
                                               **ranker_params)
            print(f'ranker_{iteration} - ready')
            
            plot_eval_results(eval_result)
            print_feat_importance(ranker, x.columns)
                
            test, score = get_preds(test_week, ranker, test, x)
            
            scores_temp = cudf.DataFrame({'week': [test_week], 
                                          'score': [score], 
                                          'iter': [iteration]})
            
            scores = cudf.concat([scores, scores_temp], ignore_index=True)
            
    else: 
        # Submission run: no labels for week 106, train on everything and only predict.
        X, Y, train_baskets, x = create_x_y_for_train(train, test, test_week)
        del(train)
        gc.collect()
        
        # get_preds only needs the key columns of the candidate rows.
        test = test[['week', 'customer_id', 'article_id']]
        ranker, _ = train_ranker(X, Y, 
                                 train_baskets, x,
                                 **ranker_params)
        print_feat_importance(ranker, x.columns)
        
        del(X, Y, train_baskets)
        gc.collect()
        
        test = get_preds(test_week, ranker, test, x)
        
        del(x)
        gc.collect()

<font size="5">Create and save submission</font>

In [None]:
# customer_id -> list of the (up to) 12 best-scored article ids.
# Descending sort puts each customer's highest 'preds' first, so head(12)
# keeps the top of every customer's ranking.
c_id2predicted_article_ids = (
    test.sort_values(['customer_id', 'preds'], ascending=False)
        .to_pandas()
        .groupby('customer_id')
        .head(12)
        .groupby('customer_id')['article_id']
        .apply(list)
        .to_dict()
)

In [None]:
# Load the submission template and translate its customer ids to the integer
# ids used throughout the notebook.
sub = pd.read_csv(
    '/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv'
).assign(customer_id2=lambda frame: frame['customer_id'].map(id_to_index_dict))
gc.collect()

In [None]:
# Look up every customer's predicted articles.
sub['prediction'] = sub['customer_id2'].map(c_id2predicted_article_ids)
del(c_id2predicted_article_ids)
gc.collect()

# Article ids are written as '0'-prefixed strings separated by spaces.
# A customer without any candidate maps to NaN above; such rows get an empty
# prediction instead of raising a TypeError in join().
sub['prediction'] = sub['prediction'].apply(
    lambda articles: ' '.join('0' + str(x) for x in articles) if isinstance(articles, list) else ''
)

In [None]:
# Drop the helper id column and write the gzip-compressed submission file.
sub = sub.drop(columns='customer_id2')
gc.collect()

sub_name = '''20'''
submission_path = f'{sub_name}.csv.gz'
sub.to_csv(submission_path, index=False)

del(sub)
gc.collect()

<font size="3">P.S. My best submission is a blend of 6 different submissions.</font>