**This notebook tries a binary-classification approach with LightGBM, in the style of CTR prediction.**  
**Please upvote if this notebook is useful!**

In [None]:
import pandas as pd
import numpy as np
import gc

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

import os
import joblib
import re
import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
class Config:
    """Central place for the file paths and run settings used by this notebook."""

    # --- Input files -------------------------------------------------------
    # Only `transaction_2020_path`, `customer_path`, `article_path` and
    # `sample_submission_path` are read by the cells shown in this notebook;
    # the remaining paths are not used by the visible (uncommented) code.
    transaction_path = "../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv"
    transaction_2020_path = "../input/h-and-m-split-dataset-by-year/transactions_train_2020.csv"
    transaction_2019_path = "../input/h-and-m-split-dataset-by-year/transactions_train_2019.csv"
    customer_path = "../input/h-and-m-personalized-fashion-recommendations/customers.csv"
    article_path = "../input/h-and-m-personalized-fashion-recommendations/articles.csv"
    image_feat_path = "../input/h-and-m-swint-image-embedding/swin_tiny_patch4_window7_224_emb.csv.gz"
    sample_submission_path = "../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv"

    # Directory the merged feature table is pickled into (created below).
    output_dir = "../output/"
    # Transactions with `t_dat` earlier than this date are discarded.
    #start_date = '2020-08-01'
    start_date = '2020-09-15'

    # Embedding widths; only referenced by the commented-out image/text code.
    image_feat_dim = 768
    text_feat_dim = 384
    
    # Number of stratified CV folds (one LightGBM model is saved per fold).
    #n_fold = 2
    n_fold = 5
    # Seed passed to the fold splitter and to LGBMClassifier.
    seed = 2022
    # Extra keyword arguments forwarded to LGBMClassifier.
    lgbm = {"n_estimators" :50}

    # Name of the binary target column (1 = observed pair, 0 = shuffled pair).
    label = "label"

# Make sure the output directory exists before anything is written to it.
os.makedirs(Config.output_dir, exist_ok=True)

In [None]:
# https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    Handling per column dtype:

    * object / pandas string columns are converted to ``category``;
    * signed integers are narrowed to the smallest signed integer type whose
      range contains the column's min and max (bounds inclusive);
    * unsigned integers are narrowed within the unsigned family;
    * floats are narrowed to ``float16`` (if ``use_float16``) or ``float32``
      when the value range fits;
    * every other dtype (bool, datetime, category, nullable extension types)
      is left untouched.

    A column is never widened: candidates that are not strictly smaller than
    the current dtype are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    use_float16 : bool, default True
        Allow narrowing floats to ``float16``. The check is range-only, so
        precision can be lost; pass ``False`` to stop at ``float32``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy signed/unsigned/float columns are range-checked.
        # bool, datetime and extension dtypes are skipped: widening them to a
        # float (as a "not int => float" rule would do) wastes memory or fails.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            candidates, limits = float_types, np.finfo
        elif col_type.kind == 'i':
            candidates, limits = signed_types, np.iinfo
        else:
            candidates, limits = unsigned_types, np.iinfo

        for candidate in candidates:
            # Candidates are ordered by size; stop once they no longer shrink.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            info = limits(candidate)
            # NaN min/max (empty or all-NaN column) fails both comparisons,
            # so such a column keeps its dtype.
            if info.min <= c_min and c_max <= info.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    decreased = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decreased))

    return df

def import_data(file):
    """Read a CSV file into a DataFrame and shrink its memory footprint.

    ``parse_dates=True`` only asks pandas to parse the index, and no index
    column is requested here; ``keep_date_col`` only matters when several
    columns are combined into one date and is deprecated in recent pandas.
    Neither option changes the result, so both are omitted.

    Parameters
    ----------
    file : str or path-like or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame after ``reduce_mem_usage`` has been applied.
    """
    df = pd.read_csv(file)
    return reduce_mem_usage(df)

# Prepare truth/false

Observed transactions (user-item pairs) are treated as positive examples.  
Negative examples are created by shuffling the article side of those user-item pairs.

In [None]:
# Load the 2020 transactions and keep only rows on or after Config.start_date
# (string comparison on the `t_dat` column), then downcast the dtypes.
df_trans = (
    pd.read_csv(Config.transaction_2020_path)
    .loc[lambda frame: frame["t_dat"] >= Config.start_date]
    .reset_index(drop=True)
)
df_trans = reduce_mem_usage(df_trans)

In [None]:
# Take an explicit copy: a label column is added to df_truth further down, and
# writing into a column selection of df_trans triggers SettingWithCopyWarning.
df_truth = df_trans[["customer_id", "article_id"]].copy()
df_truth.head()

In [None]:
del df_trans

In [None]:
# Build negatives by permuting the article column against the customer column.
# The shuffle is seeded with Config.seed so a fresh run reproduces the same
# negative set (and therefore the same folds and scores).
# NOTE(review): a shuffled pair can coincide with a real purchase, which would
# then appear with both labels - confirm whether such pairs should be dropped.
df_false = df_truth.copy()
df_false["article_id"] = (
    df_false["article_id"].sample(frac=1, random_state=Config.seed).to_numpy()
)
df_false.head()

In [None]:
# Observed pairs are the positive class, shuffled pairs the negative class.
df_truth[Config.label] = 1
df_false[Config.label] = 0

In [None]:
# Stack positives and negatives into one frame. ignore_index=True gives the
# result a fresh 0..n-1 index; without it every index label would occur twice
# (once per class), which makes label-based row selection ambiguous.
df_truth = pd.concat([df_truth, df_false], ignore_index=True)
df_truth.shape

In [None]:
df_truth[df_truth["label"] ==1].shape,  df_truth[df_truth["label"] ==0].shape, 

In [None]:
del df_false

# Preprocessing
## prepare article feat

In [None]:
df_article = import_data(Config.article_path)
#df_image = import_data(Config.image_feat_path)
#df_text = import_data(Config.text_feat_path)

In [None]:
def get_table_feat(df):
    """Turn the raw article table into a model-ready feature table.

    Four columns (``index_code``, ``prod_name``, ``detail_desc``,
    ``department_name``) are removed, and the ``*_name`` columns listed below
    are one-hot encoded with ``pandas.get_dummies``. All remaining columns are
    passed through unchanged. The input frame is not modified.
    """
    dropped_columns = ["index_code", "prod_name", "detail_desc", "department_name"]

    one_hot_columns = [
        "product_type_name",
        "product_group_name",
        "graphical_appearance_name",
        "colour_group_name",
        "perceived_colour_value_name",
        "perceived_colour_master_name",
        "index_name",
        "index_group_name",
        "section_name",
        "garment_group_name",
    ]

    trimmed = df.drop(columns=dropped_columns)
    return pd.get_dummies(trimmed, columns=one_hot_columns)
    
def create_article_feat(df_article):
    """Build the article feature table from the raw article table.

    Currently this is only the tabular features produced by
    ``get_table_feat``; merging image embeddings on ``article_id`` is disabled.
    """
    return get_table_feat(df_article)

In [None]:
#df_article_feat = create_article_feat(df_article, df_image, df_text)
df_article_feat = create_article_feat(df_article)

In [None]:
df_article_feat.head()

In [None]:
#del df_article, df_image, df_text
del df_article

In [None]:
df_article_feat = reduce_mem_usage(df_article_feat)

## prepare customer feat

In [None]:
df_customer = pd.read_csv(Config.customer_path)

In [None]:
def create_customer_feat(df):
    """Build the customer feature table from the raw customer table.

    Steps: drop ``postal_code``; fill missing ``FN``/``Active``/``age`` with 0
    and missing ``club_member_status``/``fashion_news_frequency`` with the
    string "NONE"; replace ``age`` by ``log1p(age)``; one-hot encode the two
    categorical status columns. The input frame is not modified.
    """
    # Fill value per column, applied in this order.
    missing_defaults = {
        "FN": 0,
        "Active": 0,
        "club_member_status": "NONE",
        "fashion_news_frequency": "NONE",
        "age": 0,
    }

    customers = df.drop(["postal_code"], axis=1)
    for column, default in missing_defaults.items():
        customers.loc[:, column] = customers[column].fillna(default)

    # Missing ages were filled with 0 above, so they map to log1p(0) == 0.
    customers.loc[:, "age"] = np.log1p(customers["age"])

    return pd.get_dummies(
        customers, columns=["club_member_status", "fashion_news_frequency"]
    )

In [None]:
df_customer_feat = create_customer_feat(df_customer)

In [None]:
del df_customer
df_customer_feat = reduce_mem_usage(df_customer_feat)

## Merge all feats and construct dataset

In [None]:
gc.collect()

In [None]:
 # https://github.com/awslabs/autogluon/issues/399
# Strip every character outside [A-Za-z0-9_] from the column names of both
# feature tables (see the issue linked above for the motivation).
_invalid_name_chars = re.compile('[^A-Za-z0-9_]+')
df_article_feat = df_article_feat.rename(columns=lambda name: _invalid_name_chars.sub('', name))
df_customer_feat = df_customer_feat.rename(columns=lambda name: _invalid_name_chars.sub('', name))

In [None]:
# def create_dataset(df_truth, df_article_feat, df_customer_feat):

#     df_data = df_truth.merge(df_article_feat, on="article_id", how='left')
#     df_data = df_data.merge(df_customer_feat, on = "customer_id", how='left')    
#     df_data = df_data.drop(["customer_id", "article_id"], axis=1)    
#     df_data = df_data.fillna(0)

#     # https://github.com/awslabs/autogluon/issues/399
#     df_data = df_data.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

#     return df_data

In [None]:
# https://www.kaggle.com/tkm2261/fast-pandas-left-join-357x-faster-than-pd-merge

# Index both feature tables by their id so rows can be fetched with reindex().
# The guards make this cell safe to re-run: once the id column has become the
# index, calling set_index on it again would raise KeyError.
if df_article_feat.index.name != "article_id":
    df_article_feat = df_article_feat.set_index("article_id")
if df_customer_feat.index.name != "customer_id":
    df_customer_feat = df_customer_feat.set_index("customer_id")

def create_dataset_faster(df_truth, df_article_feat, df_customer_feat):
    """Attach article and customer features to each (customer, article) row.

    ``df_article_feat`` and ``df_customer_feat`` must be indexed by
    ``article_id`` and ``customer_id`` respectively. Features are looked up
    with ``reindex`` and glued on column-wise, which stands in for two left
    joins. The two id columns are removed from the result, and every missing
    value (including rows whose id is absent from a feature table) becomes 0.

    Returns a new frame with a fresh 0..n-1 index; the inputs are not modified.
    """
    pairs = df_truth.reset_index(drop=True)

    # Row i of each lookup corresponds to row i of `pairs`.
    article_rows = df_article_feat.reindex(df_truth['article_id'].values).reset_index(drop=True)
    with_articles = pd.concat([pairs, article_rows], axis=1)

    customer_rows = df_customer_feat.reindex(with_articles['customer_id'].values).reset_index(drop=True)
    merged = pd.concat([with_articles, customer_rows], axis=1)

    features = merged.drop(["customer_id", "article_id"], axis=1)
    return features.fillna(0)

In [None]:
df_data = create_dataset_faster(df_truth, df_article_feat, df_customer_feat)

In [None]:
df_data.to_pickle(f"{Config.output_dir}/feature.pkl")

In [None]:
df_data.columns.shape

In [None]:
df_data[df_data["label"] ==1].shape,  df_data[df_data["label"] ==0].shape, 

# Training

In [None]:
def train(df_data):
    """Train one LightGBM classifier per stratified fold and score each by AUC.

    Every column of ``df_data`` except ``Config.label`` is used as a feature.
    Each fitted model is saved as ``lgbm_fold_{fold}.joblib`` in the current
    working directory.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Feature table that contains the ``Config.label`` column.

    Returns
    -------
    list of float
        Validation ROC-AUC of each fold, in fold order.
    """
    cols = [col for col in df_data.columns if Config.label != col]

    folds = StratifiedKFold(n_splits=Config.n_fold, random_state=Config.seed, shuffle=True)
    scores = []

    for fold, (train_idx, val_idx) in enumerate(folds.split(df_data, df_data[Config.label])):
        print(f"=====fold {fold}=======")

        # split() yields positional indices, so select rows by position; .loc
        # would only be equivalent when the index happens to be 0..n-1.
        df_train = df_data.iloc[train_idx].reset_index(drop=True)
        df_val = df_data.iloc[val_idx].reset_index(drop=True)

        print("train shape", df_train.shape, "test shape", df_val.shape)

        model = LGBMClassifier(random_state=Config.seed, **Config.lgbm)

        # Callbacks are created per fold so no state is shared between fits.
        # NOTE: with Config.lgbm["n_estimators"] = 50 a patience of 1000 rounds
        # can never trigger, so early stopping is effectively disabled.
        model.fit(
            df_train[cols],
            df_train[Config.label],
            eval_set=[(df_val[cols], df_val[Config.label])],
            callbacks=[early_stopping(1000), log_evaluation(period=100)],
            eval_metric="auc",
        )

        # ROC-AUC is a ranking metric: score it on the positive-class
        # probability. Hard 0/1 labels from predict() collapse the ROC curve
        # to a single operating point.
        val_pred = model.predict_proba(df_val[cols])[:, 1]
        val_score = roc_auc_score(df_val[Config.label], val_pred)
        scores.append(val_score)

        # save_model
        joblib.dump(model, f"lgbm_fold_{fold}.joblib")

    return scores

In [None]:
scores = train(df_data)

In [None]:
print(scores)
print(np.mean(scores))

In [None]:
df_data.shape

# Feature importance

In [None]:
def get_feat_imp(df_data):
    """Average the feature importances of the saved fold models.

    Loads ``lgbm_fold_{k}.joblib`` for ``k`` in ``range(Config.n_fold)`` from
    the current working directory and averages their ``feature_importances_``.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Only its column names are used: every column except ``Config.label``,
        in frame order, is taken as a feature name.

    Returns
    -------
    pandas.DataFrame
        Columns ``columns`` (feature name) and ``feat_imp`` (mean importance),
        sorted by ``feat_imp`` in descending order.
    """
    cols = [col for col in df_data.columns if Config.label != col]

    imps_list = []
    for _fold in range(Config.n_fold):
        with open(f"lgbm_fold_{_fold}.joblib", "rb") as f:
            model = joblib.load(f)
        imps_list.append(model.feature_importances_)

    imps = np.mean(imps_list, axis=0)
    # Use the name list directly; selecting df_data[cols] just to read its
    # column names would copy the whole feature matrix.
    df_imps = pd.DataFrame({"columns": cols, "feat_imp": imps})
    df_imps = df_imps.sort_values("feat_imp", ascending=False).reset_index(drop=True)

    return df_imps
 

In [None]:
df_fea_imp = get_feat_imp(df_data)
df_fea_imp.head(30)

In [None]:
# Draw on the Axes created here explicitly instead of relying on seaborn
# picking up the "current" axes, and title the figure so it stands alone.
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=df_fea_imp.head(30), x="feat_imp", y="columns", ax=ax)
ax.set_title("Top 30 features by mean importance across folds");

# Inference (only 10 samples)

In [None]:
df_submission = import_data(Config.sample_submission_path)
df_submission.head()

In [None]:
len(df_submission.iloc[0, 1].split(" "))

In [None]:
df_submission.shape

In [None]:
df_article = import_data(Config.article_path)
df_article = df_article[["article_id"]]

In [None]:
df_article.head()

In [None]:

def inference(df_submission, df_article, df_article_feat, df_customer_feat, models, cols, top_k=12):
    """Score every article for every customer and keep the best ``top_k``.

    For each customer in ``df_submission["customer_id"]`` the full article
    list is paired with that customer, features are assembled with
    ``create_dataset_faster``, the positive-class probabilities of all
    ``models`` are averaged, and the ids of the ``top_k`` highest-scoring
    articles are joined with spaces.

    Parameters
    ----------
    df_submission : pandas.DataFrame
        Must contain a ``customer_id`` column. It is not modified.
    df_article : pandas.DataFrame
        Must contain an ``article_id`` column listing the candidate articles.
    df_article_feat, df_customer_feat : pandas.DataFrame
        Feature tables indexed by ``article_id`` / ``customer_id``.
    models : sequence
        Fitted LightGBM classifiers; every model passed in is used.
    cols : list of str
        Feature columns, in the order the models were trained on.
    top_k : int, default 12
        Number of article ids to emit per customer.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df_submission`` whose ``prediction`` column holds the
        space-separated article ids.
    """
    article_candidates = []
    # Loop-invariant: the candidate id list is the same for every customer.
    article_ids = df_article["article_id"].tolist()

    for customer in tqdm.tqdm(df_submission["customer_id"]):
        _df = df_article.copy()
        _df["customer_id"] = customer
        _df = create_dataset_faster(_df, df_article_feat, df_customer_feat)
        _df = _df[cols]

        # Iterate over the models actually supplied (not Config.n_fold) and use
        # the public best_iteration_ attribute instead of the private one.
        preds = [
            model.predict_proba(_df, num_iteration=model.best_iteration_)[:, 1]
            for model in models
        ]
        score = np.mean(preds, axis=0)

        df_pred = pd.DataFrame({"article_id": article_ids, "score": score})
        df_pred = df_pred.sort_values("score", ascending=False).reset_index(drop=True)
        df_pred = df_pred.head(top_k)

        # NOTE(review): ids are formatted with str(); if article_id was parsed
        # as an integer, any leading zeros in the raw ids are lost here -
        # confirm the required submission format and zero-pad if needed.
        article_candidates.append(" ".join(str(article) for article in df_pred["article_id"]))

    # Work on a copy and replace the column wholesale: the caller may pass a
    # slice (e.g. .head(10)), and the existing `prediction` column may be
    # categorical, in which case an in-place `.loc[:, ...] =` write can fail
    # for values that are not existing categories.
    df_submission = df_submission.copy()
    df_submission["prediction"] = article_candidates

    return df_submission

**Predict for only 10 customers.  
Predicting for all customers takes a very long time. 😥**

In [None]:
df_article.shape

In [None]:
models = []
for _fold in range(Config.n_fold):
    with open(f"lgbm_fold_{_fold}.joblib", "rb") as f:
        model = joblib.load(f)
        models.append(model)
    
cols = [col for col in df_data.columns if Config.label != col]
df_sub = inference(df_submission.head(10), df_article, df_article_feat, df_customer_feat, models, cols)

In [None]:
df_sub.head()

In [None]:
df_sub.to_csv("submit.csv", index=None)