In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import lightgbm as lgb
from sklearn.model_selection import train_test_split

In [2]:
# --- Configuration ---
DATA_PATH = "../data/h-and-m-personalized-fashion-recommendations/"
N_POPULAR_CANDIDATES = 12  # Number of popular items to recommend
NEG_PER_USER = 5 #negative sample per user

In [3]:
# --- 1. Data Loading ---

print("1. Loading Data...")

# Load core files
df_trans = pd.read_csv(
    DATA_PATH + "transactions_train.csv",
    usecols=["customer_id", "article_id", "t_dat"]
)
df_articles = pd.read_csv(
    DATA_PATH + "articles.csv", 
    usecols=['article_id', 'product_type_no', 'product_group_name', 'department_no']
)
df_customers = pd.read_csv(
    DATA_PATH + "customers.csv", 
    usecols=['customer_id', 'age', 'club_member_status']
    )

1. Loading Data...


In [6]:
# --- 2. Feature Engineer ---

print("2. Feature Engineer...")

#encode id (for tree model)
all_users = df_customers["customer_id"].unique()
all_items = df_articles["article_id"].unique()

# user → index
user_to_idx = {u: i for i, u in enumerate(all_users)}
idx_to_user = {i: u for u, i in user_to_idx.items()}

# item → index
item_to_idx = {a: i for i, a in enumerate(all_items)}
idx_to_item = {i: a for a, i in item_to_idx.items()}

all_user_id = pd.Series(all_users).map(user_to_idx)
all_item_id = pd.Series(all_items).map(item_to_idx)

df_customers["customer_id"] = df_customers["customer_id"].map(user_to_idx)
df_articles["article_id"] = df_articles["article_id"].map(item_to_idx)
df_trans["article_id"] = df_trans["article_id"].map(item_to_idx)
df_trans["customer_id"] = df_trans["customer_id"].map(user_to_idx)


2. Feature Engineer...


In [7]:
# --- 3. Data prepare ---

print("3. Data prepare...")

#create dataset with positive sample
df_pos_data = pd.DataFrame()
df_pos_data["customer_id"] = df_trans["customer_id"]
df_pos_data["article_id"] = df_trans["article_id"]
df_pos_data["label"] = 1

#create dataset with negative sample

rep_users = np.repeat(all_user_id, NEG_PER_USER)
sampled_items = np.random.choice(all_item_id, size=len(rep_users), replace=True)

df_neg_data= pd.DataFrame()
df_neg_data["customer_id"] = rep_users
df_neg_data["article_id"] = sampled_items
df_neg_data["label"] = 0

#merge pos and neg dataset
df_all = pd.concat([df_pos_data, df_neg_data], ignore_index=True)

X = df_all[["customer_id", "article_id"]]
y = df_all["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    shuffle=True,          
    random_state=42        # for reproducibility
)


3. Data prepare...


In [9]:
# --- 4. Apply the tree model ---

print("4. Apply the tree model...")

train_data = lgb.Dataset(X_train, y_train)
valid_data = lgb.Dataset(X_test, y_test)

params = {
    "objective": "regression",
    "metric": "rmse",        # root mean squared error
    "learning_rate": 0.05,
    "num_leaves": 31,
    "min_data_in_leaf": 20,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "verbosity": -1,
}

model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    valid_names=["train", "valid"],
    num_boost_round=500
)

y_pred = model.predict(X_test)

print("Sample predictions:", y_pred[:5])

4. Apply the tree model...
Sample predictions: [0.85033614 0.8929093  0.69820417 0.82246548 0.8165944 ]


In [None]:
# --- 5. make the submission ---

print("5. make the submission...")

def recommend_for_users_batch(user_idx_batch, model, num_items, topk=12):
    batch_size = len(user_idx_batch)

    user_column = np.repeat(user_idx_batch, num_items)      #Bxnum_items
    item_column = np.tile(np.arange(num_items), batch_size) #Bxnum_items

    df_tmp = pd.DataFrame({
        "user_idx": user_column,
        "item_idx": item_column
    })

    #predict all at once
    scores = model.predict(df_tmp)
    scores = scores.reshape(batch_size, num_items)

    #get top k items
    topk_idx = np.argpartition(-scores, topk, axis=1)[:, :topk]

    return topk_idx

batch_size = 200
num_all_tiems = len(all_items)
# Precompute item lookup array (string type)
item_lookup = np.array([idx_to_item[i] for i in range(num_all_items)], dtype=str)
submission_rows = []

for i in tqdm(range(0, len(all_user_id), batch_size)):
    batch_users = all_user_id[i:i+batch_size]
    batch_size_actual = len(batch_users)

    #predict
    batch_top_items = recommend_for_users_batch(batch_users, model, num_all_tiems, topk=12)
    batch_top_items = np.array(batch_top_items) # B x topk

    decoded_items = item_lookup[batch_top_items] # B x topk

    pred_strs = [" ".join(row) for row in decoded_items]

    user_ids = [idx_to_user[u] for u in batch_users]

    submission_rows.extend(zip(user_ids, pred_strs))

submission = pd.DataFrame(submission_rows, columns=["customer_id", "prediction"])
submission.to_csv("lgb_submission.csv", index=False)



5. make the submission...


  0%|          | 3/6860 [03:26<131:27:52, 69.02s/it]

In [15]:
str(pred_articles)

'[np.int64(108775015), np.int64(399223034), np.int64(399223004), np.int64(399223015), np.int64(399136011), np.int64(399223025), np.int64(399223026), np.int64(399223028), np.int64(399223029), np.int64(399223030), np.int64(399223032), np.int64(399201045)]'

In [14]:
all_items

array([108775015, 108775044, 108775051, ..., 956217002, 957375001,
       959461001])