# レコメンドシステム作成

必要に応じて以下を実行(既にtqdmをインストール済みなら不要)

In [None]:
!pip install tqdm

### モジュールのインポート

In [1]:
# coding: UTF-8
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import lightgbm as lgb
from collections import defaultdict
from tqdm.notebook import trange

以下のパス名やファイル名のみ適宜変更する

In [2]:
# Path of the dataset file to train on
dataset_path = "./train/train_D.tsv"
# Path of test.tsv
test_path = "./test.tsv"
# Name of the file the results are saved to
save_file_path = "resultsD.tsv"

### モデル

In [3]:
# Create the LGBMRanker model (constructed with its default arguments)
model = lgb.LGBMRanker()

### データセットを整形する関数

In [4]:
"""
データセットをtrain_X/train_y/valid_X/valid_yの4つに分ける関数(4等分)
"""
def separate_dataset(dataset_path):
    data = pd.read_csv(dataset_path, sep="\t")
    data["time_stamp"] = pd.to_datetime(data["time_stamp"])
    
    users = defaultdict(int) #データセットのそれぞれのuser数を保持
    for user in data.loc[:,"user_id"]:
        users[user] += 1
    
    total = data.shape[0] #対象のデータセットの総数
    list_train_X, list_train_y = [], []
    list_val_X, list_val_y = [], []
    cnt_users = defaultdict(int)
    
    print("Separating dataset is going to start.")
    #4/10 -> train_X, 8/10 -> train_y, 9/10 -> valid_X, 10/10 -> valid_y
    for idx in trange(total):
        try:
            info = data.iloc[idx,:]
            user_name = info.loc["user_id"]
            if cnt_users[user_name] < (users[user_name]/10)*4:
                list_train_X.append(info)
            elif cnt_users[user_name] < (users[user_name]/10)*8:
                list_train_y.append(info)
            elif cnt_users[user_name] < (users[user_name]/10)*9:
                list_val_X.append(info)
            else:
                list_val_y.append(info)
            cnt_users[user_name] += 1
        except KeyError:
            continue
        except IndexError:
            continue
            
    #作成したDataFrameのcolumnsとrowsを転置
    data_train_X = pd.concat(list_train_X, axis=1).T
    data_train_y= pd.concat(list_train_y, axis=1).T
    data_val_X = pd.concat(list_val_X, axis=1).T
    data_val_y= pd.concat(list_val_y, axis=1).T
    
    return data_train_X, data_train_y, data_val_X, data_val_y

"""
user_idに関するデータセットの整形
"""
def make_user_id(dataset_X, flag):
    u_p = dataset_X.groupby("user_id").apply(lambda x: len(x["product_id"].unique()))
    u_d = dataset_X.groupby("user_id").apply(lambda x: len(x["time_stamp"].apply(lambda x: x.date()).unique()))
    u_pv = dataset_X.groupby("user_id").apply(lambda x: (x["event_type"] == 1).sum())
    u = pd.concat([u_p, u_d, u_pv], axis=1) #横方向に結合
    if flag == "val":
        u.columns = ["u_p_val", "u_d_val", "u_pv_val"]
    else:
        u.columns = ["u_p", "u_d", "u_pv"]
    
    return u
    
"""
product_idに関するデータセットの整形
"""
def make_product_id(dataset_X, flag):
    p_u = dataset_X.groupby("product_id").apply(lambda x: len(x["user_id"].unique()))
    """event_type
    0 -> カートに入れた
    1 -> 閲覧した
    2 -> クリックした
    3 -> 購入した
    """
    p_ca = dataset_X.groupby("product_id").apply(lambda x: (x["event_type"] == 0).sum())
    p_pv = dataset_X.groupby("product_id").apply(lambda x: (x["event_type"] == 1).sum())
    p_cl = dataset_X.groupby("product_id").apply(lambda x: (x["event_type"] == 2).sum())
    p_cv = dataset_X.groupby("product_id").apply(lambda x: (x["event_type"] == 3).sum())
    p = pd.concat([p_u, p_ca, p_pv, p_cl, p_cv], axis=1) #横方向に結合
    if flag == "val":
        p.columns = ['p_u_val', 'p_pv_val', 'p_ca_val', 'p_cl_val', 'p_cv_val']
    else:
        p.columns = ['p_u', 'p_pv', 'p_ca', 'p_cl', 'p_cv']
    
    return p
    
"""
user_idとproduct_idに関するデータセットの整形
"""
def make_user_product_id(dataset_X, flag):
    u_p_ca = dataset_X.groupby(["user_id", "product_id"]).apply(lambda x: (x["event_type"] == 0).sum())
    u_p_pv = dataset_X.groupby(["user_id","product_id"]).apply(lambda x: (x["event_type"] == 1).sum())
    u_p_cl = dataset_X.groupby(["user_id","product_id"]).apply(lambda x: (x["event_type"] == 2).sum())
    u_p_cv = dataset_X.groupby(["user_id","product_id"]).apply(lambda x: (x["event_type"] == 3).sum())
    u_p = pd.concat([u_p_pv, u_p_ca, u_p_cl, u_p_cv], axis=1) #横方向に結合
    if flag == "val":
        u_p.columns = ['u_p_pv_val', 'u_p_ca_val', 'u_p_cl_val', 'u_p_cv_val']
    else:
        u_p.columns = ['u_p_pv', 'u_p_ca', 'u_p_cl', 'u_p_cv']
    
    return u_p

"""
整形したデータセットを結合
"""
def let_merged(u, p, u_p):
    u = u.reset_index()
    p = p.reset_index()
    u_p = u_p.reset_index()

    merged = pd.merge(u_p, u, on = "user_id", how="inner")
    merged = pd.merge(merged, p, on="product_id", how="inner")
    
    return merged

"""
それぞれのuserの行動価値を算出
"""
def add_relation(dataset_y, merged):
    rel = dataset_y.groupby(["user_id","product_id"]).apply(lambda x: max(x["event_type"]) + 1)
    rel = rel.reset_index()
    rel = rel.rename(columns={0: 'y'})
    user_x = set(merged['user_id'])
    user_y = set(rel['user_id'])
    user_xy = user_x.intersection(user_y) #共通のユーザー
    merged = merged[merged["user_id"].isin(user_xy)] #特徴量データの抽出
    rel = pd.merge(merged[['user_id', 'product_id']], rel) #目的変数データの抽出
    alls = pd.merge(merged, rel, on=["user_id","product_id"], how="outer")
    alls = alls.fillna(0) #欠損値を0で埋める
    alls["y"] = alls["y"].astype(int)
    
    return alls

"""
クエリリストと整形が完了した最終的なデータセットの作成
"""
def make_query_list(alls, flag):
    query_list = alls["user_id"].value_counts()
    alls = alls.set_index(['user_id', 'product_id'])
    query_list = query_list.sort_index()
    alls = alls.sort_index()
    if flag == "val":
        X = alls[['p_u_val', 'p_pv_val', 'p_ca_val', 'p_cl_val', 'p_cv_val', 'u_p_val', 'u_d_val', 'u_pv_val', 'u_p_pv_val', 'u_p_ca_val', 'u_p_cl_val', 'u_p_cv_val']]
    else:
        X = alls[['p_u', 'p_pv', 'p_ca', 'p_cl', 'p_cv', 'u_p', 'u_d', 'u_pv', 'u_p_pv', 'u_p_ca', 'u_p_cl', 'u_p_cv']]
    y = alls["y"]
    
    return query_list, X, y

In [5]:
# Load the dataset and split it per user into the four slices:
# train features / train targets / validation features / validation targets.
data_train_X, data_train_y, data_val_X, data_val_y = separate_dataset(dataset_path)

Separating dataset is going to start.


  0%|          | 0/3572842 [00:00<?, ?it/s]

### 学習データセットの作成

In [6]:
# Features are aggregated from data_train_X only.
u = make_user_id(data_train_X, flag="train")  # per-user features
p = make_product_id(data_train_X, flag="train")  # per-product features
u_p = make_user_product_id(data_train_X, flag="train")  # per-(user, product) features
merged = let_merged(u, p, u_p)  # one row per (user, product) pair
# Labels are derived from data_train_y.
train_all = add_relation(data_train_y, merged)
# Group sizes, feature matrix and labels in the form passed to model.fit.
query_list, X_train, y_train = make_query_list(train_all, flag="train")

### 検証データセットの作成

In [7]:
# Same pipeline as for the training set; flag="val" gives every feature
# column a "_val" suffix. Features are aggregated from data_val_X only.
u_val = make_user_id(data_val_X, flag="val")  # per-user features
p_val = make_product_id(data_val_X, flag="val")  # per-product features
u_p_val = make_user_product_id(data_val_X, flag="val")  # per-(user, product) features
merged_val = let_merged(u_val, p_val, u_p_val)  # one row per (user, product) pair
# Labels are derived from data_val_y.
val_all = add_relation(data_val_y, merged_val)
# Group sizes, feature matrix and labels used as the evaluation set.
query_list_val, X_val, y_val = make_query_list(val_all, flag="val")

### 学習

In [8]:
# Train the ranker; each user is one query group, and NDCG@1..10 is reported
# on the validation set after every boosting round.
model.fit(
    X_train,
    y_train,
    group=query_list,
    eval_set=[(X_val, y_val)],
    eval_group=[list(query_list_val)],
    eval_at=tuple(range(1, 11)),
)

[1]	valid_0's ndcg@1: 0.908514	valid_0's ndcg@2: 0.94477	valid_0's ndcg@3: 0.953397	valid_0's ndcg@4: 0.956399	valid_0's ndcg@5: 0.957803	valid_0's ndcg@6: 0.958583	valid_0's ndcg@7: 0.958955	valid_0's ndcg@8: 0.95917	valid_0's ndcg@9: 0.959325	valid_0's ndcg@10: 0.959445
[2]	valid_0's ndcg@1: 0.908376	valid_0's ndcg@2: 0.944696	valid_0's ndcg@3: 0.953374	valid_0's ndcg@4: 0.956403	valid_0's ndcg@5: 0.957814	valid_0's ndcg@6: 0.958542	valid_0's ndcg@7: 0.95892	valid_0's ndcg@8: 0.959135	valid_0's ndcg@9: 0.95929	valid_0's ndcg@10: 0.959404
[3]	valid_0's ndcg@1: 0.908293	valid_0's ndcg@2: 0.94463	valid_0's ndcg@3: 0.953322	valid_0's ndcg@4: 0.956399	valid_0's ndcg@5: 0.957781	valid_0's ndcg@6: 0.958506	valid_0's ndcg@7: 0.958893	valid_0's ndcg@8: 0.959099	valid_0's ndcg@9: 0.95926	valid_0's ndcg@10: 0.959369
[4]	valid_0's ndcg@1: 0.908238	valid_0's ndcg@2: 0.944587	valid_0's ndcg@3: 0.953371	valid_0's ndcg@4: 0.956404	valid_0's ndcg@5: 0.957843	valid_0's ndcg@6: 0.958604	valid_0's ndcg@

LGBMRanker()

In [None]:
# Read the users to predict for from test.tsv (no header row; the user ids
# are taken from the first column).
users = pd.read_csv(test_path, sep='\t', header=None)

# Accumulators for the long-format result table.
user_ids = []
product_ids = []
ranks = []

# De-duplicated user ids in sorted order.
user = sorted(set(users[0]))

# Users that have feature rows in X_val. Users outside this set cannot be
# scored and are skipped explicitly, instead of catching KeyError around the
# whole loop body (which could also hide unrelated errors).
scorable_users = set(X_val.index.get_level_values(0))

# Score and rank the candidate products of each user.
# The loop variable is deliberately not called `u`, so the per-user feature
# frame of that name is not overwritten.
for target_user in user:
    if target_user not in scorable_users:
        continue
    candidates = X_val.loc[target_user]  # rows of this user, indexed by product_id
    pred = model.predict(candidates)  # model scores, one per candidate row
    pred_products = pd.Series(pred, index=candidates.index)  # attach product ids to the scores
    products = pred_products.sort_values(ascending=False).index  # highest score first
    output = list(products)[:20]  # keep at most the top 20
    user_ids += [target_user] * len(output)  # repeat the user id once per prediction
    product_ids += output
    ranks += list(range(1, len(output) + 1))  # 1-based rank within the user

# Assemble the result table.
results = pd.DataFrame({'user_id': user_ids, 'product_id': product_ids, 'rank': ranks})

In [None]:
# Save the results as a TSV file without header row and without the index
# column (index=False is the documented way to omit the index).
results.to_csv(save_file_path, sep="\t", header=False, index=False)