In [25]:
import sys
import os
import pandas as pd
import numpy as np
from dateutil.relativedelta import relativedelta
from sklearn.preprocessing import LabelEncoder

# RecSys Challenge 2015のデータセット

In [2]:
train_df = pd.read_csv("~/work/dataset/RC15/yoochoose-clicks.dat", header=None)
test_df = pd.read_csv("~/work/dataset/RC15/yoochoose-test.dat", header=None)

  train_df = pd.read_csv("/home/inoue/work/dataset/RC15/yoochoose-clicks.dat", header=None)
  test_df = pd.read_csv("/home/inoue/work/dataset/RC15/yoochoose-test.dat", header=None)


In [3]:
train_df.columns = ["sessionId", "timestamp", "itemId", "categoryId"]
test_df.columns = ["sessionId", "timestamp", "itemId", "categoryId"]

In [4]:
train_df["timestamp"] = pd.to_datetime(train_df["timestamp"], format="%Y-%m-%dT%H:%M:%S.%fZ")
test_df["timestamp"] = pd.to_datetime(test_df["timestamp"], format="%Y-%m-%dT%H:%M:%S.%fZ")

In [5]:
# アイテム数が4より小さいものを削除
remove_items = train_df.groupby("itemId").size()
remove_items = remove_items[remove_items > 4].index
train_df = train_df[train_df["itemId"].isin(remove_items)]

In [6]:
# sessionの長さが1のものを消去
train_session_len = train_df.groupby("sessionId").size()
train_df = train_df[train_df["sessionId"].isin(train_session_len[train_session_len > 1].index)]

In [7]:
train_df = train_df.sort_values("timestamp")
train_df

Unnamed: 0,sessionId,timestamp,itemId,categoryId
1031562,351646,2014-04-01 03:00:00.124,214717005,0
1163646,389654,2014-04-01 03:00:00.567,214826705,0
885068,263073,2014-04-01 03:00:10.087,214716982,0
620662,210798,2014-04-01 03:00:13.070,214581827,0
1217521,375257,2014-04-01 03:00:13.768,214644307,0
...,...,...,...,...
32350125,11528554,2014-09-30 02:59:43.577,214572433,S
32653390,11422848,2014-09-30 02:59:47.397,214859908,S
32432376,11474968,2014-09-30 02:59:49.546,214685049,3
32432377,11474968,2014-09-30 02:59:53.315,214685049,3


In [8]:
# テストデータからトレインデータに含まれないアイテムを除外
test_df = test_df[test_df["itemId"].isin(train_df["itemId"].unique())]

In [9]:
# テストデータからセッション数が1のものを削除
test_session_len = test_df.groupby("sessionId").size()
test_df = test_df[test_df.sessionId.isin(test_session_len[test_session_len > 1].index)]

In [10]:
df = pd.concat([train_df, test_df], axis=0)

In [11]:
df

Unnamed: 0,sessionId,timestamp,itemId,categoryId
1031562,351646,2014-04-01 03:00:00.124,214717005,0
1163646,389654,2014-04-01 03:00:00.567,214826705,0
885068,263073,2014-04-01 03:00:10.087,214716982,0
620662,210798,2014-04-01 03:00:13.070,214581827,0
1217521,375257,2014-04-01 03:00:13.768,214644307,0
...,...,...,...,...
8251786,11299820,2014-09-25 08:17:19.053,214853094,3
8251787,11299815,2014-09-26 09:49:23.602,214854804,S
8251788,11299815,2014-09-26 09:49:41.808,214714715,S
8251789,11299810,2014-09-26 16:00:32.051,214546123,2


In [12]:
item_encoder = LabelEncoder()
df["itemId"] = item_encoder.fit_transform(df["itemId"])
train_df["itemId"] = item_encoder.transform(train_df["itemId"])
test_df["itemId"] = item_encoder.transform(test_df["itemId"])

In [13]:
session_encoder = LabelEncoder()
df["sessionId"] = session_encoder.fit_transform(df["sessionId"])
train_df["sessionId"] = session_encoder.transform(train_df["sessionId"])
test_df["sessionId"] = session_encoder.transform(test_df["sessionId"])

In [14]:
train_df.to_pickle("~/work/dataset/RC15/derived/train.df")
test_df.to_pickle("~/work/dataset/RC15/derived/test.df")

# ML-100kの前処理

In [15]:
df = pd.read_csv("~/work/dataset/ml-100k/rating.csv")
df

Unnamed: 0,userId,itemId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [16]:
df = df.sort_values("timestamp")
df

Unnamed: 0,userId,itemId,rating,timestamp
214,259,255,4,874724710
83965,259,286,4,874724727
43027,259,298,4,874724754
21396,259,185,4,874724781
82655,259,173,4,874724843
...,...,...,...,...
46773,729,689,4,893286638
73008,729,313,3,893286638
46574,729,328,3,893286638
64312,729,748,4,893286638


In [17]:
train, test = df.iloc[:int(len(df)*0.8)], df.iloc[int(len(df) * 0.8):]
train, test

(       userId  itemId  rating  timestamp
 214       259     255       4  874724710
 83965     259     286       4  874724727
 43027     259     298       4  874724754
 21396     259     185       4  874724781
 82655     259     173       4  874724843
 ...       ...     ...     ...        ...
 41910       3     271       3  889237224
 1343        3     245       1  889237247
 27975       3     355       3  889237247
 10869       3     324       2  889237247
 18651       3     349       3  889237269
 
 [80000 rows x 4 columns],
        userId  itemId  rating  timestamp
 3758        3     323       2  889237269
 38670       3     322       3  889237269
 1257        3     335       1  889237269
 18385       3     264       2  889237297
 53950       3     325       1  889237297
 ...       ...     ...     ...        ...
 46773     729     689       4  893286638
 73008     729     313       3  893286638
 46574     729     328       3  893286638
 64312     729     748       4  893286638
 7920

In [18]:
train.to_pickle("~/work/dataset/ml-100k/train.df")
test.to_pickle("~/work/dataset/ml-100k/test.df")

# Diginetica データセット

In [20]:
df = pd.read_csv("~/work/dataset/diginetica2/train-item-views.csv", sep=";")
df

Unnamed: 0,sessionId,userId,itemId,timeframe,eventdate
0,1,,81766,526309,2016-05-09
1,1,,31331,1031018,2016-05-09
2,1,,32118,243569,2016-05-09
3,1,,9654,75848,2016-05-09
4,1,,32627,1112408,2016-05-09
...,...,...,...,...,...
1235375,600684,,42906,632853,2016-04-14
1235376,600684,,33312,643522,2016-04-14
1235377,600684,,33312,52621,2016-04-14
1235378,600684,,5227,575276,2016-04-14


In [22]:
df["eventdate"] = pd.to_datetime(df["eventdate"], format="%Y-%m-%d")

In [24]:
df.nunique()

sessionId    310324
userId        87934
itemId       122993
timeframe    644845
eventdate       152
dtype: int64

In [51]:
# 長さが1のセッションを削除
session_size = df.groupby("sessionId")["itemId"].size()
remain_sess_ids = session_size[session_size > 1].index
remain_sess_ids

Int64Index([     1,      2,      5,      7,     10,     12,     13,     14,
                15,     17,
            ...
            600639, 600645, 600651, 600661, 600672, 600674, 600680, 600681,
            600683, 600684],
           dtype='int64', name='sessionId', length=204061)

In [52]:
df = df[df["sessionId"].isin(remain_sess_ids)]

In [53]:
# 出現回数が5回未満のアイテムを削除
item_size = df.groupby("itemId").size()
remain_item_ids = item_size[item_size >4].index
remain_item_ids

Int64Index([     2,      3,      6,      7,      9,     11,     12,     13,
                15,     16,
            ...
            487028, 487598, 490601, 506446, 512246, 512539, 518582, 539560,
            568637, 707327],
           dtype='int64', name='itemId', length=42171)

In [54]:
df = df[df["itemId"].isin(remain_item_ids)]
df

Unnamed: 0,sessionId,userId,itemId,timeframe,eventdate
0,1,,81766,526309,2016-05-09
1,1,,31331,1031018,2016-05-09
2,1,,32118,243569,2016-05-09
3,1,,9654,75848,2016-05-09
4,1,,32627,1112408,2016-05-09
...,...,...,...,...,...
1235374,600683,,120370,248101,2016-03-25
1235375,600684,,42906,632853,2016-04-14
1235376,600684,,33312,643522,2016-04-14
1235377,600684,,33312,52621,2016-04-14


In [55]:
session_encoder = LabelEncoder()
item_encoder = LabelEncoder()
df["sessionId"] = session_encoder.fit_transform(df["sessionId"])
df["itemId"] = item_encoder.fit_transform(df["itemId"])

In [61]:
df = df.sort_values(["eventdate", "timeframe", "sessionId"]).reset_index().drop("index", axis=1)
df

Unnamed: 0,sessionId,userId,itemId,timeframe,eventdate
0,131087,122855.0,5005,6492,2016-01-01
1,2293,2080.0,702,11262,2016-01-01
2,59944,51747.0,15183,14817,2016-01-01
3,2295,2082.0,5005,16851,2016-01-01
4,2296,2086.0,4080,32029,2016-01-01
...,...,...,...,...,...
989199,98246,,10930,1197356,2016-06-01
989200,37230,,12782,1197542,2016-06-01
989201,19892,,7412,1197743,2016-06-01
989202,34655,,15153,1198404,2016-06-01


In [136]:
df = df.drop_duplicates(["sessionId", "itemId"])

In [137]:
sess_item_df = df.groupby("sessionId")["itemId"].unique()

In [138]:
sess_item_df

sessionId
0         [5023, 14045, 13632, 6282, 14915, 15533, 25084...
1         [27493, 15212, 14013, 15603, 13911, 5496, 1556...
2                                             [2501, 15143]
3                 [16777, 20618, 23447, 13907, 4897, 24512]
4                                             [2483, 22680]
                                ...                        
204056                                  [27828, 589, 28899]
204057                                 [15592, 5326, 13143]
204058                                  [4698, 14050, 4399]
204059               [5474, 5145, 5473, 33647, 28635, 1500]
204060                                 [14172, 2539, 17180]
Name: itemId, Length: 204061, dtype: object

In [139]:
sess_sizes = sess_item_df.apply(len)
sess_sizes[sess_sizes < 5]

sessionId
2         2
4         2
5         2
6         3
7         2
         ..
204055    2
204056    3
204057    3
204058    3
204060    3
Name: itemId, Length: 142329, dtype: int64

In [140]:
sess_item_df = sess_item_df.reset_index()
sess_item_df

Unnamed: 0,sessionId,itemId
0,0,"[5023, 14045, 13632, 6282, 14915, 15533, 25084..."
1,1,"[27493, 15212, 14013, 15603, 13911, 5496, 1556..."
2,2,"[2501, 15143]"
3,3,"[16777, 20618, 23447, 13907, 4897, 24512]"
4,4,"[2483, 22680]"
...,...,...
204056,204056,"[27828, 589, 28899]"
204057,204057,"[15592, 5326, 13143]"
204058,204058,"[4698, 14050, 4399]"
204059,204059,"[5474, 5145, 5473, 33647, 28635, 1500]"


In [141]:
train = sess_item_df["itemId"].apply(lambda x: x[:int(len(x)*0.6)] if len(x)>=5 else x)

In [142]:
test = sess_item_df["itemId"].apply(lambda x: x[int(len(x)*0.6):] if len(x)>=5 else np.nan).dropna()

In [143]:
test = test.reset_index().rename({"index":"sessionId"}, axis=1)

In [144]:
train = train.reset_index().rename({"index":"sessionId"}, axis=1)

In [145]:
train = np.concatenate(train[["sessionId", "itemId"]].apply(
    lambda x: np.vstack([np.repeat(x[0], len(x[1])), np.array(x[1])]).T, axis=1
    ).tolist())

In [146]:
train = pd.DataFrame(train, columns=["sessionId", "itemId"])
train

Unnamed: 0,sessionId,itemId
0,0,5023
1,0,14045
2,0,13632
3,0,6282
4,0,14915
...,...,...
626040,204059,5145
626041,204059,5473
626042,204060,14172
626043,204060,2539


In [147]:
train = pd.merge(train, df, on=["sessionId", "itemId"], how="left")
train

Unnamed: 0,sessionId,itemId,userId,timeframe,eventdate
0,0,5023,,75848,2016-05-09
1,0,14045,,173912,2016-05-09
2,0,13632,,243569,2016-05-09
3,0,6282,,329870,2016-05-09
4,0,14915,,390072,2016-05-09
...,...,...,...,...,...
626040,204059,5145,,29055,2016-03-25
626041,204059,5473,,86624,2016-03-25
626042,204060,14172,,52621,2016-04-14
626043,204060,2539,,575276,2016-04-14


In [148]:
test = np.concatenate(test[["sessionId", "itemId"]].apply(
    lambda x: np.vstack([np.repeat(x[0], len(x[1])), np.array(x[1])]).T, axis=1
).tolist())
test = pd.DataFrame(test, columns=["sessionId", "itemId"])
test

In [149]:
test = pd.merge(test, df, on=["sessionId", "itemId"], how="left")
test

Unnamed: 0,sessionId,itemId,userId,timeframe,eventdate
0,0,15533,,487369,2016-05-09
1,0,25084,,526309,2016-05-09
2,0,13265,,1031018,2016-05-09
3,0,13859,,1112408,2016-05-09
4,1,13911,,359839,2016-05-09
...,...,...,...,...,...
205333,204028,5076,,719376,2016-03-31
205334,204028,16406,,782506,2016-03-31
205335,204059,33647,,117267,2016-03-25
205336,204059,28635,,248101,2016-03-25


In [150]:
train.to_pickle("~/work/dataset/diginetica2/derived/train.df")
test.to_pickle("~/work/dataset/diginetica2/derived/test.df")