In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import nltk
from nltk.tokenize import RegexpTokenizer

In [2]:
# Load rating file
df_ratings = pd.read_csv('./ratings.dat', sep='::', engine='python', names=['user_id', 'item_id', 'rating', 'time_stamp'])
df_ratings.head()

Unnamed: 0,user_id,item_id,rating,time_stamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
# Load user and item file, join with rating
df_user = pd.read_csv('./users.dat', sep='::', engine='python', names=['user_id', 'gender', 'age', 'occupation', 'zip-code'])
df_item = pd.read_csv('./movies.dat', sep='::', engine='python', names=['item_id', 'title', 'genres'])
df_ratings = df_ratings.join(df_user.set_index('user_id'), on='user_id')
df_ratings = df_ratings.join(df_item.set_index('item_id'), on='item_id')
df_ratings.head()

Unnamed: 0,user_id,item_id,rating,time_stamp,gender,age,occupation,zip-code,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),Musical|Romance
3,1,3408,4,978300275,F,1,10,48067,Erin Brockovich (2000),Drama
4,1,2355,5,978824291,F,1,10,48067,"Bug's Life, A (1998)",Animation|Children's|Comedy


In [4]:
# Preprocess rating and gender
df_ratings['rating'] = df_ratings['rating'].map(lambda x: 0 if x < 4 else 1)
df_ratings['gender'] = df_ratings['gender'].map(lambda x: 0 if x == 'F' else 1)

In [5]:
# # Get records and generate slate for each user, slate length is set as 20
# df_ratings = df_ratings.sort_values(by='time_stamp')
# user_ids = list(set(df_ratings['user_id']))
# user2records = df_ratings.groupby(['user_id'])
# df_groups = []
# for k, user_id in enumerate(user_ids):
#     df_group = user2records.get_group(user_id)
#     length = df_group.shape[0]
#     slate_id_c = []
#     for i in range(length):
#         df_slate = df_group[:i]
#         clk_seq = list(df_slate[df_slate['rating']==1]['item_id'])[-20:]
#         clk_seq += [0 for i in range(20 - len(clk_seq))]
#         slate_id_c.append(clk_seq)
#     print(user_id, end = '\r')
#     df_group.insert(df_group.shape[1], 'clk_seq', slate_id_c)
#     df_groups.append(df_group)
# df_ratings = pd.concat(df_groups)
# df_ratings.head()

In [6]:
df_ratings.tail()

Unnamed: 0,user_id,item_id,rating,time_stamp,gender,age,occupation,zip-code,title,genres
1000204,6040,1091,0,956716541,1,25,6,11106,Weekend at Bernie's (1989),Comedy
1000205,6040,1094,1,956704887,1,25,6,11106,"Crying Game, The (1992)",Drama|Romance|War
1000206,6040,562,1,956704746,1,25,6,11106,Welcome to the Dollhouse (1995),Comedy|Drama
1000207,6040,1096,1,956715648,1,25,6,11106,Sophie's Choice (1982),Drama
1000208,6040,1097,1,956715569,1,25,6,11106,E.T. the Extra-Terrestrial (1982),Children's|Drama|Fantasy|Sci-Fi


In [7]:
# Preprocess zip-code
zip_codes = list(set(df_ratings['zip-code']))
print("number of zip code: {}, number of users: {}".format(len(zip_codes), len(set(df_ratings['user_id']))))
zip_code2idx = {}
for idx, value in enumerate(zip_codes):
    zip_code2idx[value] = idx
df_ratings['zip-code'] = df_ratings['zip-code'].map(lambda x: zip_code2idx[x])

number of zip code: 3439, number of users: 6040


In [8]:
df_ratings.tail()

Unnamed: 0,user_id,item_id,rating,time_stamp,gender,age,occupation,zip-code,title,genres
1000204,6040,1091,0,956716541,1,25,6,2239,Weekend at Bernie's (1989),Comedy
1000205,6040,1094,1,956704887,1,25,6,2239,"Crying Game, The (1992)",Drama|Romance|War
1000206,6040,562,1,956704746,1,25,6,2239,Welcome to the Dollhouse (1995),Comedy|Drama
1000207,6040,1096,1,956715648,1,25,6,2239,Sophie's Choice (1982),Drama
1000208,6040,1097,1,956715569,1,25,6,2239,E.T. the Extra-Terrestrial (1982),Children's|Drama|Fantasy|Sci-Fi


In [9]:
# Preprocess year
titles = df_ratings['title']

years = [title[-5:-1] for title in titles]
year_set = list(set(years))
year_set.sort()
year2idx = {y: idx for idx, y in enumerate(year_set)}
years = [year2idx[y] for y in years]
df_ratings['year'] = years

In [10]:
del df_ratings['title']
df_ratings.tail()

Unnamed: 0,user_id,item_id,rating,time_stamp,gender,age,occupation,zip-code,genres,year
1000204,6040,1091,0,956716541,1,25,6,2239,Comedy,69
1000205,6040,1094,1,956704887,1,25,6,2239,Drama|Romance|War,72
1000206,6040,562,1,956704746,1,25,6,2239,Comedy|Drama,75
1000207,6040,1096,1,956715648,1,25,6,2239,Drama,62
1000208,6040,1097,1,956715569,1,25,6,2239,Children's|Drama|Fantasy|Sci-Fi,62


In [11]:
# Preprocess titles
tokenizer = RegexpTokenizer(r'\w+')
titles = df_ratings['title']
tokenized_titles = []
titles = [tokenizer.tokenize(str(title[:-6])) for title in list(titles)]
vocab = []
max_title_len = 0
for title in titles:
    vocab += title
    max_title_len = max_title_len if max_title_len > len(title) else len(title)
vocab = set(vocab)
word2idx = {word: idx for idx, word in enumerate(vocab)}
res_title = []
for title in titles:
    padding_title = [word2idx[word] for word in title]
    padding_title.extend([0 for i in range(max_title_len - len(title))])
    res_title.append(padding_title)
df_ratings['title'] = res_title
df_ratings.tail()

In [12]:
#Get records and generate slate for each user, slate length is set as 20
df_ratings = df_ratings.sort_values(by='time_stamp')
user_ids = list(set(df_ratings['user_id']))
user2records = df_ratings.groupby(['user_id'])
df_groups = []
for k, user_id in enumerate(user_ids):
    df_group = user2records.get_group(user_id)
    length = df_group.shape[0]
    slate_id_c = []
    for i in range(length):
        df_slate = df_group[:i]
        clk_seq = list(df_slate[df_slate['rating']==1]['item_id'])[-20:]
        clk_seq += [0 for i in range(20 - len(clk_seq))]
        slate_id_c.append(clk_seq)
    print(user_id, end = '\r')
    df_group.insert(df_group.shape[1], 'clk_seq', slate_id_c)
    df_groups.append(df_group)
df_ratings = pd.concat(df_groups)

6040

In [13]:
df_ratings.head(10)

Unnamed: 0,user_id,item_id,rating,time_stamp,gender,age,occupation,zip-code,genres,year,clk_seq
31,1,3186,1,978300019,0,1,10,324,Drama,79,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
27,1,1721,1,978300055,0,1,10,324,Drama|Romance,77,"[3186, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
22,1,1270,1,978300055,0,1,10,324,Comedy|Sci-Fi,65,"[3186, 1721, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
37,1,1022,1,978300055,0,1,10,324,Animation|Children's|Musical,30,"[3186, 1721, 1270, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
24,1,2340,0,978300103,0,1,10,324,Romance,78,"[3186, 1721, 1270, 1022, 0, 0, 0, 0, 0, 0, 0, ..."
36,1,1836,1,978300172,0,1,10,324,Drama,78,"[3186, 1721, 1270, 1022, 0, 0, 0, 0, 0, 0, 0, ..."
3,1,3408,1,978300275,0,1,10,324,Drama,80,"[3186, 1721, 1270, 1022, 1836, 0, 0, 0, 0, 0, ..."
47,1,1207,1,978300719,0,1,10,324,Drama,42,"[3186, 1721, 1270, 1022, 1836, 3408, 0, 0, 0, ..."
7,1,2804,1,978300719,0,1,10,324,Comedy|Drama,63,"[3186, 1721, 1270, 1022, 1836, 3408, 1207, 0, ..."
44,1,260,1,978300760,0,1,10,324,Action|Adventure|Fantasy|Sci-Fi,57,"[3186, 1721, 1270, 1022, 1836, 3408, 1207, 280..."


In [14]:
# Preprocess genre
genres = df_ratings['genres']
genre2idx = {}
cnt = 1
res_genre = []
max_genre_len = 0
for genre in list(genres):
    l = genre.split('|')
    max_genre_len = max_genre_len if max_genre_len > len(l) else len(l)
    for e in l:
        if e not in genre2idx:
            genre2idx[e] = cnt
            cnt += 1
    res_genre.append([genre2idx[e] for e in l])
for genre in res_genre:
    genre.extend([0 for i in range(max_genre_len - len(genre))])
df_ratings['genres'] = res_genre

In [15]:
df_ratings.tail()

Unnamed: 0,user_id,item_id,rating,time_stamp,gender,age,occupation,zip-code,genres,year,clk_seq
1000019,6040,2917,1,997454429,1,25,6,2239,"[11, 12, 0, 0, 0, 0]",61,"[535, 3751, 1077, 3168, 3182, 2575, 1212, 3362..."
999988,6040,1921,1,997454464,1,25,6,2239,"[4, 12, 0, 0, 0, 0]",78,"[3751, 1077, 3168, 3182, 2575, 1212, 3362, 125..."
1000172,6040,1784,0,997454464,1,25,6,2239,"[3, 1, 0, 0, 0, 0]",77,"[1077, 3168, 3182, 2575, 1212, 3362, 1258, 206..."
1000167,6040,161,0,997454486,1,25,6,2239,"[1, 12, 13, 0, 0, 0]",75,"[1077, 3168, 3182, 2575, 1212, 3362, 1258, 206..."
1000042,6040,1221,1,998315055,1,25,6,2239,"[8, 11, 1, 0, 0, 0]",54,"[1077, 3168, 3182, 2575, 1212, 3362, 1258, 206..."


In [16]:
# Reorder the columns 
orders = ['user_id', 'gender', 'age', 'occupation', 'zip-code', 'clk_seq', 'item_id', 'title', 'year', 'genres', 'time_stamp', 'rating','count']
df_ratings = df_ratings[orders]
description = [
    ('user_id', 1 + np.max(df_ratings['user_id']), 'spr'),
    ('gender', 1 + np.max(df_ratings['gender']), 'spr'),
    ('age', 1 + np.max(df_ratings['age']), 'spr'),
    ('occupation', 1 + np.max(df_ratings['occupation']), 'spr'),
    ('zip-code', 1 + np.max(df_ratings['zip-code']), 'spr'),
    ('clk_seq', 1 + np.max(list(df_ratings['clk_seq'])), 'seq'),
    ('item_id', 1 + np.max(df_ratings['item_id']), 'spr'),
    ('year', 1 + np.max(df_ratings['year']), 'spr'),
    ('title', 1 + np.max(list(df_ratings['title'])), 'seq'),
    ('genres', 1 + np.max(list(df_ratings['genres'])), 'seq'),
    ('time_stamp', -1, 'ctn'),
    ('rating', 2, 'label'),
    ('count', -1, 'ctn')
]
description

[('user_id', 6041, 'spr'),
 ('gender', 2, 'spr'),
 ('age', 57, 'spr'),
 ('occupation', 21, 'spr'),
 ('zip-code', 3439, 'spr'),
 ('clk_seq', 3953, 'seq'),
 ('item_id', 3953, 'spr'),
 ('year', 81, 'spr'),
 ('genres', 19, 'seq'),
 ('time_stamp', -1, 'ctn'),
 ('rating', 2, 'label'),
 ('count', -1, 'ctn')]

In [17]:
df_ratings.tail()

Unnamed: 0,user_id,gender,age,occupation,zip-code,clk_seq,item_id,year,genres,time_stamp,rating
1000019,6040,1,25,6,2239,"[535, 3751, 1077, 3168, 3182, 2575, 1212, 3362...",2917,61,"[11, 12, 0, 0, 0, 0]",997454429,1
999988,6040,1,25,6,2239,"[3751, 1077, 3168, 3182, 2575, 1212, 3362, 125...",1921,78,"[4, 12, 0, 0, 0, 0]",997454464,1
1000172,6040,1,25,6,2239,"[1077, 3168, 3182, 2575, 1212, 3362, 1258, 206...",1784,77,"[3, 1, 0, 0, 0, 0]",997454464,0
1000167,6040,1,25,6,2239,"[1077, 3168, 3182, 2575, 1212, 3362, 1258, 206...",161,75,"[1, 12, 13, 0, 0, 0]",997454486,0
1000042,6040,1,25,6,2239,"[1077, 3168, 3182, 2575, 1212, 3362, 1258, 206...",1221,54,"[8, 11, 1, 0, 0, 0]",998315055,1


In [18]:
# Preprocess timestamp
min_time_stamp = np.min(df_ratings['time_stamp'])
max_time_stamp = np.max(df_ratings['time_stamp'])

df_ratings['time_stamp'] = df_ratings['time_stamp'].map(lambda x: (x - min_time_stamp)/(max_time_stamp - min_time_stamp))
df_ratings = df_ratings.sort_values(by='time_stamp')
df_ratings.head()

Unnamed: 0,user_id,gender,age,occupation,zip-code,clk_seq,item_id,year,genres,time_stamp,rating
1000138,6040,1,25,6,2239,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",858,52,"[8, 11, 1, 0, 0, 0]",0.0,1
1000153,6040,1,25,6,2239,"[858, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",2384,78,"[6, 3, 0, 0, 0, 0]",2.451236e-07,1
999873,6040,1,25,6,2239,"[858, 2384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",593,71,"[1, 12, 0, 0, 0, 0]",2.451236e-07,1
1000007,6040,1,25,6,2239,"[858, 2384, 593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1961,68,"[1, 0, 0, 0, 0, 0]",5.013891e-07,1
1000192,6040,1,25,6,2239,"[858, 2384, 593, 1961, 0, 0, 0, 0, 0, 0, 0, 0,...",2019,34,"[8, 1, 0, 0, 0, 0]",5.013891e-07,1


In [19]:
# plot item counts vs item
user2count = df_ratings.groupby(['user_id']).size().reset_index(name='count').sort_values(by='count')
user_ids = list(user2count['user_id'])
counts = np.array(user2count['count'])
plt.plot(np.arange(len(counts)), counts)

[<matplotlib.lines.Line2D at 0x7faa26f33190>]

In [20]:
user2count.head(500)
df_ratings.tail()

Unnamed: 0,user_id,gender,age,occupation,zip-code,clk_seq,item_id,year,genres,time_stamp,rating
825793,4958,1,18,7,2909,"[3571, 3869, 1621, 2161, 2015, 471, 2003, 1485...",2399,65,"[9, 6, 10, 0, 0, 0]",0.999997,0
825438,4958,1,18,7,2909,"[3571, 3869, 1621, 2161, 2015, 471, 2003, 1485...",1407,76,"[14, 12, 0, 0, 0, 0]",0.999998,1
825724,4958,1,18,7,2909,"[3869, 1621, 2161, 2015, 471, 2003, 1485, 3786...",3264,72,"[3, 14, 0, 0, 0, 0]",1.0,1
825731,4958,1,18,7,2909,"[1621, 2161, 2015, 471, 2003, 1485, 3786, 2413...",2634,39,"[14, 0, 0, 0, 0, 0]",1.0,0
825603,4958,1,18,7,2909,"[1621, 2161, 2015, 471, 2003, 1485, 3786, 2413...",1924,38,"[14, 4, 0, 0, 0, 0]",1.0,1


In [21]:
# Preprocess count
df_ratings = df_ratings.join(user2count.set_index('user_id'), on='user_id')
min_count = np.min(df_ratings['count'])
max_count = np.max(df_ratings['count'])
df_ratings['count'] = df_ratings['count'].map(lambda x: (x - min_count)/(max_count - min_count))

In [22]:
df_ratings.tail(100)

Unnamed: 0,user_id,gender,age,occupation,zip-code,clk_seq,item_id,year,genres,time_stamp,rating,count
549925,3391,1,18,4,1537,"[2001, 2324, 2081, 1535, 3040, 52, 2058, 3502,...",2643,67,"[8, 9, 4, 0, 0, 0]",0.997157,0,0.521360
550541,3391,1,18,4,1537,"[2001, 2324, 2081, 1535, 3040, 52, 2058, 3502,...",2907,79,"[3, 0, 0, 0, 0, 0]",0.997157,0,0.521360
549720,3391,1,18,4,1537,"[2001, 2324, 2081, 1535, 3040, 52, 2058, 3502,...",1458,77,"[2, 0, 0, 0, 0, 0]",0.997160,0,0.521360
549712,3391,1,18,4,1537,"[2001, 2324, 2081, 1535, 3040, 52, 2058, 3502,...",2253,72,"[8, 3, 10, 0, 0, 0]",0.997160,0,0.521360
550739,3391,1,18,4,1537,"[2001, 2324, 2081, 1535, 3040, 52, 2058, 3502,...",550,74,"[3, 2, 0, 0, 0, 0]",0.997160,0,0.521360
...,...,...,...,...,...,...,...,...,...,...,...,...
825793,4958,1,18,7,2909,"[3571, 3869, 1621, 2161, 2015, 471, 2003, 1485...",2399,65,"[9, 6, 10, 0, 0, 0]",0.999997,0,0.187881
825438,4958,1,18,7,2909,"[3571, 3869, 1621, 2161, 2015, 471, 2003, 1485...",1407,76,"[14, 12, 0, 0, 0, 0]",0.999998,1,0.187881
825724,4958,1,18,7,2909,"[3869, 1621, 2161, 2015, 471, 2003, 1485, 3786...",3264,72,"[3, 14, 0, 0, 0, 0]",1.000000,1,0.187881
825731,4958,1,18,7,2909,"[1621, 2161, 2015, 471, 2003, 1485, 3786, 2413...",2634,39,"[14, 0, 0, 0, 0, 0]",1.000000,0,0.187881


In [23]:
def split_0():
    # split train, test, validation dataset
    user2count = df_ratings.groupby(['user_id']).size().reset_index(name='count').sort_values(by='count')
    user_ids = list(user2count['user_id'])
    counts = np.array(user2count['count'])

    userid_group = df_ratings.groupby('user_id')
    # split insufficient and sufficient items
    user_ids = user_ids[870:] # select items with more than 30 samples
    cold_users = user_ids[:int(len(user_ids) * 0.2)] # choose the 20% items at tail as cold items
    warm_users = user_ids[int(len(user_ids) * 0.2):] # choose the 80% items at head as warm items

    train_a = pd.DataFrame()
    train_b = pd.DataFrame()
    train_c = pd.DataFrame()
    val_a_cold, val_a_warm = pd.DataFrame(), pd.DataFrame()
    val_b_cold, val_b_warm = pd.DataFrame(), pd.DataFrame()
    val_c_cold, val_c_warm = pd.DataFrame(), pd.DataFrame()
    test_cold, test_warm = pd.DataFrame(), pd.DataFrame()

    for user_id in cold_users:
        df_samples = userid_group.get_group(user_id).sort_values(by='time_stamp')
        size = len(df_samples)
        train_a = train_a.append(df_samples.iloc[:(size * 8 // 30)], ignore_index=True)
        val_a_cold = val_a_cold.append(df_samples.iloc[(size * 8 // 30):(size * 9 // 30)], ignore_index=True)
        train_b = train_b.append(df_samples.iloc[(size * 9 // 30):(size * 17 // 30)], ignore_index=True)
        val_b_cold = val_b_cold.append(df_samples.iloc[(size * 17 // 30):(size * 18 // 30)], ignore_index=True)
        train_c = train_c.append(df_samples.iloc[(size * 18 // 30):(size * 26 // 30)], ignore_index=True)
        val_c_cold = val_c_cold.append(df_samples.iloc[(size * 26 // 30):(size * 27 // 30)], ignore_index=True)
        test_cold = test_cold.append(df_samples.iloc[(size * 27 // 30):], ignore_index=True)

    for user_id in warm_users:
        df_samples = userid_group.get_group(user_id).sort_values(by='time_stamp')
        size = len(df_samples)
        train_a = train_a.append(df_samples.iloc[:(size * 8 // 30)], ignore_index=True)
        val_a_warm = val_a_warm.append(df_samples.iloc[(size * 8 // 30):(size * 9 // 30)], ignore_index=True)
        train_b = train_b.append(df_samples.iloc[(size * 9 // 30):(size * 17 // 30)], ignore_index=True)
        val_b_warm = val_b_warm.append(df_samples.iloc[(size * 17 // 30):(size * 18 // 30)], ignore_index=True)
        train_c = train_c.append(df_samples.iloc[(size * 18 // 30):(size * 26 // 30)], ignore_index=True)
        val_c_warm = val_c_warm.append(df_samples.iloc[(size * 26 // 30):(size * 27 // 30)], ignore_index=True)
        test_warm = test_warm.append(df_samples.iloc[(size * 27 // 30):], ignore_index=True)
    
    print("cold item number: {}; warm item number: {}".format(len(cold_items), len(warm_items)))
    print("train_a size: ", len(train_a))
    print("train_b size: ", len(train_b))
    print("train_c size: ", len(train_c))
    print("val_a_cold size: ", len(val_a_cold))
    print("val_b_cold size: ", len(val_b_cold))
    print("val_c_cold size: ", len(val_c_cold))
    print("val_a_warm size: ", len(val_a_warm))
    print("val_b_warm size: ", len(val_b_warm))
    print("val_c_warm size: ", len(val_c_warm))
    print("test_cold size: ", len(test_cold))
    print("test_warm size: ", len(test_warm))

    save_dic = {
        'train_a': train_a,
        'train_b': train_b,
        'train_c': train_c,
        'val_a_cold': val_a_cold,
        'val_a_warm': val_a_warm,
        'val_b_cold': val_b_cold,
        'val_b_warm': val_b_warm,
        'val_c_cold': val_c_cold,
        'val_c_warm': val_c_warm,
        'test_cold': test_cold,
        'test_warm': test_warm,
        'params': {
            'sparse_feature_dims': sparse_feature_dims,
            'continuous_feature_num': continuous_feature_num,
            'seq_feature_vocab_size': [title_vocab_size, genre_vocab_size]
        }
    }
    with open('./cold_start/preprocess_ml-1M.pkl', 'bw+') as f:
        pickle.dump(save_dic, f)

    # Check the validation of data saved to file
    with open('./cold_start/preprocess_ml-1M.pkl', 'br+') as f:
        data = pickle.load(f)
    for key in data.keys():
        if key != 'params':
            assert data[key].equals(save_dic[key])
        else:
            assert data['params'] == save_dic['params']

In [24]:
df_ratings.tail(100)

Unnamed: 0,user_id,gender,age,occupation,zip-code,clk_seq,item_id,year,genres,time_stamp,rating,count
549925,3391,1,18,4,1537,"[2001, 2324, 2081, 1535, 3040, 52, 2058, 3502,...",2643,67,"[8, 9, 4, 0, 0, 0]",0.997157,0,0.521360
550541,3391,1,18,4,1537,"[2001, 2324, 2081, 1535, 3040, 52, 2058, 3502,...",2907,79,"[3, 0, 0, 0, 0, 0]",0.997157,0,0.521360
549720,3391,1,18,4,1537,"[2001, 2324, 2081, 1535, 3040, 52, 2058, 3502,...",1458,77,"[2, 0, 0, 0, 0, 0]",0.997160,0,0.521360
549712,3391,1,18,4,1537,"[2001, 2324, 2081, 1535, 3040, 52, 2058, 3502,...",2253,72,"[8, 3, 10, 0, 0, 0]",0.997160,0,0.521360
550739,3391,1,18,4,1537,"[2001, 2324, 2081, 1535, 3040, 52, 2058, 3502,...",550,74,"[3, 2, 0, 0, 0, 0]",0.997160,0,0.521360
...,...,...,...,...,...,...,...,...,...,...,...,...
825793,4958,1,18,7,2909,"[3571, 3869, 1621, 2161, 2015, 471, 2003, 1485...",2399,65,"[9, 6, 10, 0, 0, 0]",0.999997,0,0.187881
825438,4958,1,18,7,2909,"[3571, 3869, 1621, 2161, 2015, 471, 2003, 1485...",1407,76,"[14, 12, 0, 0, 0, 0]",0.999998,1,0.187881
825724,4958,1,18,7,2909,"[3869, 1621, 2161, 2015, 471, 2003, 1485, 3786...",3264,72,"[3, 14, 0, 0, 0, 0]",1.000000,1,0.187881
825731,4958,1,18,7,2909,"[1621, 2161, 2015, 471, 2003, 1485, 3786, 2413...",2634,39,"[14, 0, 0, 0, 0, 0]",1.000000,0,0.187881


In [25]:
# def split_1():
#     df_ratings = df_ratings.sample(frac=1).reset_index(drop=True)
#     train_df = df_ratings.iloc[:int(len(df_ratings)*0.8), :]
#     val_df = df_ratings.iloc[int(len(df_ratings)*0.8):int(len(df_ratings)*0.9), :]
#     test_df = df_ratings.iloc[int(len(df_ratings)*0.9):, :]
#     save_dic = {
#         'train': train_df,
#         'val': val_df,
#         'test': test_df,
#         'params': {
#             'sparse_feature_dims': sparse_feature_dims,
#             'continuous_feature_num': continuous_feature_num,
#             'seq_feature_vocab_size': [title_vocab_size, genre_vocab_size]
#         }
#     }
#     with open('./normal_preprocess_ml-1M.pkl', 'bw+') as f:
#         pickle.dump(save_dic, f)

In [26]:
def split_2(df_ratings, description, N = 200, K = 20):
    user2count = df_ratings.groupby(['user_id']).size().reset_index(name='count').sort_values(by='count')
    user_ids = list(user2count['user_id'])
    counts = np.array(user2count['count'])

    user_ids, counts = np.asarray(user_ids), np.asarray(counts)
    hot_user_ids = user_ids[counts > N]
    cold_user_ids = user_ids[np.logical_and(counts <= N, counts >= 3 * K)]
    user_group = df_ratings.groupby('user_id')
    train_base = pd.DataFrame()
    for user_id in hot_user_ids:
        df_hot = user_group.get_group(user_id).sort_values(by='time_stamp')
        train_base = train_base.append(df_hot, ignore_index=True)
    train_warm_a, train_warm_b, train_warm_c, test = pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
    for user_id in cold_user_ids:
        df_cold = user_group.get_group(user_id).sort_values(by='time_stamp')
        train_warm_a = train_warm_a.append(df_cold[: K], ignore_index=True)
        train_warm_b = train_warm_b.append(df_cold[K: 2*K], ignore_index=True)
        train_warm_c = train_warm_c.append(df_cold[2*K: 3*K], ignore_index=True)
        test = test.append(df_cold[3*K:], ignore_index=True)
    save_dic = {
        'train_base': train_base.sort_values('time_stamp'),
        'train_warm_a': train_warm_a.sort_values('time_stamp'),
        'train_warm_b': train_warm_b.sort_values('time_stamp'),
        'train_warm_c': train_warm_c.sort_values('time_stamp'),
        'test': test.sort_values('time_stamp'),
        'description': description
    }
    for name, df in save_dic.items():
        print("{} size: {}".format(name, len(df)))
    with open('./user_emb_warm_split_preprocess_ml-1M.pkl', 'bw+') as f:
        pickle.dump(save_dic, f)

In [27]:
df_ratings.head(5)

Unnamed: 0,user_id,gender,age,occupation,zip-code,clk_seq,item_id,year,genres,time_stamp,rating,count
1000138,6040,1,25,6,2239,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",858,52,"[8, 11, 1, 0, 0, 0]",0.0,1,0.13993
1000153,6040,1,25,6,2239,"[858, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",2384,78,"[6, 3, 0, 0, 0, 0]",2.451236e-07,1,0.13993
999873,6040,1,25,6,2239,"[858, 2384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",593,71,"[1, 12, 0, 0, 0, 0]",2.451236e-07,1,0.13993
1000007,6040,1,25,6,2239,"[858, 2384, 593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1961,68,"[1, 0, 0, 0, 0, 0]",5.013891e-07,1,0.13993
1000192,6040,1,25,6,2239,"[858, 2384, 593, 1961, 0, 0, 0, 0, 0, 0, 0, 0,...",2019,34,"[8, 1, 0, 0, 0, 0]",5.013891e-07,1,0.13993


In [28]:
split_2(df_ratings, description) # hot_item_ids : cold_items_ids = 8: 2, drop other; k

train_base size: 654781
train_warm_a size: 47200
train_warm_b size: 47200
train_warm_c size: 47200
test size: 127672
description size: 12


In [29]:
# Get training data for Meta-Embedding method
with open('./user_emb_warm_split_preprocess_ml-1M.pkl', 'rb+') as f:
    data = pickle.load(f)
df_base = data['train_base']
user2group = df_base.groupby('user_id')
train_a, train_b, train_c, train_d = pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
for user_id, df_group in user2group:
    l, e = df_group.shape[0], df_group.shape[0] // 4     
    train_a = train_a.append(df_group.iloc[0: e,], ignore_index=True)
    train_b = train_b.append(df_group.iloc[e: 2 * e, ], ignore_index=True)
    train_c = train_c.append(df_group.iloc[2 * e: 3 * e, ], ignore_index=True)
    train_d = train_d.append(df_group.iloc[3 * e: 4 * e, ], ignore_index=True) 
shuffle_idx = np.random.permutation(train_a.shape[0])
train_a = train_a.iloc[shuffle_idx]
train_b = train_b.iloc[shuffle_idx]
train_c = train_c.iloc[shuffle_idx]
train_d = train_d.iloc[shuffle_idx]
data["metaE_a"] = train_a
data["metaE_b"] = train_b
data["metaE_c"] = train_c
data["metaE_d"] = train_d
with open('./movielens1M_data_user.pkl', 'wb+') as f:
    pickle.dump(data, f)

In [30]:
counts.sum()

1000209

In [31]:
for i in range(1, len(counts) + 1):
    if np.sum(counts[-i:]) > 500000:
        print(i)
        print(counts[-i])
        break

949
302


In [32]:
counts

array([  20,   20,   20, ..., 1743, 1850, 2314])

In [33]:
df_ratings.head()

Unnamed: 0,user_id,gender,age,occupation,zip-code,clk_seq,item_id,year,genres,time_stamp,rating,count
1000138,6040,1,25,6,2239,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",858,52,"[8, 11, 1, 0, 0, 0]",0.0,1,0.13993
1000153,6040,1,25,6,2239,"[858, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",2384,78,"[6, 3, 0, 0, 0, 0]",2.451236e-07,1,0.13993
999873,6040,1,25,6,2239,"[858, 2384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",593,71,"[1, 12, 0, 0, 0, 0]",2.451236e-07,1,0.13993
1000007,6040,1,25,6,2239,"[858, 2384, 593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1961,68,"[1, 0, 0, 0, 0, 0]",5.013891e-07,1,0.13993
1000192,6040,1,25,6,2239,"[858, 2384, 593, 1961, 0, 0, 0, 0, 0, 0, 0, 0,...",2019,34,"[8, 1, 0, 0, 0, 0]",5.013891e-07,1,0.13993


In [34]:
description

[('user_id', 6041, 'spr'),
 ('gender', 2, 'spr'),
 ('age', 57, 'spr'),
 ('occupation', 21, 'spr'),
 ('zip-code', 3439, 'spr'),
 ('clk_seq', 3953, 'seq'),
 ('item_id', 3953, 'spr'),
 ('year', 81, 'spr'),
 ('genres', 19, 'seq'),
 ('time_stamp', -1, 'ctn'),
 ('rating', 2, 'label'),
 ('count', -1, 'ctn')]

In [35]:
user2count.head(20)

Unnamed: 0,user_id,count
946,947,20
4067,4068,20
2529,2530,20
340,341,20
5257,5258,20
4382,4383,20
97,98,20
4392,4393,20
2060,2061,20
2583,2584,20
