In [35]:
from collections import defaultdict

import random
import numpy as np

import os
import os.path
import pandas as pd
import matplotlib.pyplot as plt
import statistics

In [36]:
#for reproducible results
random.seed(0)
np.random.seed(0)

In [37]:
#need to upload the zip file onto the colab session first
!unzip ml-100k.zip
MOVIELENS_DIR = "ml-100k"

Archive:  ml-100k.zip
replace ml-100k/u.item? [y]es, [n]o, [A]ll, [N]one, [r]ename: None


In [38]:
def getDF(dataset, file_name):
    #retrieve the required file from the zip file
    #this function only works for data files that contain the ratings since we're specifying the header names here.
    header_names = ['userID', 'itemID', 'rating', 'timestamp']
    data = pd.read_csv(os.path.join(dataset, file_name), sep='\t', names=header_names)
    return data 

In [39]:
#u.data is the file that contains all rating data.
rating_df = getDF(MOVIELENS_DIR, 'u.data')

In [40]:
rating_df.head(10)


Unnamed: 0,userID,itemID,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
5,298,474,4,884182806
6,115,265,2,881171488
7,253,465,5,891628467
8,305,451,3,886324817
9,6,86,3,883603013


In [41]:
rating_df.rating[rating_df['rating'] < 4] = int((float(0)))
rating_df.rating[rating_df['rating'] >= 4] = int((float(1)))

In [42]:
rating_df.head(10)

Unnamed: 0,userID,itemID,rating,timestamp
0,196,242,0,881250949
1,186,302,0,891717742
2,22,377,0,878887116
3,244,51,0,880606923
4,166,346,0,886397596
5,298,474,1,884182806
6,115,265,0,881171488
7,253,465,1,891628467
8,305,451,0,886324817
9,6,86,0,883603013


## train test split: 70/30 split based on each user's rating time


In [43]:
def time_ordered_train_test_split(rating_df, ratio=[0.7, 0.3]):
    num_users = len(set(rating_df['userID']))

    train_set = pd.DataFrame(columns=['userID', 'itemID', 'rating', 'timestamp'])
    test_set = pd.DataFrame(columns=['userID', 'itemID', 'rating', 'timestamp'])

    for i in range(num_users):
        user_df = rating_df[rating_df['userID'] == i]
        #sort by time of review
        user_df = user_df.sort_values(by = 'timestamp')

        #take the first part as train set, and the second part as test set
        user_df_train = user_df.head(int(len(user_df)*ratio[0]))
        user_df_test = user_df.tail(len(user_df) - int(len(user_df)*0.7))

        train_set = train_set.append(user_df_train)
        test_set = test_set.append(user_df_test)

    return train_set, test_set

In [44]:
train_df, test_df = time_ordered_train_test_split(rating_df, ratio=[0.7, 0.3])

In [45]:
train_df

Unnamed: 0,userID,itemID,rating,timestamp
59972,1,168,1,874965478
92487,1,172,1,874965478
74577,1,165,1,874965518
48214,1,156,1,874965556
22971,1,166,1,874965677
...,...,...,...,...
95675,942,478,1,891283017
93580,942,135,0,891283017
71794,942,193,1,891283043
78689,942,496,1,891283043


In [46]:
test_df

Unnamed: 0,userID,itemID,rating,timestamp
5682,1,49,0,878542478
24493,1,30,0,878542515
39865,1,131,0,878542552
6234,1,233,0,878542552
96699,1,152,1,878542589
...,...,...,...,...
98653,942,945,1,891283239
80481,942,97,1,891283239
78950,942,95,1,891283516
72746,942,31,1,891283517


### save processed train and test sets to .dat file 


In [48]:
train_df.to_csv("train.dat", sep = "\t", header=False, index = False)
test_df.to_csv("test.dat", sep = "\t", header=False, index = False)