# Data Preparation
Here we import the MovieLens 100K dataset and split it into train, validation, and test datasets.

Afterwards, we write them as CSV files to be used by the models.

In [2]:

import sys
import argparse
from math import floor, ceil

import numpy as np
import pandas as pd


# Global Variables
# Directory that contains the dataset folder, and the folder name itself.
DATA_LOCATION = "../data"
DATA_FOLDER = "ml-100k"
# Full path to the dataset folder, built from the two pieces above.
DATA_PATH = '/'.join((DATA_LOCATION, DATA_FOLDER))
# NOTE(review): not used in the cells shown here; presumably the base name of
# the CSV files written later -- confirm.
DATA_OUTPUT_NAME = "ml-100k"


../data/ml-100k


### Import Data
Read and parse the user, rating, and movie files, then merge them into a single table. The columns we want are selected in the next section.

In [8]:

def _read_movielens(file_name, sep, names):
    """Read one header-less file from DATA_PATH, labelling its columns with `names`."""
    return pd.read_csv('/'.join([DATA_PATH, file_name]), sep=sep, names=names,
        encoding='latin-1')

# User
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = _read_movielens('u.user', sep='|', names=u_cols)

# Rating
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = _read_movielens('u.data', sep='\t', names=r_cols)

# Movie: descriptive columns followed by one column per genre name
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
m_cols_type = [
    'unknown', 'action', 'adventure', 'animation', 'childrens', 'comedy',
    'crime', 'documentary', 'drama', 'fantasy', 'noir', 'horror', 'musical',
    'mystery', 'romance', 'scifi', 'thriller', 'war', 'western',
]
movies = _read_movielens('u.item', sep='|', names=m_cols + m_cols_type)

# Merge: no keys are given, so pandas joins on the column names the two frames
# share (movie_id for the first merge, user_id for the second)
movie_ratings = movies.merge(ratings)
lens = movie_ratings.merge(users)


   movie_id  user_id sex
0         1      308   M
1         4      308   M
2         5      308   M
3         7      308   M
4         8      308   M


### Choose Columns
Select only a few columns: the user and movie IDs, plus one contextual variable for the user, gender (the `sex` column).

In [9]:

# Target (ratings) and the feature columns used by the models
y = lens['rating']
X = lens[['movie_id', 'user_id', 'sex']]

# Count parameter space: per-column number of distinct values and of non-null values
dct = {'nunique': X.nunique(), 'count': X.count()}
X_count = pd.concat(dct, axis=1)
features_p = sum(X_count['nunique'])   # total distinct values across the feature columns
col_m = X.shape[1]                     # number of feature columns

# Convert both to plain Python lists (X becomes a list of rows)
y = y.values.tolist()
X = X.values.tolist()

# Keep the full merged frame reachable under the name `data`
data = lens



### Split Data
We split the data into three datasets: train, validation, and test (80% / 10% / 10% by default).

In [26]:

def SplitData(X, y, split = [0.8, 0.1, 0.1], seed = 1337):
    """Shuffle paired samples and split them into train/validation/test sets.

    Parameters
    ----------
    X : sequence
        Feature rows; must support ``len`` and integer indexing.
    y : sequence
        Targets aligned with ``X``; must have the same length.
    split : sequence of float, optional
        Fractions for train, validation and test. Only the first two are used
        to place the cut points; the test set receives whatever remains.
    seed : int or None, optional
        When not ``None``, the global NumPy RNG is re-seeded with this value
        before shuffling, so the same seed yields the same split.

    Returns
    -------
    dict
        ``{'train': {'X': [...], 'Y': [...]}, 'valid': {...}, 'test': {...}}``
        with the rows of each part in shuffled order.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length.
    """
    n = len(X)
    if len(y) != n:
        raise ValueError(
            "X and y must have the same length (got %d and %d)" % (n, len(y)))

    if seed is not None:
        np.random.seed(seed)

    # Round before flooring: a sum such as 0.7 + 0.1 evaluates to
    # 0.7999999999999999, so a bare floor(n * fraction) can land one below the
    # intended cut point and leave a split short by one sample.
    i_train = int(floor(round(n * split[0], 8)))
    i_valid = int(floor(round(n * sum(split[:2]), 8)))

    # Shuffle index
    index = np.arange(n)
    np.random.shuffle(index)

    def _take(indices):
        # Pull the same positions from X and y so rows stay paired with targets.
        return {
            'X': [X[i] for i in indices],
            'Y': [y[i] for i in indices]}

    return {
        'train': _take(index[:i_train]),
        'valid': _take(index[i_train:i_valid]),
        'test' : _take(index[i_valid:]),
    }

def SplitDF(df, split = [0.8, 0.1, 0.1], seed = 1337):
    """Shuffle the rows of a DataFrame and split them into three DataFrames.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose rows are to be split.
    split : sequence of float, optional
        Fractions for train, validation and test. Only the first two are used
        to place the cut points; the test part receives whatever remains.
    seed : int or None, optional
        When not ``None``, the global NumPy RNG is re-seeded with this value
        before shuffling, so the same seed yields the same split.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, valid, test)``, each selected from ``df`` by position with
        ``iloc``, rows in shuffled order.
    """
    if seed is not None:
        np.random.seed(seed)

    n = df.shape[0]

    # Round before flooring: a sum such as 0.7 + 0.1 evaluates to
    # 0.7999999999999999, so a bare floor(n * fraction) can land one below the
    # intended cut point and leave a split short by one row.
    i_train = int(floor(round(n * split[0], 8)))
    i_valid = int(floor(round(n * sum(split[:2]), 8)))

    # Shuffle index
    index = np.arange(n)
    np.random.shuffle(index)

    return (
        df.iloc[index[:i_train]],
        df.iloc[index[i_train:i_valid]],
        df.iloc[index[i_valid:]],
    )


In [30]:

# Split the merged table using SplitDF's default fractions and seed
df1, df2, df3 = SplitDF(lens)

# One shape per line, in the order returned: train, validation, test
print(*(part.shape for part in (df1, df2, df3)), sep='\n')

(80000, 31)
(10000, 31)
(10000, 31)


In [20]:
# Scratch check: print the integers 0 through 9 as a list
print([*range(10)])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
