# IMDB

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np

In [None]:
# Load the ratings file. It has no header row, so the four column names are given
# explicitly; fields are separated by '::' and the file is parsed with the python engine.
dataset = pd.read_csv(
    '../data/MovieLens/ratings.dat',
    sep='::',
    header=None,
    engine='python',
    names=["User", "Movie", "Rating", "Timestamp"],
)

# Show the last rows as a quick check that the file was parsed into the expected columns.
dataset.tail()

In [None]:
%%time
# Normalize the Rating and create a Pivot with the Index as the UserId and the Movie as the Columns. The values come from the Rating, we fill the non-existent values.
dataset['Rating'] = dataset['Rating'] / 5.0
movieratings_dataframe = dataset.pivot(index='User',columns='Movie',values='Rating').fillna(0)

In [None]:
# The ratings cover 3706 unique movies in total, yet the movie IDs run up to 3952,
# so the pivot has fewer columns than the highest movie ID.
# To verify the unique count: len(np.unique(dataset['Movie']))
print(tuple(movieratings_dataframe.shape))

In [None]:
# Add a zero-filled column for every movie ID from 1 through the highest ID present in
# the ratings that does not already have a column. Existing columns are kept as they are.
highest_movie_id = int(dataset['Movie'].max())  # vectorised max instead of Python-level iteration
all_movie_ids = np.arange(1, highest_movie_id + 1)  # +1 because np.arange excludes its stop value
movieratings_dataframe = movieratings_dataframe.reindex(
    columns=movieratings_dataframe.columns.union(all_movie_ids),
    fill_value=0.0,
)

In [None]:
# All movie columns are required, including those for movies that have no reviews;
# print the table's shape to check the column count.
current_shape = movieratings_dataframe.shape
print(current_shape)

In [None]:
# Display the first five rows of the user-by-movie rating table.
movieratings_dataframe.head(n=5)

### Creating train-test data

In [None]:
# Take the rating table as a plain NumPy array (rows follow the frame's index,
# columns follow the frame's columns) for the positional indexing done below.
movieratings = movieratings_dataframe.values

In [None]:
# Number of ratings per user that will be moved into the test set.
test_ratings_per_user = 10

# X_test starts as all zeros with the same shape as the rating matrix.
X_test = np.zeros(movieratings.shape)

# X_train, y_train and y_test each start as an independent full copy of the rating matrix.
X_train, y_train, y_test = (movieratings.copy() for _ in range(3))

In [None]:
# Hold out up to `test_ratings_per_user` rated movies for every user: the selected
# ratings are set to 0 in X_train and copied into X_test.
# `ratedlist` collects, per user, the column indices of ALL movies that user rated
# (not only the held-out ones) so they can be checked later.
SPLIT_SEED = 42  # fixed seed so that a fresh run reproduces the same train/test split
split_rng = np.random.RandomState(SPLIT_SEED)  # local generator; the global NumPy RNG is not touched

ratedlist = []
for i in range(movieratings.shape[0]):
    # Column indices of the movies this user rated (rating greater than 0).
    ratedmovies = np.where(movieratings[i, :] > 0.0)[0]
    ratedlist.append(ratedmovies)

    # Sampling without replacement raises ValueError when more samples are requested
    # than there are rated movies, so clamp the request. A user with fewer ratings than
    # `test_ratings_per_user` therefore has all of their ratings held out.
    holdout_count = min(test_ratings_per_user, len(ratedmovies))
    index = split_rng.choice(len(ratedmovies), holdout_count, replace=False)
    heldout_movies = ratedmovies[index]

    X_train[i, heldout_movies] = 0  # hide the rating from the training input
    X_test[i, heldout_movies] = movieratings[i, heldout_movies]  # fancy indexing already yields a copy

In [None]:
# Persist the train/test arrays as .npy files under ../data.
np.save("../data/X_train.npy", X_train)
np.save("../data/y_train.npy", y_train)
np.save("../data/X_test.npy", X_test)
np.save("../data/y_test.npy", y_test)

# ratedlist holds one index array per user, and these generally differ in length, so they
# cannot form a regular ndarray: np.asarray(ratedlist) raises ValueError on NumPy >= 1.24.
# Store them in an explicit 1-D object array instead, one entry per user.
ratedlist_array = np.empty(len(ratedlist), dtype=object)
for user_position, rated_indices in enumerate(ratedlist):
    ratedlist_array[user_position] = rated_indices

# Object arrays are pickled by np.save; read back with np.load(..., allow_pickle=True).
np.save("../data/ratedlist.npy", ratedlist_array)

## Upload this dataset to the cloud

## Make sure this file is executable in one run