# IMDB

In [29]:
%matplotlib inline
import pandas as pd
import numpy as np

In [30]:
dataset = pd.read_csv('../data/MovieLens/ratings.csv',sep=',',header=None, engine='python', skiprows=1,
                     names=["User", "Movie", "Rating", "Timestamp"])

dataset.tail()

Unnamed: 0,User,Movie,Rating,Timestamp
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352
100835,610,170875,3.0,1493846415


In [33]:
%%time
# Normalize the Rating and create a Pivot with the Index as the UserId and the Movie as the Columns. The values come from the Rating, we fill the non-existent values.
dataset['Rating'] = dataset['Rating'] / 5.0
movieratings_dataframe = dataset.pivot(index='User',columns='Movie',values='Rating').fillna(0)

print(movieratings_dataframe)

Movie  1       2       3       4       5       6       7       8       9       \
User                                                                            
1       0.032   0.000   0.032     0.0     0.0   0.032    0.00     0.0     0.0   
2       0.000   0.000   0.000     0.0     0.0   0.000    0.00     0.0     0.0   
3       0.000   0.000   0.000     0.0     0.0   0.000    0.00     0.0     0.0   
4       0.000   0.000   0.000     0.0     0.0   0.000    0.00     0.0     0.0   
5       0.032   0.000   0.000     0.0     0.0   0.000    0.00     0.0     0.0   
...       ...     ...     ...     ...     ...     ...     ...     ...     ...   
606     0.020   0.000   0.000     0.0     0.0   0.000    0.02     0.0     0.0   
607     0.032   0.000   0.000     0.0     0.0   0.000    0.00     0.0     0.0   
608     0.020   0.016   0.016     0.0     0.0   0.000    0.00     0.0     0.0   
609     0.024   0.000   0.000     0.0     0.0   0.000    0.00     0.0     0.0   
610     0.040   0.000   0.00

In [34]:
# There are a total of 3706 unique movies, but the movie IDs go untill 3952
print(len(np.unique(dataset['Movie']))) # if you don't believe me
print(movieratings_dataframe.shape)

9724
(610, 9724)


In [35]:
movieratings_dataframe = movieratings_dataframe.reindex(movieratings_dataframe.columns.union(np.arange(1, max(dataset['Movie']))), axis=1, fill_value=0.0)

In [36]:
# We need all the columns, even if there are no reviews for those movies
print(movieratings_dataframe.shape)

(610, 193609)


In [37]:
movieratings_dataframe.tail()

Movie,1,2,3,4,5,6,7,8,9,10,...,193600,193601,193602,193603,193604,193605,193606,193607,193608,193609
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
606,0.02,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.02,0.016,0.016,0.0,0.0,0.0,0.0,0.0,0.0,0.032,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,0.024,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
610,0.04,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Creating train-test data

In [38]:
movieratings = np.asarray(movieratings_dataframe)

In [39]:
test_ratings_per_user = 10  # The amount of ratings of one user that we will copy to the Test-set
X_train = movieratings.copy() # Copy the whole list
X_test = np.zeros(movieratings.shape) # Make a list with only zeros
y_train = movieratings.copy() # Copy the whole list
y_test = movieratings.copy() # Copy the whole list

In [40]:
ratedlist = [] # This will contain all the movieratings in our test_set so we can check it later
for i in range(0, movieratings.shape[0]): # For every user, we have to set a few movieratings to 0
    ratedmovies = np.where(movieratings[i,:] > 0.0)[0] # Get a list of movies with a rating greater than 0
    ratedlist.append(ratedmovies)
    #print(ratedmovies)
    index = np.random.choice(len(ratedmovies), test_ratings_per_user, replace=False)
    
    X_train[i,ratedmovies[index]] = 0 # Set rating to 0 in training set
    X_test[i,ratedmovies[index]] = movieratings[i,ratedmovies[index]].copy() # Copy the value of the rating to the test set

In [41]:
np.save("../data/preprocessed_data/X_train.npy", X_train)
np.save("../data/preprocessed_data/y_train.npy", y_train)
np.save("../data/preprocessed_data/X_test.npy", X_test)
np.save("../data/preprocessed_data/y_test.npy", y_test)
np.save("../data/preprocessed_data/ratedlist.npy", np.asarray(ratedlist))

  return array(a, dtype, copy=False, order=order)


## Upload this dataset to the cloud

## Make sure this file is executable in one run