In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [1]:
import re
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import pickle
from pathlib import Path
import seaborn as sns

In [2]:
from movienet import MovieNet

Using TensorFlow backend.


In [3]:
from sklearn.model_selection import train_test_split

In [7]:
ratings_raw = pd.read_csv("ratings.csv")
movies_raw = pd.read_csv("movies.csv")

In [8]:
display(ratings_raw.head())
display(movies_raw.head())

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
ratings_train = ratings_raw.copy()

In [10]:
users_uniq = ratings_train.userId.unique()
user2idx = {o:i for i,o in enumerate(users_uniq)}
idx2user = {i:o for i,o in enumerate(users_uniq)}
ratings_train.userId = ratings_train.userId.apply(lambda x: user2idx[x])

movies_uniq = ratings_train.movieId.unique()
movie2idx = {o:i for i,o in enumerate(movies_uniq)}
idx2movie = {i:o for i,o in enumerate(movies_uniq)}
ratings_train.movieId = ratings_train.movieId.apply(lambda x: movie2idx[x])

n_users = int(ratings_train.userId.nunique())
n_movies = int(ratings_train.movieId.nunique())
n_users, n_movies

(610, 9724)

In [11]:
ratings_train.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,0,0,4.0,964982703
1,0,1,4.0,964981247
2,0,2,4.0,964982224
3,0,3,5.0,964983815
4,0,4,5.0,964982931


In [13]:
def save_obj(obj, name):  
    with open(Path("model")/str(name+'.pkl'), 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

save_obj(user2idx, "user2idx")
save_obj(idx2user, "idx2user")
save_obj(movie2idx, "movie2idx")
save_obj(idx2movie, "idx2movie")

In [14]:
movie_model = MovieNet(n_users, n_movies)
movie_model.build_model(emb_size=[50, 50], hl=[70, 10], drop=[0.4, 0.3])





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.



In [15]:
X = ratings_train.drop(['timestamp', 'rating'], axis=1)
y = ratings_train['rating']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)
len(X_train), len(X_valid), len(y_train), len(y_valid)

(80668, 20168, 80668, 20168)

In [16]:
len(X_train["movieId"].unique()), n_movies, n_movies - len(X_train["movieId"].unique())

(8976, 9724, 748)

In [17]:
miss_movies = ratings_train[~ratings_train.movieId.isin(X_train["movieId"].unique())]["movieId"].unique()

In [18]:
concat = pd.DataFrame()
for i in miss_movies:
    concat = concat.append(ratings_train[ratings_train.movieId == i].sample(1))

In [19]:
concat.head()

Unnamed: 0,userId,movieId,rating,timestamp
96,0,96,4.0,964980875
290,2,281,5.0,1306463708
294,2,285,5.0,1306463670
483,3,427,5.0,986849180
496,3,440,3.0,986848665


In [20]:
X_valid.drop(concat.index, axis=0, inplace=True)
y_valid.drop(concat.index, axis=0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [21]:
X_train = pd.concat([X_train, concat.drop(["rating", "timestamp"], axis=1)])
y_train = pd.concat([y_train, concat["rating"]])

In [22]:
len(X_train["movieId"].unique()), n_movies

(9724, 9724)

In [23]:
movie_model.fit(X_train, y_train, X_valid, y_valid, epochs=5, batch_size=512)


Train on 81416 samples, validate on 19420 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [24]:
movie_model.save_model(name="movie_model")

In [25]:
movie_model.fit(X_train, y_train, X_valid, y_valid, epochs=1, batch_size=512)

Train on 81416 samples, validate on 19420 samples
Epoch 1/1


In [26]:
movie_model.save_model(name="movie_model")

In [27]:
movie_model.fit(X_train, y_train, X_valid, y_valid, epochs=12, batch_size=128)

Train on 81416 samples, validate on 19420 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [28]:
movie_model.save_model(name="movie_model")