In [2]:
import time

In [3]:
import pandas as pd
import numpy as np
init_notebook_mode(connected=True)
from collections import deque
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout
from keras.models import Model

import matplotlib.pyplot as plt
import plotly

from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go


Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.

Using TensorFlow backend.


In [5]:
movie_table = pd.read_csv('table_movie.csv', 
                           encoding = 'ISO-8859-1', 
                           header = 0, 
                           names = ['Name', 'Id', 'dontcare1','dontcare2','Year','dontcare3','dontcare4','Overview']).set_index('Id')
movie_titles=movie_table.drop(['dontcare1','dontcare2','dontcare3','dontcare4','Overview'],axis=1)

In [6]:
import pandas as pd                                                             

df=pd.read_csv('table_rate.csv',
               encoding = 'ISO-8859-1',
               header = 0,                                           
               names = ['Movie', 'User', 'Rating','dontcare'],
               dtype={'User': int, 'Movie':int, 'Rating':float}
              )
cols=['User', 'Rating', 'Movie']
df=df[cols]

preprocessing

In [7]:
min_movie_ratings = 100
filter_movies = (df['Movie'].value_counts()>min_movie_ratings)
filter_movies = filter_movies[filter_movies].index.tolist()

# Filter sparse users
min_user_ratings = 10
filter_users = (df['User'].value_counts()>min_user_ratings)
filter_users = filter_users[filter_users].index.tolist()

# get the final filtered movies
df_filterd = df[(df['Movie'].isin(filter_movies)) & (df['User'].isin(filter_users))]
del filter_movies, filter_users, min_movie_ratings, min_user_ratings

In [8]:
# Shuffle DataFrame
df_filterd = df_filterd.sample(frac=1).reset_index(drop=True)

# Testingsize
n = 100000

# Split train- & testset
df_train = df_filterd[:-n]
df_test = df_filterd[-n:]

In [9]:
# Create a user-movie matrix with empty values
df_p = df_train.pivot_table(index='User', columns='Movie', values='Rating')

In [10]:
# Load a movie metadata dataset
# Remove the long tail of rarly rated moves
# movie_metadata = movie_metadata[movie_metadata['vote_count']>10].drop('vote_count', axis=1)
movie_metadata = movie_table[['Name','Overview']]
movie_metadata=movie_metadata.set_index('Name')
# print('Shape Movie-Metadata:\t{}'.format(movie_metadata.shape))
movie_metadata.sample(10)
movie_metadata_sample=movie_metadata[0:8000]


#mean rating#

In [11]:
start_time = time.time()
# Top n movies
n = 10

# Compute mean rating for all movies
ratings_mean = df_p.mean(axis=0).sort_values(ascending=False).rename('Rating-Mean').to_frame()

# Count ratings for all movies
ratings_count = df_p.count(axis=0).rename('Rating-Count').to_frame()

# Combine ratings_mean, ratings_count and movie_titles
ranking_mean_rating = ratings_mean.head(n).join(ratings_count).join(movie_titles.drop(['Year'],axis=1))

# Join labels and predictions
df_prediction = df_test.set_index('Movie').join(ratings_mean)[['Rating', 'Rating-Mean']]
y_true = df_prediction['Rating']
y_pred = df_prediction['Rating-Mean']

# Compute RMSE
rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))


# Create trace
trace = go.Bar(x = ranking_mean_rating['Rating-Mean'],
               text = ranking_mean_rating['Name'].astype(str) +': '+ ranking_mean_rating['Rating-Count'].astype(str) + ' Ratings',
               textposition = 'outside',
               textfont = dict(color = '#000000'),
               orientation = 'h',
               y = list(range(1, n+1)),
               marker = dict(color = 'rgb(225,233,220)'))

# Create layout
layout = dict(title = 'Ranking Of Top {} Mean-Movie-Ratings: {:.4f} RMSE'.format(n, rmse),
              xaxis = dict(title = 'Mean-Rating',
                          range = (4, 5)),
              yaxis = dict(title = 'Movie'))
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

print(" ---mean rating method used %s seconds ---" % (time.time() - start_time))


 ---mean rating method used 4.050964593887329 seconds ---


weighted-rating

In [12]:
start_time = time.time()
# at least voted for 10 times!
m = 10

# calculate the weighted
C = df_p.stack().mean()
R = df_p.mean(axis=0).values
v = df_p.count().values
weighted_score = (v/ (v+m) *R) + (m/ (v+m) *C)


# id-rank
weighted_ranking = np.argsort(weighted_score)[::-1]
# score-rank
weighted_score = np.sort(weighted_score)[::-1]
weighted_movie_ids = df_p.columns[weighted_ranking]

# get the predictions we want
df_prediction = df_test.set_index('Movie').join(pd.DataFrame(weighted_score, index=weighted_movie_ids, columns=['Prediction']))[['Rating', 'Prediction']]
y_true = df_prediction['Rating']
y_pred = df_prediction['Prediction']

# get the loss
RMSE = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))


# Create DataFrame for plotting
df_plot = pd.DataFrame(weighted_score[:n], columns=['Rating'])
df_plot.index = weighted_movie_ids[:10]
ranking_weighted_rating = df_plot.join(ratings_count).join(movie_titles)
del df_plot

# draw the plot!
# Create trace
trace = go.Bar(x = ranking_weighted_rating['Rating'],
               text = ranking_weighted_rating['Name'].astype(str) +': '+ ranking_weighted_rating['Rating-Count'].astype(str) + ' Ratings',
               textposition = 'outside',
               textfont = dict(color = '#000000'),
               orientation = 'h',
               y = list(range(1, n+1)),
               marker = dict(color = 'rgb(225,233,220)'))
# Create layout
layout = dict(title = 'Ranking Of Top {} Weighted-Movie-Ratings: RMSE={:.4f} '.format(n, RMSE),
              xaxis = dict(title = 'Weighted Rating',
                          range = (4, 4.6)),
              yaxis = dict(title = 'Movie'))
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)
# print the time consumed!
print("         --- weighted rating method used %s seconds ---" % (time.time() - start_time))


         --- weighted rating method used 0.8274047374725342 seconds ---


similarity

In [13]:
start_time=time.time()

# target user 
user_index = 2
# target recommendation number
n_recommendation = 5
# get recommendation numbers
n_plot = 10

#preprocessings
df_p_imputed = df_p.T.fillna(df_p.mean(axis=1)).T


# Compute similarity between all users
similarity = cosine_similarity(df_p_imputed.values)
similarity -= np.eye(similarity.shape[0])


# Sort similar users by index
similar_user_index = np.argsort(similarity[user_index])[::-1]
# Sort similar users by score
similar_user_score = np.sort(similarity[user_index])[::-1]


# Get unrated movies
unrated_movies = df_p.iloc[user_index][df_p.iloc[user_index].isna()].index
mean_movie_recommendations = (df_p_imputed.iloc[similar_user_index[:n_recommendation]].T * similar_user_score[:n_recommendation]).T.mean(axis=0)



# recommend the friends!
print("Your recommended users are: ", similar_user_index[:n_recommendation])
# Filter for unrated movies and sort results
best_movie_recommendations = mean_movie_recommendations[unrated_movies].sort_values(ascending=False).to_frame().join(movie_titles)


# get the users
user_id_mapping = {id:i for i, id in enumerate(df_p_imputed.index)}

prediction = []


for user_id in df_test['User'].unique():
    
# sort the  similar users by index
    similar_user_index = np.argsort(similarity[user_id_mapping[user_id]])[::-1]
# sort similar users by score
    similar_user_score = np.sort(similarity[user_id_mapping[user_id]])[::-1]
    
    for movie_id in df_test[df_test['User']==user_id]['Movie'].values:

        # Compute predicted score
        score = (df_p_imputed.iloc[similar_user_index[:n_recommendation]][movie_id] * similar_user_score[:n_recommendation]).values.sum() / similar_user_score[:n_recommendation].sum()
        prediction.append([user_id, movie_id, score])
        

# get the predictions dataframe
df_pred = pd.DataFrame(prediction, columns=['User', 'Movie', 'Prediction']).set_index(['User', 'Movie'])
df_pred = df_test.set_index(['User', 'Movie']).join(df_pred)

# Get labels and predictions
y_true = df_pred['Rating'].values
y_pred = df_pred['Prediction'].values

# plot them

# Create trace
trace = go.Bar(x = best_movie_recommendations.iloc[:n_plot, 0],
               text = best_movie_recommendations['Name'],
               textposition = 'inside',
               textfont = dict(color = '#000000'),
               orientation = 'h',
               y = list(range(1, n_plot+1)),
               marker = dict(color = 'rgb(225,233,220)'))
# Create layout
layout = dict(title = 'Ranking Of Top {} Recommended Movies For A User Based On User Similarity'.format(n_plot),
              xaxis = dict(title = 'Recommendation-Rating',
                           range = (4.1, 4.5)),
              yaxis = dict(title = 'Movie'))
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)
print("          --- user-user similarity used %s seconds ---" % (time.time() - start_time))


Your recommended users are:  [5605 8805 4099 3383  223]



invalid value encountered in double_scalars



          --- user-user similarity used 85.25116229057312 seconds ---


TFIDF MovieDescription Similarity

In [14]:
start_time=time.time()

tfidf=TfidfVectorizer(stop_words='english')
#presumed to be uninformative in representing the context of a text
#avoid to make them used in prediction
#automatically apply a stopwords list for english language
tfidf_matrix=tfidf.fit_transform(raw_documents=movie_metadata_sample['Overview'].dropna())
# learn vocabulary and idf
# dropna is a pandas method, means remove the missing values https://pandas.pydata.org/pandas-docs/stable/missing_data.html#missing-data
similarity = cosine_similarity(tfidf_matrix)
# cosine similarity between all movie descriptions
similarity -= np.eye(similarity.shape[0])
#remove self-similarity


movie = 'Casino'
n_plot = 10
index = movie_metadata.reset_index(drop=True)[movie_metadata.index==movie].index[0]
#find the index of the movie chosen
#drop=Ture means to avoid the old index being added as a column, This resets the index to the default integer index.

# Get indices and scores of similar movies
similar_movies_index = np.argsort(similarity[index])[::-1][:n_plot]
#return the indices that would sort the array, from large to small,choose the n_plot ones
similar_movies_score = np.sort(similarity[index])[::-1][:n_plot]
#return the simlarity scores

# Get titles of similar movies
similar_movie_titles = movie_metadata.iloc[similar_movies_index].index
#select by pure integer location indexing

#plotly bar charts in pandas
# Create trace
trace = go.Bar(x = similar_movies_score,
               text = similar_movie_titles,
               textposition = 'inside',
               textfont = dict(color = '#000000'),
               orientation = 'h',
               y = list(range(1, n_plot+1)),
               marker = dict(color = 'rgb(225,233,220)'))
# Create layout
layout = dict(title = 'the Top {} Most Similar Movies based on Overview For "{}"'.format(n_plot, movie),
              xaxis = dict(title = 'Cosine TFIDF Overview Similarity',
                           range = (0.1, 0.5)),
              yaxis = dict(title = 'Movie'))
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)
print("--- TFIDF on movie overview used %s seconds ---" % (time.time() - start_time))


--- TFIDF on movie overview used 1.9144747257232666 seconds ---


In [15]:
# TFIDF according to the cast
start_time=time.time()
# Load a movie metadata dataset
director = pd.read_csv('table_director.csv', 
                           encoding = 'ISO-8859-1', 
                           header = 0, 
                           names = ['dName', 'dId', 'dontcare']).set_index('dId')
director=director.drop(['dontcare'],axis=1)

movie_dId=pd.read_csv('table_direct.csv', 
                           encoding = 'ISO-8859-1', 
                           header = 0, 
                           names = ['Id', 'dId']).set_index('Id')
movie_dName=movie_dId['dId'].map(lambda x: director['dName'][x]).to_frame(name='dName')

cast=pd.read_csv('table_mov_cast.csv', 
                           encoding = 'ISO-8859-1', 
                           header = 0, 
                           names = ['cName', 'cId', 'dontcare']).set_index('cId')
cast=cast.drop(['dontcare'],axis=1)

movie_cId=pd.read_csv('table_act.csv', 
                           encoding = 'ISO-8859-1', 
                           header = 0, 
                           names = ['Id', 'role', 'cId']).set_index('Id')
movie_cName=movie_cId['cId'].map(lambda x: cast['cName'][x]).to_frame(name='cName')

 
genre= pd.read_csv('table_genre.csv', 
                           encoding = 'ISO-8859-1', 
                           header = 0, 
                           names = ['gName', 'gId']).set_index('gId')
movie_gId=pd.read_csv('table_belong_to.csv', 
                           encoding = 'ISO-8859-1', 
                           header = 0, 
                           names = ['Id', 'gId']).set_index('Id')
movie_gName=movie_gId['gId'].map(lambda x: genre['gName'][x]).to_frame(name='gName')

general = pd.merge(movie_dName, movie_gName, left_index=True, right_index=True)
general = pd.merge(general, movie_cName, left_index=True, right_index=True)
general['all']=general['dName']+' '+general['dName']+' '+general['dName']+' '+general['cName']+' '+general['gName']+' '+general['gName']
#three times for director, twice for genre

tfidf=TfidfVectorizer(stop_words='english')
#presumed to be uninformative in representing the context of a text
#avoid to make them used in prediction
#automatically apply a stopwords list for english language
tfidf_matrix=tfidf.fit_transform(raw_documents=general['all'][:8000].dropna())
# learn vocabulary and idf
# dropna is a pandas method, means remove the missing values https://pandas.pydata.org/pandas-docs/stable/missing_data.html#missing-data
similarity = cosine_similarity(tfidf_matrix)
# cosine similarity between all movie descriptions
similarity -= np.eye(similarity.shape[0])
#remove self-similarity


movie = 'Toy Story'
n_plot = 10
index = movie_metadata.reset_index(drop=True)[movie_metadata.index==movie].index[0]
#find the index of the movie chosen
#drop=Ture means to avoid the old index being added as a column, This resets the index to the default integer index.

# Get indices and scores of similar movies
similar_movies_index = np.argsort(similarity[index])[::-1][:n_plot]
#return the indices that would sort the array, from large to small,choose the n_plot ones
similar_movies_score = np.sort(similarity[index])[::-1][:n_plot]
#return the simlarity scores

# Get titles of similar movies
similar_movie_titles = movie_metadata.iloc[similar_movies_index].index
#select by pure integer location indexing

#plotly bar charts in pandas
# Create trace
trace = go.Bar(x = similar_movies_score,
               text = similar_movie_titles,
               textposition = 'inside',
               textfont = dict(color = '#000000'),
               orientation = 'h',
               y = list(range(1, n_plot+1)),
               marker = dict(color = 'rgb(225,233,220)'))
# Create layout
layout = dict(title = 'Ranking Of Top {} Most Similar Movie based on Cast For "{}"'.format(n_plot, movie),
              xaxis = dict(title = 'Cosine TFIDF Cast Similarity',
                           range = (0.8, 1)),
              yaxis = dict(title = 'Movie'))
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)
print("   --- TFIDF on casting method used %s seconds ---" % (time.time() - start_time))


   --- TFIDF on casting method used 11.485305547714233 seconds ---


In [16]:
movie_metadata = movie_table[['Name','Overview']]
movie_metadata=movie_metadata.set_index('Name')
# print('Shape Movie-Metadata:\t{}'.format(movie_metadata.shape))
movie_metadata.sample(10)
movie_metadata_sample=movie_metadata[0:8000]

Matrix factorisation
keras and gradient descent

In [17]:
start_time=time.time()

# Create user- & movie-id mapping
user_id_mapping = {id:i for i, id in enumerate(df_filterd['User'].unique())}
movie_id_mapping = {id:i for i, id in enumerate(df_filterd['Movie'].unique())}
#.unique() return the unique values
#enumerate loops over something and have an automatic counter

# Create correctly mapped train- & testset
train_user_data = df_train['User'].map(user_id_mapping)
train_movie_data = df_train['Movie'].map(movie_id_mapping)

test_user_data = df_test['User'].map(user_id_mapping)
test_movie_data = df_test['Movie'].map(movie_id_mapping)
#Map values of Series using input correspondence 


# Get input variable-sizes
users = len(user_id_mapping)
movies = len(movie_id_mapping)
embedding_size = 10


##### Create model
# Set input layers
user_id_input = Input(shape=[1], name='user')
movie_id_input = Input(shape=[1], name='movie')
#initiate a keras tensor

# Create embedding layers,which is also the first layers, for users and movies
# embedding Turns positive integers (indexes) into dense vectors of fixed size.
user_embedding = Embedding(output_dim=embedding_size, 
                           input_dim=users,
                           input_length=1, 
                           name='user_embedding')(user_id_input)
movie_embedding = Embedding(output_dim=embedding_size, 
                            input_dim=movies,
                            input_length=1, 
                            name='item_embedding')(movie_id_input)

# Reshape the embedding layers to the embedding_size
user_vector = Reshape([embedding_size])(user_embedding)
movie_vector = Reshape([embedding_size])(movie_embedding)

# Compute dot-product of reshaped embedding layers as prediction
# dot is the layer that computes a dot product between samples in two tensors.
y = Dot(1, normalize=False)([user_vector, movie_vector])
#take dot product along axis1
#

# Setup model
model = Model(inputs=[user_id_input, movie_id_input], outputs=y)
#initialize a Model using the input tensor and output tensor,
# this model will include all layers required in the computation of output given input

model.compile(loss='mse', optimizer='adam')
#configures the model for training
# loss function is defined to be mean squared error
# optimizer is defined to be adam optimizer

# Fit model
model.fit([train_user_data, train_movie_data],
          df_train['Rating'],
          batch_size=256, 
          epochs=1,
          validation_split=0.1,
          shuffle=True)
# training data, two inputs in a list of numpy arrays.
#target data is df_train['Rating']
#train the model for a given number epochs, which is 1 here
#batch_size of samples per gradient update

# Test model
y_pred = model.predict([test_user_data, test_movie_data])
# generate output predictions for the input samples test_user_data and test_movie_data
y_true = df_test['Rating'].values
# ground truth as given

#  Compute RMSE
rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('\n\nTesting Result With Keras Matrix-Factorization: RMSE={:.4f} RMSE'.format(rmse))

print("-------matrix factorization and gradient descent used  %s seconds ---" % (time.time() - start_time))


Train on 608460 samples, validate on 67607 samples
Epoch 1/1


Testing Result With Keras Matrix-Factorization: RMSE=1.4216 RMSE
-------matrix factorization and gradient descent used  8.387321472167969 seconds ---


Deep learning with Keras

In [18]:
start_time=time.time()

# Setup variables
user_embedding_size = 20
movie_embedding_size = 10


##### Create model
# Set input layers
# create input tensors
user_id_input = Input(shape=[1], name='user')
movie_id_input = Input(shape=[1], name='movie')

# Create embedding layers for users and movies
user_embedding = Embedding(output_dim=user_embedding_size, 
                           input_dim=users,
                           input_length=1, 
                           name='user_embedding')(user_id_input)
movie_embedding = Embedding(output_dim=movie_embedding_size, 
                            input_dim=movies,
                            input_length=1, 
                            name='item_embedding')(movie_id_input)

# Reshape the embedding layers
user_vector = Reshape([user_embedding_size])(user_embedding)
movie_vector = Reshape([movie_embedding_size])(movie_embedding)


# Concatenate the reshaped embedding layers
concat = Concatenate()([user_vector, movie_vector])

# Combine with dense layers
dense = Dense(256)(concat)
# output 256, input concat
y = Dense(1)(dense)
# output 1, input 256

# Setup model
model = Model(inputs=[user_id_input, movie_id_input], outputs=y)
model.compile(loss='mse', optimizer='adam')


# Fit model
model.fit([train_user_data, train_movie_data],
          df_train['Rating'],
          batch_size=256, 
          epochs=1,
          validation_split=0.1,
          shuffle=True)

# Test model
y_pred = model.predict([test_user_data, test_movie_data])
y_true = df_test['Rating'].values

#  Compute RMSE
rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('\n\nTesting Result With Keras Deep Learning: RMSE={:.4f} '.format(rmse))
print("---Keras deep learning method used %s seconds ---" % (time.time() - start_time))


Train on 608460 samples, validate on 67607 samples
Epoch 1/1


Testing Result With Keras Deep Learning: RMSE=0.9026 
---Keras deep learning method used 11.417484045028687 seconds ---
