# Setup

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

In [2]:
import numpy as np
import pandas as pd
from ml_metrics import mapk
from recmetrics import mark
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import pairwise_distances 

In [3]:
# reading users file:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('../Datasets/ml-100k/Text/u.user', sep='|', names=u_cols,encoding='latin-1')

# reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('../Datasets/ml-100k/Text/u.data', sep='\t', names=r_cols,encoding='latin-1')

# reading items file:
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('../Datasets/ml-100k/Text/u.item', sep='|', names=i_cols, encoding='latin-1')

In [4]:
print("\nUser Data :")
print("shape : ", users.shape)
users.head()


User Data :
shape :  (943, 5)


Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [5]:
print("\nRatings Data :")
print("shape : ", ratings.shape)
ratings.head()


Ratings Data :
shape :  (100000, 4)


Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [6]:
print("\nItem Data :")
print("shape : ", items.shape)
items.head()


Item Data :
shape :  (1682, 24)


Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
ratings_train = pd.read_csv('../Datasets/ml-100k/Text/ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('../Datasets/ml-100k/Text/ua.test', sep='\t', names=r_cols, encoding='latin-1')
ratings_train.shape, ratings_test.shape

((90570, 4), (9430, 4))

In [8]:
n_users = ratings.user_id.unique().shape[0]
n_items = ratings.movie_id.unique().shape[0]

# Matrix Factorization

In [9]:
data_matrix = np.zeros((n_users, n_items))
for line in ratings.itertuples():
    data_matrix[line[1]-1, line[2]-1] = line[3]

In [10]:
np.unique(data_matrix, return_counts=True)

(array([0., 1., 2., 3., 4., 5.]),
 array([1486126,    6110,   11370,   27145,   34174,   21201]))

In [11]:
user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

In [12]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #We use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [13]:
user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='item')

In [14]:
user_prediction.shape

(943, 1682)

## Evaluate

In [15]:
def box(array, threshold):
    temp = []
    for x in array:
        if(x >= threshold):
            temp.append(1)
        else:
            temp.append(0)
    return(temp)

In [16]:
def evaluate(y_true, y_pred):
    score = []
    for j in range(len(data_matrix)):
        include = [i for i in range(len(data_matrix[j])) if data_matrix[j][i] != 0]
        y_true_include = [y_true[j][i] for i in include]
        y_pred_include = [y_pred[j][i] for i in include]
        score.append([jaccard_score(y_true_include, y_pred_include, average="macro"), 
                      mark([y_true_include], [y_pred_include], k = 10),
                      mapk([y_true_include], [y_pred_include], k = 10)])
    print("Jaccard Score: {}\nMAR@10: {}\nMAP@10: {}".format(*np.mean(score, axis=0)))

In [17]:
y_true = [box(x, 4) for x in data_matrix]

In [18]:
threshold = np.percentile(user_prediction, 85)
y_pred = [box(x, threshold) for x in user_prediction]

In [19]:
evaluate(y_true, y_pred)

Jaccard Score: 0.3820967009364004
MAR@10: 0.0321625210702409
MAP@10: 0.1578575636688047


# Text based Model

In [20]:
items = pd.read_csv("../Datasets/ml-100k/Text/items.csv")

In [21]:
data_text = items['Summary']

In [22]:
import keras
from utils.autoencoder import Autoencoder_Text

AE = Autoencoder_Text()
AE.train(data_text, epochs = 100, batch_size = 100)

Using TensorFlow backend.
[nltk_data] Downloading package punkt to /home/sriram/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sriram/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/sriram/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequential_1 (Sequential)    (None, 400)               2276416   
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 240, 400)          0         
_________________________________________________________________
sequential_2 (Sequential)    (None, 240, 13672)        6444072   
Total params: 8,720,488
Trainable params: 8,720,488
Non-trainable params: 0
_________________________________________________________________




Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [23]:
# AE.enc_dec_model = keras.models.load_model('./pretrained/text_model')

In [24]:
inputs = np.squeeze(AE.pre_process(data_text), axis = 2)
movie_embeddings = AE.encoder.predict(inputs)

In [25]:
user_embeddings = np.dot(data_matrix, movie_embeddings)/np.array([np.sum(data_matrix, axis = 1)]).T

In [26]:
movie_embeddings.shape

(1682, 400)

In [27]:
user_embeddings.shape

(943, 400)

In [28]:
p1=np.sqrt(np.sum(user_embeddings**2,axis=1))[:,np.newaxis]
p2=np.sqrt(np.sum(movie_embeddings**2,axis=1))[np.newaxis,:]
predict_matrix = np.dot(user_embeddings, movie_embeddings.T) / (p1*p2)

In [29]:
predict_matrix.shape

(943, 1682)

In [30]:
predict_matrix

array([[0.99997469, 0.9999714 , 0.99998506, ..., 0.99998141, 0.99999534,
        0.99997482],
       [0.99998775, 0.99998542, 0.99999234, ..., 0.99996721, 0.99999494,
        0.99998772],
       [0.99998604, 0.99998377, 0.99999166, ..., 0.99997014, 0.9999958 ,
        0.99998613],
       ...,
       [0.99998533, 0.99998291, 0.999992  , ..., 0.99997115, 0.99999516,
        0.99998488],
       [0.99995762, 0.99995296, 0.99997372, ..., 0.99998944, 0.99999084,
        0.99995757],
       [0.99996535, 0.99996196, 0.99997938, ..., 0.99998706, 0.99999312,
        0.99996575]])

## Evaluate

In [31]:
threshold = np.percentile(predict_matrix, 50)
y_pred = [box(x, threshold) for x in predict_matrix]

In [32]:
evaluate(y_true, y_pred)

Jaccard Score: 0.29932596919653
MAR@10: 0.03337140810385215
MAP@10: 0.17366341800063934
