# Setup

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

In [2]:
import numpy as np
import pandas as pd
from ml_metrics import mapk
from recmetrics import mark
from sklearn.metrics import jaccard_score
# from sklearn.metrics import jaccard_similarity_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics.pairwise import pairwise_distances 

In [3]:
# reading users file:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('../Datasets/ml-100k/Text/u.user', sep='|', names=u_cols,encoding='latin-1')

# reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('../Datasets/ml-100k/Text/u.data', sep='\t', names=r_cols,encoding='latin-1')

# reading items file:
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('../Datasets/ml-100k/Text/u.item', sep='|', names=i_cols, encoding='latin-1')

In [4]:
print("\nUser Data :")
print("shape : ", users.shape)
users.head()


User Data :
shape :  (943, 5)


Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [5]:
print("\nRatings Data :")
print("shape : ", ratings.shape)
ratings.head()


Ratings Data :
shape :  (100000, 4)


Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [6]:
print("\nItem Data :")
print("shape : ", items.shape)
items.head()


Item Data :
shape :  (1682, 24)


Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
ratings_train = pd.read_csv('../Datasets/ml-100k/Text/ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('../Datasets/ml-100k/Text/ua.test', sep='\t', names=r_cols, encoding='latin-1')
ratings_train.shape, ratings_test.shape

((90570, 4), (9430, 4))

In [8]:
n_users = ratings.user_id.unique().shape[0]
n_items = ratings.movie_id.unique().shape[0]

In [9]:
data_matrix = np.zeros((n_users, n_items))
for line in ratings.itertuples():
    data_matrix[line[1]-1, line[2]-1] = line[3]

In [10]:
np.unique(data_matrix, return_counts=True)

(array([0., 1., 2., 3., 4., 5.]),
 array([1486126,    6110,   11370,   27145,   34174,   21201]))

# Matrix Factorization

In [11]:
user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

In [12]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #We use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [13]:
user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='item')

In [14]:
user_prediction.shape

(943, 1682)

## Evaluate

In [15]:
def box(array, threshold):
    temp = []
    for x in array:
        if(x >= threshold):
            temp.append(1)
        else:
            temp.append(0)
    return(temp)

In [16]:
def evaluate(y_true, y_pred, k = 5, data_matrix = data_matrix):
    score = []
    for j in range(len(data_matrix)):
        include = [i for i in range(len(data_matrix[j])) if data_matrix[j][i] != 0]
        y_true_include = [y_true[j][i] for i in include]
        y_pred_include = [y_pred[j][i] for i in include]
        score.append([jaccard_score(y_true_include, y_pred_include, average="micro"), 
                      mark([y_true_include], [y_pred_include], k = k),
                      mapk([y_true_include], [y_pred_include], k = k)])
    print("Jaccard Score: {}\nMAR@K: {}\nMAP@K: {}".format(*np.mean(score, axis=0)))

In [17]:
y_true = [box(x, 4) for x in data_matrix]

In [18]:
threshold = np.percentile(user_prediction, 50)
y_pred = [box(x, threshold) for x in user_prediction]

In [19]:
evaluate(y_true, y_pred)

Jaccard Score: 0.43119652353583593
MAR@K: 0.02246310122682545
MAP@K: 0.21888299752562523


# Textual Summaries

In [20]:
items = pd.read_csv("../Datasets/ml-100k/Text/items.csv")

In [21]:
data_text = items['Summary']

In [22]:
import keras
from utils.autoencoder import Autoencoder_Text

AE = Autoencoder_Text(data_text)
# AE.train(data_text, epochs = 10, batch_size = 100)

Using TensorFlow backend.
[nltk_data] Downloading package punkt to /home/sriram/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sriram/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/sriram/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
AE.enc_dec_model = keras.models.load_model('./pretrained/text_model')

In [24]:
inputs = np.squeeze(AE.pre_process(data_text), axis = 2)
movie_embeddings = AE.encoder.predict(inputs)

In [25]:
# user_embeddings = np.dot(data_matrix, movie_embeddings)/np.array([np.sum(data_matrix, axis = 1)]).T

In [26]:
# p1=np.sqrt(np.sum(user_embeddings**2,axis=1))[:,np.newaxis]
# p2=np.sqrt(np.sum(movie_embeddings**2,axis=1))[np.newaxis,:]
# text_similarity = np.dot(user_embeddings, movie_embeddings.T) / (p1*p2)

In [27]:
movie_similarity = 1 - pairwise_distances(movie_embeddings, metric='hamming')
text_similarity = predict(data_matrix, movie_similarity, type='item')

In [28]:
threshold = np.percentile(text_similarity, 90)
y_pred = [box(x, threshold) for x in text_similarity]
evaluate(y_true, y_pred)

Jaccard Score: 0.40302383373410006
MAR@K: 0.0280626267886118
MAP@K: 0.2677553905973834


# Meta Data

## Genres

In [29]:
movie_embeddings = items.iloc[:, 9:-7]

In [30]:
user_embeddings = np.dot(data_matrix, movie_embeddings)/np.array([np.sum(data_matrix, axis = 1)]).T

In [31]:
movie_similarity = 1 - pairwise_distances(movie_embeddings, metric='hamming')
genre_similarity = predict(data_matrix, movie_similarity, type='item')

In [32]:
genre_similarity.shape

(943, 1682)

In [33]:
threshold = np.percentile(genre_similarity, 50)
print(threshold)
y_pred = [box(x, threshold) for x in genre_similarity]
evaluate(y_true, y_pred)

0.13630215378212465
Jaccard Score: 0.3545390511587052
MAR@K: 0.019724662216981746
MAP@K: 0.20357723577235534


In [34]:
np.unique(np.array(y_pred)[0])

array([1])

In [35]:
print(*zip(y_true[1][:20], y_pred[1][:20]))

(1, 0) (0, 0) (0, 0) (0, 1) (0, 1) (0, 1) (0, 1) (0, 0) (0, 1) (0, 1) (0, 0) (0, 0) (1, 0) (1, 1) (0, 1) (0, 0) (0, 0) (0, 1) (0, 1) (0, 1)


# Crew

### Cast

In [36]:
def jaccard_similarity(list1, list2):
    s1 = set(list1)
    s2 = set(list2)
    return float(len(s1.intersection(s2)) / len(s1.union(s2)))

In [37]:
def get_cast(text):
    return(str(text).split("|"))

In [38]:
data_cast = items['Cast'].apply(get_cast)

In [39]:
movie_similarity = []
for i in data_cast:
    temp = []
    for j in data_cast:
        temp.append(jaccard_similarity(i, j))
    movie_similarity.append(temp)
    
movie_similarity = np.array(movie_similarity)

In [40]:
cast_similarity = predict(data_matrix, movie_similarity, type='item')

In [41]:
threshold = np.percentile(cast_similarity, 70)
# print(threshold)
y_pred = [box(x, threshold) for x in cast_similarity]
evaluate(y_true, y_pred)

Jaccard Score: 0.4119292572169637
MAR@K: 0.02697508815393729
MAP@K: 0.25343937787203813


In [42]:
print(*zip(y_true[1][:20], y_pred[1][:20]))

(1, 1) (0, 0) (0, 1) (0, 1) (0, 1) (0, 0) (0, 0) (0, 0) (0, 0) (0, 1) (0, 0) (0, 0) (1, 1) (1, 1) (0, 1) (0, 0) (0, 0) (0, 0) (0, 1) (0, 1)


### Director

In [43]:
data_dir = items['Director']

In [44]:
movie_similarity = []
for i in data_dir:
    temp = []
    for j in data_dir:
        temp.append(int(i == j))
    movie_similarity.append(temp)
    
movie_similarity = np.array(movie_similarity)

In [45]:
dir_similarity = np.nan_to_num( predict(data_matrix, movie_similarity, type='item') )

  pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])


In [46]:
threshold = np.percentile(dir_similarity, 60)
# print(threshold)
y_pred = [box(x, threshold) for x in dir_similarity]
evaluate(y_true, y_pred)

Jaccard Score: 0.42801355026250276
MAR@K: 0.019428715795154605
MAP@K: 0.19978791092258522


In [47]:
print(*zip(y_true[1][:20], y_pred[1][:20]))

(1, 1) (0, 1) (0, 1) (0, 1) (0, 1) (0, 1) (0, 1) (0, 1) (0, 1) (0, 1) (0, 1) (0, 1) (1, 1) (1, 1) (0, 1) (0, 1) (0, 1) (0, 1) (0, 1) (0, 1)


# Audio Features

In [48]:
movie_embeddings = pd.read_csv("../Datasets/ml-100k/Audio/embeddings.csv").drop(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'Unnamed: 0.1.1.1', 'Unnamed: 0.1.1.1.1', 'Unnamed: 0.1.1.1.1.1', 'Unnamed: 0.1.1.1.1.1.1'], axis=1).dropna(axis=1)

In [49]:
temp = items[items["YT-Trailer ID"].isin(list(movie_embeddings.columns))]

In [50]:
data_matrix_ = data_matrix[:, list(temp.index)]

In [51]:
movie_similarity = 1 - pairwise_distances(movie_embeddings.T, metric='hamming')
audio_similarity = predict(data_matrix_, movie_similarity, type='item')

In [57]:
threshold = np.percentile(audio_similarity, 90)
y_pred = [box(x, threshold) for x in audio_similarity]
evaluate(y_true, y_pred, data_matrix = data_matrix_, k=2)

Jaccard Score: 0.2269734259102294
MAR@K: 0.05946757041800027
MAP@K: 0.6566808059384942


# Aggregation

In [54]:
similarity = np.mean([text_similarity, genre_similarity, cast_similarity, dir_similarity], axis = 0)

In [55]:
threshold = np.percentile(similarity, 50)
# print(threshold)
y_pred = [box(x, threshold) for x in similarity]
evaluate(y_true, y_pred)

Jaccard Score: 0.423266225751047
MAR@K: 0.023719854784552637
MAP@K: 0.22381760339342321


In [56]:
print(*zip(y_true[1][:20], y_pred[1][:20]))

(1, 1) (0, 0) (0, 1) (0, 1) (0, 1) (0, 1) (0, 1) (0, 0) (0, 0) (0, 1) (0, 1) (0, 1) (1, 1) (1, 1) (0, 1) (0, 0) (0, 0) (0, 0) (0, 1) (0, 1)


## Evaluate

In [60]:
sim = ['Aggregation', 'Text', 'Genres', 'Cast', 'Director']
cases = {sim[0]:similarity, sim[1]:text_similarity, sim[2]:genre_similarity, sim[3]:cast_similarity, sim[4]:dir_similarity}

for case in sim:
    print("\nEvaluating {}....".format(case))
    threshold = np.percentile(cases[case], 50)
    y_pred = [box(x, threshold) for x in cases[case]]
    evaluate(y_true, y_pred, k=2)


Evaluating Aggregation....
Jaccard Score: 0.423266225751047
MAR@K: 0.022457015643940443
MAP@K: 0.5424178154825027

Evaluating Text....
Jaccard Score: 0.42801355026250276
MAR@K: 0.019428715795154605
MAP@K: 0.49946977730646874

Evaluating Genres....
Jaccard Score: 0.3545390511587052
MAR@K: 0.019635548729713424
MAP@K: 0.5063626723223754

Evaluating Cast....
Jaccard Score: 0.42801355026250276
MAR@K: 0.019428715795154605
MAP@K: 0.49946977730646874

Evaluating Director....
Jaccard Score: 0.42801355026250276
MAR@K: 0.019428715795154605
MAP@K: 0.49946977730646874
