# Setup

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

In [2]:
import numpy as np
import pandas as pd
from ml_metrics import mapk
from recmetrics import mark
from sklearn.metrics import jaccard_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics.pairwise import pairwise_distances 

In [3]:
# reading users file:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('../Datasets/ml-100k/Text/u.user', sep='|', names=u_cols,encoding='latin-1')

# reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('../Datasets/ml-100k/Text/u.data', sep='\t', names=r_cols,encoding='latin-1')

# # reading items file:
items = pd.read_csv("../Datasets/ml-100k/Text/items.csv")

In [4]:
print("\nUser Data :")
print("shape : ", users.shape)
users.head()


User Data :
shape :  (943, 5)


Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [5]:
print("\nRatings Data :")
print("shape : ", ratings.shape)
ratings.head()


Ratings Data :
shape :  (100000, 4)


Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [6]:
print("\nItem Data :")
print("shape : ", items.shape)
items.head()


Item Data :
shape :  (1682, 35)


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,movie id,movie title,release date,video release date,IMDb URL,unknown,...,Thriller,War,Western,Summary,Cast,Director,Rating,Runtime,No. of ratings,YT-Trailer ID
0,0,0,0,0,1,Toy Story (1995),01-Jan-95,,https://www.imdb.com/title/tt0114709/,0,...,0,0,0,A little boy named Andy loves to be in his roo...,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,8.3,81.0,930K,mGXHH9iAfLA
1,1,1,1,1,2,GoldenEye (1995),01-Jan-95,,https://www.imdb.com/title/tt0113189/,0,...,1,0,0,When a deadly satellite weapon system falls in...,Pierce Brosnan|Sean Bean|Izabella Scorupco|Fam...,Martin Campbell,7.2,130.0,248K,IWFtzQOX_nY
2,2,2,2,2,3,Four Rooms (1995),01-Jan-95,,https://www.imdb.com/title/tt0113101/,0,...,1,0,0,This movie features the collaborative director...,Sammi Davis|Amanda De Cadenet|Valeria Golino|M...,Allison Anders,6.8,98.0,102K,hugpRyTc1fE
3,3,3,3,3,4,Get Shorty (1995),01-Jan-95,,https://www.imdb.com/title/tt0113161/,0,...,0,0,0,"Some guys get all the luck, whether they like ...",John Travolta|Gene Hackman|Rene Russo|Danny De...,Barry Sonnenfeld,6.9,105.0,81K,yvsnu3crV8g
4,4,4,4,4,5,Copycat (1995),01-Jan-95,,https://www.imdb.com/title/tt0112722/,0,...,1,0,0,"In San Francisco, the criminal psychologist He...",Sigourney Weaver|Holly Hunter|Dermot Mulroney|...,Jon Amiel,6.6,123.0,57K,gMrMLbxEt5Y


In [7]:
ratings_train = pd.read_csv('../Datasets/ml-100k/Text/ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('../Datasets/ml-100k/Text/ua.test', sep='\t', names=r_cols, encoding='latin-1')
ratings_train.shape, ratings_test.shape

((90570, 4), (9430, 4))

In [8]:
n_users = ratings.user_id.unique().shape[0]
n_items = ratings.movie_id.unique().shape[0]

In [9]:
data_matrix = np.zeros((n_users, n_items))
for line in ratings.itertuples():
    data_matrix[line[1]-1, line[2]-1] = line[3]

In [10]:
np.unique(data_matrix, return_counts=True)

(array([0., 1., 2., 3., 4., 5.]),
 array([1486126,    6110,   11370,   27145,   34174,   21201]))

In [11]:
movie_embeddings = pd.read_csv("../Datasets/ml-100k/Audio/embeddings.csv").drop(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'Unnamed: 0.1.1.1', 'Unnamed: 0.1.1.1.1', 'Unnamed: 0.1.1.1.1.1', 'Unnamed: 0.1.1.1.1.1.1'], axis=1).dropna(axis=1)
temp = items[items["YT-Trailer ID"].isin(list(movie_embeddings.columns))]
items = items.iloc[list(temp.index)]
data_matrix = data_matrix[:, list(temp.index)]

# Matrix Factorization

In [12]:
user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

In [13]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #We use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [14]:
user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='item')

In [15]:
user_prediction.shape

(943, 174)

## Evaluate

In [16]:
def box(array, percentile):
    threshold = np.percentile(array, percentile)
    ret = np.where(array>threshold, 1, 0)
    return(ret)

In [17]:
def evaluate(y_true, y_pred, k = 3, percentile = 90, data_matrix = data_matrix):
    score = []
    y_true_box = box(y_true, percentile)
    y_pred_box = box(y_pred, percentile)
    for j in range(len(data_matrix)):
        include = [i for i in range(len(data_matrix[j])) if data_matrix[j][i] != 0]
        y_true_include = [y_true[j][i] for i in include]
        y_pred_include = [y_pred[j][i] for i in include]
        y_true_include_box = [y_true_box[j][i] for i in include]
        y_pred_include_box = [y_pred_box[j][i] for i in include]
        score.append([jaccard_score(y_true_include_box, y_pred_include_box, average="micro"), 
                      mark([y_true_include], [y_pred_include], k = k),
                      mapk([y_true_include], [y_pred_include], k = k)])
    print("Jaccard Score: {}\nMAR@K: {}\nMAP@K: {}".format(*np.mean(score, axis=0)))

In [18]:
y_true = data_matrix
y_pred = user_prediction

In [20]:
evaluate(y_true, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


Jaccard Score: 0.4455022604528126
MAR@K: 0.0
MAP@K: 0.1357370095440085


# Textual Summaries

In [21]:
data_text = items['Summary']

In [22]:
import keras
from utils.autoencoder import Autoencoder_Text

AE = Autoencoder_Text(data_text)
# AE.train(data_text, epochs = 10, batch_size = 100)

Using TensorFlow backend.
[nltk_data] Downloading package punkt to /home/sriram/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sriram/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/sriram/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
AE.enc_dec_model = keras.models.load_model('./pretrained/text_model')

In [24]:
inputs = np.squeeze(AE.pre_process(data_text), axis = 2)
movie_embeddings = AE.encoder.predict(inputs)

In [25]:
# user_embeddings = np.dot(data_matrix, movie_embeddings)/np.array([np.sum(data_matrix, axis = 1)]).T

In [26]:
# p1=np.sqrt(np.sum(user_embeddings**2,axis=1))[:,np.newaxis]
# p2=np.sqrt(np.sum(movie_embeddings**2,axis=1))[np.newaxis,:]
# text_similarity = np.dot(user_embeddings, movie_embeddings.T) / (p1*p2)

In [27]:
movie_similarity = 1 - pairwise_distances(movie_embeddings, metric='hamming')
text_similarity = predict(data_matrix, movie_similarity, type='item')

In [28]:
y_pred = text_similarity
evaluate(y_true, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


Jaccard Score: 0.40584294334801385
MAR@K: 0.10602131005358482
MAP@K: 0.6644279486273115


# Meta Data

## Genres

In [29]:
movie_embeddings = items.iloc[:, 9:-7]

In [30]:
user_embeddings = np.dot(data_matrix, movie_embeddings)/np.array([np.sum(data_matrix, axis = 1)]).T

  user_embeddings = np.dot(data_matrix, movie_embeddings)/np.array([np.sum(data_matrix, axis = 1)]).T


In [31]:
movie_similarity = 1 - pairwise_distances(movie_embeddings, metric='hamming')
genre_similarity = predict(data_matrix, movie_similarity, type='item')

In [32]:
genre_similarity.shape

(943, 174)

In [33]:
y_pred = genre_similarity
evaluate(y_true, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


Jaccard Score: 0.4609932069761751
MAR@K: 0.0
MAP@K: 0.1357370095440085


In [34]:
print(*zip(y_true[1][:20], y_pred[1][:20]))

# Crew

### Cast

In [35]:
def jaccard_similarity(list1, list2):
    s1 = set(list1)
    s2 = set(list2)
    return float(len(s1.intersection(s2)) / len(s1.union(s2)))

In [36]:
def get_cast(text):
    return(str(text).split("|"))

In [37]:
data_cast = items['Cast'].apply(get_cast)

In [38]:
movie_similarity = []
for i in data_cast:
    temp = []
    for j in data_cast:
        temp.append(jaccard_similarity(i, j))
    movie_similarity.append(temp)
    
movie_similarity = np.array(movie_similarity)

In [39]:
cast_similarity = predict(data_matrix, movie_similarity, type='item')

In [40]:
y_pred = cast_similarity
evaluate(y_true, y_pred)

Jaccard Score: 0.33664134785382155
MAR@K: 0.025699497565852502
MAP@K: 0.24867444326617152


In [41]:
print(*zip(y_true[1][:20], y_pred[1][:20]))

(4.0, 2.9999999999999996) (0.0, 0.0) (0.0, 0.0) (0.0, 0.0) (0.0, 0.4) (0.0, 0.0) (0.0, 0.0) (2.0, 1.7999999999999998) (0.0, 0.0) (0.0, 0.4) (4.0, 4.0) (0.0, 0.4166666666666667) (0.0, 0.0) (0.0, 0.0) (0.0, 0.0) (3.0, 3.0) (0.0, 0.0) (0.0, 0.0) (0.0, 0.0) (0.0, 0.0)


### Director

In [42]:
data_dir = items['Director']

In [43]:
movie_similarity = []
for i in data_dir:
    temp = []
    for j in data_dir:
        temp.append(int(i == j))
    movie_similarity.append(temp)
    
movie_similarity = np.array(movie_similarity)

In [44]:
dir_similarity = np.nan_to_num( predict(data_matrix, movie_similarity, type='item') )

  pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])


In [45]:
y_pred = dir_similarity
evaluate(y_true, y_pred)

Jaccard Score: 0.31024054480577834
MAR@K: 0.0902599476610703
MAP@K: 0.626193001060444


In [46]:
print(*zip(y_true[1][:20], y_pred[1][:20]))

(4.0, 4.0) (0.0, 0.0) (0.0, 0.0) (0.0, 0.0) (0.0, 0.0) (0.0, 0.0) (0.0, 0.0) (2.0, 2.0) (0.0, 0.0) (0.0, 0.0) (4.0, 2.0) (0.0, 0.0) (0.0, 0.0) (0.0, 0.0) (0.0, 0.0) (3.0, 3.0) (0.0, 0.0) (0.0, 0.0) (0.0, 0.0) (0.0, 0.0)


# Audio Features

In [47]:
movie_embeddings = pd.read_csv("../Datasets/ml-100k/Audio/embeddings.csv").drop(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'Unnamed: 0.1.1.1', 'Unnamed: 0.1.1.1.1', 'Unnamed: 0.1.1.1.1.1', 'Unnamed: 0.1.1.1.1.1.1'], axis=1).dropna(axis=1)

In [48]:
# temp = items[items["YT-Trailer ID"].isin(list(movie_embeddings.columns))]

In [49]:
# data_matrix_ = data_matrix[:, list(temp.index)]

In [50]:
movie_similarity = 1 - pairwise_distances(movie_embeddings.T, metric='hamming')
audio_similarity = predict(data_matrix, movie_similarity, type='item')

In [51]:
y_pred = audio_similarity
evaluate(y_true, y_pred, data_matrix = data_matrix, k=2)

  _warn_prf(average, modifier, msg_start, len(result))


Jaccard Score: 0.8642629904559915
MAR@K: 0.11453127630691487
MAP@K: 0.8700954400848356


In [52]:
print(*zip(y_true[1][:20], y_pred[1][:20]))

(4.0, 4.0) (0.0, 0.0) (0.0, 0.0) (0.0, 0.0) (0.0, 0.0) (0.0, 0.0) (0.0, 0.0) (2.0, 2.0) (0.0, 0.0) (0.0, 0.0) (4.0, 4.0) (0.0, 0.0) (0.0, 0.0) (0.0, 0.0) (0.0, 0.0) (3.0, 3.0) (0.0, 0.0) (0.0, 0.0) (0.0, 0.0) (0.0, 0.0)


# Aggregation

In [53]:
similarity = np.mean([text_similarity, genre_similarity, cast_similarity, dir_similarity, audio_similarity], axis = 0)

In [54]:
y_pred = similarity
evaluate(y_true, y_pred)

Jaccard Score: 0.35372043160824057
MAR@K: 0.0
MAP@K: 0.1357370095440085


In [55]:
print(*zip(y_true[1][:20], y_pred[1][:20]))

(4.0, 3.043106796116505) (0.0, 0.041429114405169123) (0.0, 0.041669614432260346) (0.0, 0.042689837551945595) (0.0, 0.20499982783752452) (0.0, 0.04288368636194724) (0.0, 0.04221837088388215) (2.0, 1.6027581174753738) (0.0, 0.042512077294686) (0.0, 0.2052935342483284) (4.0, 2.8426434782608694) (0.0, 0.12555170421721548) (0.0, 0.04254083484573504) (0.0, 0.04289879216992919) (0.0, 0.04221837088388215) (3.0, 2.4422183708838823) (0.0, 0.041718298223874434) (0.0, 0.04276587748966554) (0.0, 0.041825368307581746) (0.0, 0.042741624951867535)


## Evaluate

In [56]:
sim = ['Aggregation', 'Text', 'Genres', 'Cast', 'Director', 'Audio']
cases = {sim[0]:similarity, sim[1]:text_similarity, sim[2]:genre_similarity, sim[3]:cast_similarity, sim[4]:dir_similarity, sim[5]:audio_similarity}

for case in sim:
    print("Evaluating {}....".format(case))
    threshold = np.percentile(cases[case], 90)
    # y_pred = [box(x, threshold) for x in cases[case]]
    y_pred = cases[case]
    evaluate(y_true, y_pred, k=3)
    print()

Evaluating Aggregation....
Jaccard Score: 0.35372043160824057
MAR@K: 0.0
MAP@K: 0.1357370095440085

Evaluating Text....
Jaccard Score: 0.40584294334801385
MAR@K: 0.10602131005358482
MAP@K: 0.6644279486273115

Evaluating Genres....
Jaccard Score: 0.4609932069761751
MAR@K: 0.0
MAP@K: 0.1357370095440085

Evaluating Cast....
Jaccard Score: 0.33664134785382155
MAR@K: 0.025699497565852502
MAP@K: 0.24867444326617152

Evaluating Director....
Jaccard Score: 0.31024054480577834
MAR@K: 0.0902599476610703
MAP@K: 0.626193001060444

Evaluating Audio....
Jaccard Score: 0.8642629904559915
MAR@K: 0.13569870860313996
MAP@K: 0.7464946388594299

