# Setup

In [1]:
import os
# Raise TensorFlow's native (C++) log threshold so its start-up messages are
# suppressed. It has to be set before TensorFlow is first imported; keras is
# imported further down in this file.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

In [3]:
import numpy as np
import pandas as pd
from ml_metrics import mapk
from recmetrics import mark
from sklearn.metrics import jaccard_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics.pairwise import pairwise_distances 

ModuleNotFoundError: No module named 'ml_metrics'

In [None]:
# User table: pipe-separated, column names supplied here.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    '../Datasets/ml-100k/Text/u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
)

# Ratings table: tab-separated, column names supplied here.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    '../Datasets/ml-100k/Text/u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

# Item table: a regular CSV read with pandas defaults.
items = pd.read_csv("../Datasets/ml-100k/Text/items.csv")

In [None]:
# Quick look at the user table: its (rows, columns) shape, then the first rows.
print("\nUser Data :")
print("shape : ", users.shape)
users.head()

In [None]:
# Quick look at the ratings table: its (rows, columns) shape, then the first rows.
print("\nRatings Data :")
print("shape : ", ratings.shape)
ratings.head()

In [None]:
# Quick look at the item table: its (rows, columns) shape, then the first rows.
print("\nItem Data :")
print("shape : ", items.shape)
items.head()

In [None]:
# Separate train / test rating files, parsed with the same column names as the
# full ratings table.
ratings_train = pd.read_csv(
    '../Datasets/ml-100k/Text/ua.base',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)
ratings_test = pd.read_csv(
    '../Datasets/ml-100k/Text/ua.test',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)
# Cell result: (train shape, test shape).
ratings_train.shape, ratings_test.shape

In [None]:
# Number of distinct user ids and movie ids that occur in the ratings table.
n_users = len(ratings['user_id'].unique())
n_items = len(ratings['movie_id'].unique())

In [None]:
# Dense user x item rating matrix; cells that never receive a rating stay 0.
# Ids are shifted by -1 to become array positions, which assumes user and movie
# ids are the integers 1..n_users and 1..n_items (otherwise this raises an
# IndexError, exactly like the element-wise loop it replaces).
data_matrix = np.zeros((n_users, n_items))
data_matrix[
    ratings['user_id'].to_numpy() - 1,
    ratings['movie_id'].to_numpy() - 1,
] = ratings['rating'].to_numpy()

In [None]:
# Distinct values stored in the rating matrix and how often each occurs
# (0 is the fill value for cells that were never assigned a rating).
np.unique(data_matrix, return_counts=True)

In [None]:
# Audio embedding table: its column labels are matched against "YT-Trailer ID"
# below. The listed 'Unnamed: ...' columns are dropped, then every column that
# contains a missing value.
movie_embeddings = (
    pd.read_csv("../Datasets/ml-100k/Audio/embeddings.csv")
    .drop(
        [
            'Unnamed: 0',
            'Unnamed: 0.1',
            'Unnamed: 0.1.1',
            'Unnamed: 0.1.1.1',
            'Unnamed: 0.1.1.1.1',
            'Unnamed: 0.1.1.1.1.1',
            'Unnamed: 0.1.1.1.1.1.1',
        ],
        axis=1,
    )
    .dropna(axis=1)
)

# Keep only the movies whose trailer id has an embedding column, and apply the
# same selection to the columns of the rating matrix.
# NOTE(review): temp.index holds row labels but is used as positions here; this
# relies on `items` still carrying its default 0..n-1 index - confirm.
temp = items[items["YT-Trailer ID"].isin(movie_embeddings.columns.tolist())]
items = items.iloc[temp.index.tolist()]
data_matrix = data_matrix[:, temp.index.tolist()]

# Collaborative Filtering (User- and Item-Similarity Based)

In [None]:
# pairwise_distances(..., metric='cosine') returns the cosine *distance*
# (1 - cosine similarity). These matrices are used as neighbour weights in
# predict(), so the distance is converted back to a similarity first, as the
# other similarity computations in this file do.
user_similarity = 1 - pairwise_distances(data_matrix, metric='cosine')
item_similarity = 1 - pairwise_distances(data_matrix.T, metric='cosine')

In [None]:
def predict(ratings, similarity, type='user'):
    """Predict a full rating matrix as a similarity-weighted average.

    Parameters
    ----------
    ratings : 2-D numpy array
        Rating matrix; the callers in this file pass users as rows and items
        as columns.
    similarity : 2-D numpy array
        Square weight matrix: one row/column per row of ``ratings`` when
        ``type == 'user'``, one per column of ``ratings`` when
        ``type == 'item'``.
    type : {'user', 'item'}
        Which neighbourhood to use. (The name shadows the builtin but is kept
        because callers pass it by keyword.)

    Returns
    -------
    2-D numpy array with the same shape as ``ratings``.

    Raises
    ------
    ValueError
        If ``type`` is neither 'user' nor 'item'.

    Notes
    -----
    There is no guard against a zero weight total: a row (user mode) or column
    (item mode) of ``similarity`` that sums to 0 produces nan/inf in the
    corresponding predictions.
    """
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        # np.newaxis turns the per-user mean into a column so it broadcasts
        # across that user's row of ratings.
        ratings_diff = ratings - mean_user_rating[:, np.newaxis]
        # pred[u, i] = mean[u] + sum_v s[u, v] * diff[v, i] / sum_v |s[u, v]|
        weight_totals = np.asarray(np.abs(similarity).sum(axis=1))[:, np.newaxis]
        return mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / weight_totals
    if type == 'item':
        # pred[u, j] = sum_i r[u, i] * s[i, j] / sum_i |s[i, j]|
        # The normaliser for output column j is the sum over column j of the
        # similarity matrix (axis=0). For a symmetric matrix this equals the
        # row sum; for an asymmetric one only the column sum keeps the result
        # a proper weighted average.
        weight_totals = np.asarray(np.abs(similarity).sum(axis=0))[np.newaxis, :]
        return ratings.dot(similarity) / weight_totals
    raise ValueError("type must be 'user' or 'item', got {!r}".format(type))

In [None]:
# Predicted rating matrices from the user-user and the item-item weights.
user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='item')

In [None]:
# Sanity check: dimensions of the user-based prediction matrix.
user_prediction.shape

## Evaluate

In [None]:
def box(array, percentile):
    """Binarise ``array`` against its own ``percentile``-th percentile.

    The percentile is computed over all elements of ``array`` (no axis is
    given to ``np.percentile``). Entries strictly greater than that value
    become 1; all others become 0.
    """
    cutoff = np.percentile(array, percentile)
    above_cutoff = array > cutoff
    return np.where(above_cutoff, 1, 0)

In [None]:
def evaluate(y_true, y_pred, k = 3, percentile = 90, data_matrix = data_matrix):
    """Print mean per-user Jaccard, MAR@K and MAP@K for a prediction matrix.

    Parameters
    ----------
    y_true, y_pred : 2-D arrays of identical shape
        Actual and predicted scores; row ``j`` is compared with row ``j`` of
        ``data_matrix``.
    k : int
        Cut-off passed to ``mark`` and ``mapk``.
    percentile : number
        Percentile handed to ``box`` to binarise ``y_true`` and ``y_pred``
        (each against its own distribution over the whole matrix).
    data_matrix : 2-D array
        Only its non-zero cells are evaluated. The default is the module-level
        ``data_matrix`` as it was bound when this function was defined.

    For every row, restricted to the columns where ``data_matrix`` is
    non-zero:
      * Jaccard score between the binarised true and predicted flags;
      * MAR@K / MAP@K, where the *relevant* items are the column indices whose
        binarised true flag is 1 and the *ranking* is those column indices
        ordered by predicted score, highest first.

    The ranking metrics compare item identifiers by membership, so they are
    given column indices here rather than raw rating / score values.

    Returns None; the three means are printed.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    y_true_box = box(y_true, percentile)
    y_pred_box = box(y_pred, percentile)

    score = []
    for j, user_row in enumerate(np.asarray(data_matrix)):
        # Column indices this user has a non-zero entry for.
        include = np.flatnonzero(user_row)
        true_flags = y_true_box[j, include]
        pred_flags = y_pred_box[j, include]
        # Item ids (column indices) that count as relevant for this user.
        relevant = include[true_flags == 1].tolist()
        # Same candidate items, best predicted score first; a stable sort
        # keeps the column order for tied scores.
        best_first = np.argsort(-y_pred[j, include], kind="stable")
        ranked = include[best_first].tolist()
        score.append([jaccard_score(true_flags.tolist(), pred_flags.tolist(), average="micro"),
                      mark([relevant], [ranked], k = k),
                      mapk([relevant], [ranked], k = k)])
    print("Jaccard Score: {}\nMAR@K: {}\nMAP@K: {}".format(*np.mean(score, axis=0)))

In [None]:
# Ground truth is the rating matrix itself; start with the user-based predictions.
y_true = data_matrix
y_pred = user_prediction

In [None]:
# Prints the mean Jaccard / MAR@K / MAP@K with the function's default k and percentile.
evaluate(y_true, y_pred)

# Textual Summaries

In [None]:
# Text of the 'Summary' column for the items kept after the embedding filter.
data_text = items['Summary']

In [None]:
import keras
from utils.autoencoder import Autoencoder_Text

# Build the project's text autoencoder from the summaries. Training is left
# disabled (commented out below); a saved model is loaded afterwards instead.
AE = Autoencoder_Text(data_text)
# AE.train(data_text, epochs = 10, batch_size = 100)

In [None]:
# Replace the autoencoder's full model with the saved one.
# NOTE(review): only `enc_dec_model` is reassigned, while the embeddings below
# come from `AE.encoder` - confirm the encoder actually shares the loaded weights.
AE.enc_dec_model = keras.models.load_model('./pretrained/text_model')

In [None]:
# Pre-process the summaries and drop the third axis of the result (np.squeeze
# with axis=2 requires that axis to have length 1), then encode every summary.
inputs = np.squeeze(AE.pre_process(data_text), axis = 2)
movie_embeddings = AE.encoder.predict(inputs)

In [None]:
# user_embeddings = np.dot(data_matrix, movie_embeddings)/np.array([np.sum(data_matrix, axis = 1)]).T

In [None]:
# p1=np.sqrt(np.sum(user_embeddings**2,axis=1))[:,np.newaxis]
# p2=np.sqrt(np.sum(movie_embeddings**2,axis=1))[np.newaxis,:]
# text_similarity = np.dot(user_embeddings, movie_embeddings.T) / (p1*p2)

In [None]:
# Movie-movie similarity from the text embeddings: 1 - Hamming distance, i.e. the
# fraction of embedding coordinates that are exactly equal.
# NOTE(review): confirm the encoder output is discrete; for continuous values
# exact coordinate matches would be rare.
movie_similarity = 1 - pairwise_distances(movie_embeddings, metric='hamming')
# Despite its name, text_similarity holds predicted ratings (users x movies).
text_similarity = predict(data_matrix, movie_similarity, type='item')

In [None]:
# Score the text-based predictions against the rating matrix.
y_pred = text_similarity
evaluate(y_true, y_pred)

# Meta Data

## Genres

In [None]:
# Column slice of the item table, from position 9 up to (not including) the last
# seven columns - presumably the genre indicator columns; TODO confirm against
# the items.csv header.
movie_embeddings = items.iloc[:, 9:-7]

In [None]:
# Per-user profile: rating-weighted sum of the movie vectors divided by the sum of
# that user's ratings. A user whose ratings sum to 0 yields a division by zero.
# NOTE(review): user_embeddings is not read by any active code in the visible
# file (only by commented-out lines).
user_embeddings = np.dot(data_matrix, movie_embeddings)/np.array([np.sum(data_matrix, axis = 1)]).T

In [None]:
# Movie-movie similarity: share of the selected item columns on which two movies agree
# (1 - Hamming distance).
movie_similarity = 1 - pairwise_distances(movie_embeddings, metric='hamming')
# Despite its name, genre_similarity holds predicted ratings (users x movies).
genre_similarity = predict(data_matrix, movie_similarity, type='item')

In [None]:
# Sanity check: dimensions of the genre-based prediction matrix.
genre_similarity.shape

In [None]:
# Score the genre-based predictions against the rating matrix.
y_pred = genre_similarity
evaluate(y_true, y_pred)

In [None]:
# Spot check: (actual, predicted) pairs for row 1 (the second user), first 20 columns.
print(*zip(y_true[1][:20], y_pred[1][:20]))

# Crew

### Cast

In [None]:
def jaccard_similarity(list1, list2):
    """Return the Jaccard index of two collections.

    Both arguments are converted to sets, so duplicates and ordering are
    ignored. The result is ``len(intersection) / len(union)`` as a float in
    the range 0.0 to 1.0.

    When both collections are empty the union is empty as well; 0.0 is
    returned in that case instead of raising ZeroDivisionError.
    """
    s1 = set(list1)
    s2 = set(list2)
    union_size = len(s1 | s2)
    if union_size == 0:
        # Nothing to compare: treat two empty collections as not similar.
        return 0.0
    return len(s1 & s2) / union_size

In [None]:
def get_cast(text):
    """Split a '|'-delimited field into a list of its parts.

    The value is passed through ``str()`` first, so a non-string input is
    split as well (a float NaN, for instance, comes back as ``['nan']``).
    """
    raw = str(text)
    return raw.split("|")

In [None]:
# One list of '|'-separated entries per movie, taken from the 'Cast' column.
data_cast = items['Cast'].apply(get_cast)

In [None]:
# Jaccard similarity between the cast lists of every ordered pair of movies,
# collected into a square array (one row and one column per movie).
movie_similarity = np.array(
    [
        [jaccard_similarity(cast_a, cast_b) for cast_b in data_cast]
        for cast_a in data_cast
    ]
)

In [None]:
# Despite its name, cast_similarity holds predicted ratings (users x movies)
# derived from the cast-overlap weights.
cast_similarity = predict(data_matrix, movie_similarity, type='item')

In [None]:
# Score the cast-based predictions against the rating matrix.
y_pred = cast_similarity
evaluate(y_true, y_pred)

In [None]:
# Spot check: (actual, predicted) pairs for row 1 (the second user), first 20 columns.
print(*zip(y_true[1][:20], y_pred[1][:20]))

### Director

In [None]:
# Raw 'Director' column; values are compared for exact equality below.
data_dir = items['Director']

In [None]:
# Square 0/1 array: entry (a, b) is 1 exactly when movies a and b have equal
# 'Director' values, 0 otherwise.
movie_similarity = np.array(
    [
        [int(director_a == director_b) for director_b in data_dir]
        for director_a in data_dir
    ]
)

In [None]:
# Predicted ratings from the same-director weights, with NaN replaced by 0
# (np.nan_to_num also maps +/-inf to large finite numbers).
# NOTE(review): NaN would come from a movie whose weight total in predict() is 0 -
# presumably a missing director value, which never compares equal to itself; confirm.
dir_similarity = np.nan_to_num( predict(data_matrix, movie_similarity, type='item') )

In [None]:
# Score the director-based predictions against the rating matrix.
y_pred = dir_similarity
evaluate(y_true, y_pred)

In [None]:
# Spot check: (actual, predicted) pairs for row 1 (the second user), first 20 columns.
print(*zip(y_true[1][:20], y_pred[1][:20]))

# Audio Features

In [None]:
# Reload the audio embedding table: drop the listed 'Unnamed: ...' columns, then
# every column that contains a missing value.
movie_embeddings = (
    pd.read_csv("../Datasets/ml-100k/Audio/embeddings.csv")
    .drop(
        [
            'Unnamed: 0',
            'Unnamed: 0.1',
            'Unnamed: 0.1.1',
            'Unnamed: 0.1.1.1',
            'Unnamed: 0.1.1.1.1',
            'Unnamed: 0.1.1.1.1.1',
            'Unnamed: 0.1.1.1.1.1.1',
        ],
        axis=1,
    )
    .dropna(axis=1)
)

In [None]:
# temp = items[items["YT-Trailer ID"].isin(list(movie_embeddings.columns))]

In [None]:
# data_matrix_ = data_matrix[:, list(temp.index)]

In [None]:
# Select the embedding columns in the order of the rows of `items`, which is also
# the column order of data_matrix (both were filtered with the same index when
# the embeddings were first loaded). Without this, row/column k of the
# similarity matrix follows the CSV's column order and is not guaranteed to
# describe the same movie as rating column k.
# Each embedding column is one movie, hence the transpose before comparing.
movie_similarity = 1 - pairwise_distances(
    movie_embeddings[items["YT-Trailer ID"].tolist()].T,
    metric='hamming',
)
# Despite its name, audio_similarity holds predicted ratings (users x movies).
audio_similarity = predict(data_matrix, movie_similarity, type='item')

In [None]:
# Score the audio-based predictions; this call uses k=2 whereas the other
# evaluate() calls in this file use k=3 (the default).
y_pred = audio_similarity
evaluate(y_true, y_pred, data_matrix = data_matrix, k=2)

In [None]:
# Spot check: (actual, predicted) pairs for row 1 (the second user), first 20 columns.
print(*zip(y_true[1][:20], y_pred[1][:20]))

# Aggregation

In [None]:
# Element-wise, equally weighted mean of the five prediction matrices.
# NOTE(review): only dir_similarity was passed through nan_to_num; a NaN in any of
# the other four would carry through into the mean - confirm they are NaN-free.
similarity = np.mean([text_similarity, genre_similarity, cast_similarity, dir_similarity, audio_similarity], axis = 0)

In [None]:
# Score the aggregated predictions against the rating matrix.
y_pred = similarity
evaluate(y_true, y_pred)

In [None]:
# Spot check: (actual, predicted) pairs for row 1 (the second user), first 20 columns.
print(*zip(y_true[1][:20], y_pred[1][:20]))

## Evaluate

In [None]:
# Side-by-side comparison: evaluate the aggregate and each individual signal with
# the same settings (k=3, default percentile).
sim = ['Aggregation', 'Text', 'Genres', 'Cast', 'Director', 'Audio']
cases = dict(zip(sim, [similarity, text_similarity, genre_similarity,
                       cast_similarity, dir_similarity, audio_similarity]))

# dicts keep insertion order, so the cases are reported in the order of `sim`.
for case, y_pred in cases.items():
    print("Evaluating {}....".format(case))
    evaluate(y_true, y_pred, k=3)
    print()