In [141]:
import pandas as pd
import numpy as np

from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import MinMaxScaler
from ast import literal_eval
from scipy.spatial import distance

In [142]:
df_transformed = pd.read_csv("../data/ted_talks_transformed.csv")

In [143]:
viewed = [367, 844, 1449]

In [144]:
def cum_dist(dist_matrix):
    return np.mean([dist_matrix[i] for i in viewed], axis=0)

# Descriptions

In [145]:
desc_embeds = np.load("../data/embeddings/desc_embeddings.npy")

desc_dist = pairwise_distances(desc_embeds, metric="cosine")
np.fill_diagonal(desc_dist, np.max(desc_dist))

In [146]:
desc_cum_dist = cum_dist(desc_dist)
desc_cum_dist

array([0.32796997, 0.49536842, 0.31775838, ..., 0.45223674, 0.4681971 ,
       0.51917017], dtype=float32)

# About speaker

In [147]:
speak_embeds = np.load("../data/embeddings/speak_embeddings.npy")

speak_dist = pairwise_distances(speak_embeds, metric="cosine")
np.fill_diagonal(speak_dist, np.max(speak_dist))

In [148]:
speak_cum_dist = cum_dist(speak_dist)
speak_cum_dist

array([0.32948244, 0.39329743, 0.44754088, ..., 1.1051987 , 1.1051987 ,
       1.1051987 ], dtype=float32)

# General infos

In [149]:
scaler  = MinMaxScaler()

In [150]:
def scale_data(column):
    reshaped_values = df_transformed[column].values.reshape(-1, 1)
    return scaler.fit_transform(reshaped_values).T[0]

In [151]:
gen_infos = np.array([
    scale_data("views"),
    scale_data("comments"),
    scale_data("duration"),
    scale_data("discussion_rate")
]).T

In [152]:
gen_dist = pairwise_distances(gen_infos, metric="euclidean")
np.fill_diagonal(gen_dist, np.max(gen_dist))

In [153]:
gen_cum_dist = cum_dist(gen_dist)
gen_cum_dist

array([0.51707142, 0.53921797, 0.52560978, ..., 0.61999207, 0.63470471,
       0.63162485])

# Related distance

In [41]:
topic_dist = np.load("../data/embeddings/related_distance.npy")
topic_dist

array([[  0, 105, 109, ..., 122,  52, 495],
       [ 85,   0, 460, ..., 148, 886, 638],
       [158, 151,   0, ..., 656,   7, 515],
       ...,
       [ 24,  27, 121, ...,   0, 603, 838],
       [ 89,  44, 464, ..., 152,   0, 792],
       [207,   7, 183, ..., 846, 595,   0]])

In [155]:
topic_cum_dist = cum_dist(topic_dist)
topic_cum_dist

array([ 60. ,  31.5, 282.5, ..., 422.5, 352.5, 554. ])

# Sentiments

In [156]:
sent_cum_dist = np.mean([abs(df_transformed.sentiment[i] - df_transformed.sentiment) for i in viewed], axis=0)
sent_cum_dist

array([0.05232382, 0.03262515, 0.05669726, ..., 0.07312248, 0.03262515,
       0.03351366])

# Dates

In [157]:
date_cum_dist = np.mean([abs(df_transformed.recorded_date[i] - df_transformed.recorded_date) for i in viewed], axis=0)
date_cum_dist

array([3.649968e+08, 3.650832e+08, 3.846960e+08, ..., 8.212320e+07,
       8.220960e+07, 8.238240e+07])

# Topics

In [158]:
def boolean_df(item_lists, unique_items):
    bool_dict = {}
    
    for item in unique_items:
        bool_dict[item] = item_lists.apply(lambda x: item in x)
    
    return pd.DataFrame(bool_dict)

In [159]:
df_transformed.topics = df_transformed.topics.apply(lambda x: literal_eval(x))
one_hot_topics = boolean_df(df_transformed.topics, df_transformed.topics.explode().unique()).astype(int)

In [160]:
jacc_topic_dist = distance.squareform(distance.pdist(one_hot_topics,"jaccard"))

In [161]:
np.fill_diagonal(jacc_topic_dist, np.max(jacc_topic_dist))
jacc_topic_cum_dist = cum_dist(jacc_topic_dist)

# Combining distances

## Scale distances

In [162]:
def scale_distances(dist_matrix):
    reshaped_values = dist_matrix.reshape(-1, 1)
    return scaler.fit_transform(reshaped_values).T[0]

In [163]:
combined_dist_matrix = np.array([
    scale_distances(dist_matrix) for dist_matrix in [
        desc_cum_dist,
        speak_cum_dist,
        gen_cum_dist,
        topic_cum_dist,
        sent_cum_dist,
        date_cum_dist,
        jacc_topic_cum_dist
    ]
])

In [164]:
combined_dist_matrix.shape

(7, 3957)

## Weights

In [52]:
weights = {
    "description": 1,
    "about_speaker": 1,
    "general_infos": 1,
    "sentiment": 1,
    "related_distance": 1,
    "dates": 1,
    "topics_names": 1

}

weights = np.array(list(weights.values())).reshape(-1, 1)

## Recommend talks

In [166]:
global_distance = np.mean(weights * combined_dist_matrix, axis=0)

In [ ]:
np.argsort(global_distance)[:5]

array([3953,  367,  594, 3950, 2555])