### This notebook is based on the 04_cos_similarity notebook. We use the same logic to find the top-n videos for every learning step of each subject (n = 3 in the calls below) and store their IDs in a list. These videos, together with their transcripts, will be used for benchmarks.

In [1]:
import json
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

In [2]:
# Load the `paraphrase-multilingual-mpnet-base-v2` sentence-transformers model.
# NOTE(review): no cell visible in this notebook references `model`; the
# embeddings used further down are read from pickle files — confirm whether
# this load is still needed.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

def get_data(path):
    """Read the merged video CSV and unpack the columns this notebook works with.

    Parameters:
        path (str): Location of a CSV file containing the columns
            'String', 'Title' and 'identifier'.

    Returns:
        tuple: ``(merged_df, str_lst, vocab, identifier_vocab, identifier)`` where
            merged_df is the complete DataFrame,
            str_lst holds the values of the 'String' column,
            vocab holds the values of the 'Title' column,
            identifier_vocab is a dict mapping each title to its identifier, and
            identifier is the 'identifier' column as a pandas Series.
    """
    frame = pd.read_csv(path)
    texts = frame['String'].values
    titles = frame['Title'].values
    video_ids = frame['identifier']

    # Title -> identifier lookup; a title that occurs several times ends up
    # with a single entry in the dict.
    title_to_id = frame.set_index('Title')['identifier'].to_dict()

    return frame, texts, titles, title_to_id, video_ids


# Load the merged video table once; the frame and its columns are reused by
# the cells further down.
(
    merged_df,
    str_lst,
    vocab,
    identifier_vocab,
    identifier,
) = get_data('data/merged_data_for_AI.csv')


2023-07-12 18:17:06.446126: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-12 18:17:07.915603: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
import pickle


# Load the two learning-journey collections. Both are later handed to
# get_top_n_videos as `lj_data`, which reads
# data[subject][step]["name"]["embedding"] from them.
# NOTE: pickle.load can execute arbitrary code, so only load files that were
# produced by this project.
with open("data/study_subjects_with_embeddings.pkl", "rb") as study_file:
    study_data_we = pickle.load(study_file)


with open("data/lifelong_learning_with_embeddings.pkl", "rb") as lifelong_file:
    ll_data_we = pickle.load(lifelong_file)

In [5]:
# Embeddings that serve as the document matrix for the similarity search below.
# Presumably one row per row of merged_df (the ranking indices are used to
# index merged_df["identifier"]) — TODO confirm.
with open("data/title_encoded.pkl", "rb") as embeddings_file:
    title_encoded = pickle.load(embeddings_file)

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_similarity(A, b):
    """Rank the rows of matrix A by their cosine similarity to vector b.

    A is the document database (one embedding per row) and b is the search
    query. Despite the function name, the ranking is returned, not the
    similarity scores themselves.

    Parameters:
        A (array-like): 2-D collection of document embeddings.
        b (array-like): Embedding of the search query; reshaped to one row.

    Returns:
        numpy.ndarray: Row indices of A, ordered from most to least similar.
    """
    documents = np.array(A)
    query = np.array(b).reshape(1, -1)

    # One similarity value per document, flattened to a 1-D array.
    scores = cosine_similarity(documents, query).flatten()

    # argsort is ascending, so reverse it to put the best match first.
    ascending = np.argsort(scores)
    return ascending[::-1]


In [7]:
def get_top_n_videos(subject, lj_data, database_embeddings, video_titles, video_identifier, n=3):
    '''
    Return the ids of the n videos with the highest cosine similarity for every
    learning step of a subject.

            Parameters:
                    subject (str): The name of the subject someone wants to study, e.g. "Informatik".
                    lj_data (dict): Learning journeys keyed by subject. Each subject maps to a dict of
                            learning steps, and each step provides step["name"]["embedding"].
                    database_embeddings (array-like): Embeddings of the documents, one row per video,
                            in the same order as video_identifier.
                    video_titles (list of str): The titles of the videos. Currently unused; kept so
                            that existing callers do not break.
                    video_identifier (sequence of str or pandas.Series): The ids of the YouTube videos.
                    n (int): Number of videos that should be recommended per learning step.

            Returns:
                    list: Video ids, n per learning step in step order. A video that matches
                    several steps appears several times.
    '''
    # The ranking holds row *positions*. Indexing a pandas Series with [] is
    # label-based, which only coincides with positions for a default RangeIndex,
    # so use .iloc when it is available and plain indexing otherwise.
    by_position = getattr(video_identifier, "iloc", video_identifier)

    ids = []
    for step in lj_data[subject].values():
        step_embedding = step["name"]["embedding"]
        ranking = calculate_cosine_similarity(database_embeddings, step_embedding)
        ids.extend(by_position[position] for position in ranking[:n])
    return ids



In [15]:
# Collect the top-3 video ids per learning step: first for every key of
# study_data_we, then for every key of ll_data_we. Duplicates are kept here.
ids = []

for journeys in (study_data_we, ll_data_we):
    for k in journeys:
        ids.extend(
            get_top_n_videos(k, journeys, title_encoded, merged_df["Title"], merged_df["identifier"], n=3)
        )

In [18]:
# Drop duplicate ids while keeping the order in which they were first found.
# list(set(ids)) also removes duplicates, but string hashes are randomised per
# interpreter start, so the resulting order (and with it the pickled sample)
# would differ from one kernel run to the next.
ids = list(dict.fromkeys(ids))

In [19]:
# Show the de-duplicated list of video ids for a quick visual check.
ids

['0MECOZHEdDg',
 'bTRUI8Kfsew',
 'qDbBiIFhDJQ',
 'scd6_QB-9E4',
 'JaBCHbuI1EI',
 'hdCBGWcd4qw',
 '2d7c3O_2Vx8',
 'AG-rnTlIvgM',
 'Wfo4T97LSHk',
 'e3faQlkUQf4',
 'mkhPsT2NyOU',
 '8GIvBpvufMs',
 'Zq4upTEaQyM',
 'xWBSf4BfKRk',
 '-wlsOJZj6VI',
 'DNHffMznKn4',
 'kx4mQB0QzvQ',
 'sw8Mis8Mogw',
 '19BT4CtOCMk',
 'JDxUqZ65fsU',
 'z7bTl6YzbTA',
 'jqrrNz-E9Is',
 'CPK3sg6YnjI',
 'c6m3v5kTFJ0',
 'Yxg-CyQ7Wts',
 'wb0JrsgDdPc',
 'KPbufkMxr0k',
 'KJu4wEEiBAM',
 'Y2S9D7Fqkz4',
 'AkgXF-N7h-4',
 'ipashDeLAdA',
 '6PxKBQTvbCc',
 'KLDSokWCbFY',
 '_2UjBDHTBLc',
 'zy2Q35IA6ls',
 'LEfU1JWqIpg',
 '1OGFui3_5YU',
 'DiiNu-Wk6UU',
 'AMxgVB4fwJM',
 'TW_B1skghSY',
 '2WJxFgAzWR4',
 'XMRXcl6TVms',
 'cKXgRm5AeLQ',
 '9TycLR0TqFA',
 'c8_avX9miag',
 'av-YFlnQZiQ',
 'M3VA1WiP8Vk',
 '8i9BOYkKUPU',
 'A31CR6eaBBM',
 '0PGC0hQpwuo',
 'HcYgDxgXEJA',
 'LdTUghjsJ40',
 'ciXtgc4bsPg',
 'RpRRUQFbePU',
 'ENOatOPHC8c',
 'y194tutTvFA',
 'sGWpbDI11Ik',
 'KvfUs_jSe8M',
 '9oFvA1uYLXc',
 'aco9HNkt7ZA',
 'q7R_upR81FU',
 'u9HL1VQNllk',
 'MYuh5y

In [28]:
# Persist the selected video ids so that other notebooks can load the sample.
with open("data/video_sample.pkl", "wb+") as sample_file:
    pickle.dump(ids, sample_file)