### Notebook to create learning journeys (recommended videos to watch in order to reach a learning goal) using sentence embeddings and cosine similarity

In [2]:
import json
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

In [3]:
# Load the multilingual sentence-transformer model (paraphrase-multilingual-mpnet-base-v2).
# NOTE(review): `model` is not referenced by the cells visible in this section —
# presumably it is used to encode new queries elsewhere; confirm before removing.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

def get_data(path):
    """Read the merged video CSV and split it into the pieces used downstream.

    Parameters:
        path (str): Location of a CSV file that has the columns
            'String', 'Title' and 'identifier'.

    Returns:
        tuple: ``(merged_df, str_lst, vocab, identifier_vocab, identifier)`` where
            merged_df is the complete DataFrame,
            str_lst is the 'String' column as an array,
            vocab is the 'Title' column as an array,
            identifier_vocab is a dict mapping each title to its identifier,
            identifier is the 'identifier' column as a Series.
    """
    frame = pd.read_csv(path)

    texts = frame['String'].values
    titles = frame['Title'].values
    ids = frame['identifier']

    # Build the title -> identifier lookup table.
    title_to_id = (
        pd.DataFrame({'ID': ids, 'Vocab': titles})
        .set_index('Vocab')['ID']
        .to_dict()
    )
    return frame, texts, titles, title_to_id, ids


# Load the merged video table once; `merged_df` supplies the "Title" and
# "identifier" columns that are passed to get_top_n_videos further down.
merged_df, str_lst, vocab, identifier_vocab, identifier = get_data('data/merged_data_for_AI.csv')


2023-07-12 00:54:54.676880: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-12 00:54:55.126729: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
import pickle


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: unpickling can execute arbitrary code — only load trusted files.
    with open(path, "rb") as pkl_file:
        return pickle.load(pkl_file)


# Learning journeys, keyed by subject.
study_data_we = _load_pickle("data/study_subjects_with_embeddings.pkl")
ll_data_we = _load_pickle("data/lifelong_learning_with_embeddings.pkl")

In [9]:

# Load the pre-computed embedding database in which only the video titles are encoded.
# NOTE: unpickling can execute arbitrary code — only load trusted files.
with open("data/title_encoded.pkl", "rb") as pkl_file:
    title_encoded = pickle.load(pkl_file)

In [10]:
# Read the stored sentence-transformer embeddings and drop the 'Unnamed: 0'
# column (presumably the index written when the CSV was saved — TODO confirm).
video_embeddings = (
    pd.read_csv('data/sentence_transformer_embedding.csv')
    .drop(columns=['Unnamed: 0'])
)
# Plain array with one embedding per row, used as the search database.
video_embeddings_values = video_embeddings.values
print(video_embeddings_values.shape)

(71980, 768)


In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# Function to calculate the cosine similarity between a matrix A and a vector b. Matrix A is the document database and vector b is the search query.
def calculate_cosine_similarity(A, b):
    """Rank the rows of ``A`` by their cosine similarity to the query ``b``.

    Parameters:
        A (array-like): 2-D collection of document embeddings, one row per document.
        b (array-like): Query embedding; it is reshaped to a single row, so it
            must hold as many values as ``A`` has columns.

    Returns:
        numpy.ndarray: Row indices of ``A``, ordered from the most similar
            document to the least similar one.
    """
    # np.asarray (unlike np.array) does not copy an input that already is an
    # ndarray, so the full embedding database is not duplicated on every query.
    documents = np.asarray(A)
    query = np.asarray(b).reshape(1, -1)

    # One query -> one column of scores; flatten it to a 1-D vector.
    similarity_scores = cosine_similarity(documents, query).ravel()

    # argsort is ascending, so reverse it to put the best match first.
    return np.argsort(similarity_scores)[::-1]


In [12]:
def get_top_n_videos(subject,lj_data, database_embeddings, video_titles, video_identifier, n = 3):
    '''
    Print, for every step of a subject's learning journey, the n videos whose
    embeddings have the highest cosine similarity to that step.

            Parameters:
                    subject (str): The name of the subject someone wants to study, e.g. "Informatik".
                    lj_data (dict): Maps each subject to its learning journey. A journey is a dict of
                        steps, and each step provides ["name"]["raw_content"] (the step text) and
                        ["name"]["embedding"] (the embedding of that text).
                    database_embeddings (array-like): 2-D array with one document embedding per row.
                    video_titles (sequence of str): The titles of the videos, aligned row by row
                        with database_embeddings.
                    video_identifier (sequence of str): The ids of the youtube videos, aligned row by
                        row with database_embeddings; used to print the link to each video.
                    n (int): Number of videos that should be recommended per learning step.

            Returns:
                    None. The recommendations are printed.

            Raises:
                    KeyError: If subject is not a key of lj_data.
                    ValueError: If n is negative.
    '''
    if n < 0:
        raise ValueError("n must be non-negative, got %r" % (n,))

    subject_dict = lj_data[subject]

    # The ranking below yields row positions of database_embeddings. Indexing a
    # pandas Series with [] is label-based, so convert once to arrays to make
    # the lookups positional regardless of the Series index.
    titles = np.asarray(video_titles)
    identifiers = np.asarray(video_identifier)

    print("Learning journey for ", subject, " is:")
    for step_key, step in subject_dict.items():
        print(step_key)
        step_name = step["name"]["raw_content"]
        step_embedding = step["name"]["embedding"]
        print("to learn ", step_name, " watch the following videos:")

        ranked_rows = calculate_cosine_similarity(database_embeddings, step_embedding)
        # Honour n (the slice used to be hard-coded to the first three hits).
        for rank, row in enumerate(ranked_rows[:n], start=1):
            url = "https://www.youtube.com/watch?v=" + identifiers[row]
            print("Video %i: %s" % (rank, titles[row]))
            print(url)
        print("____________________________" + "\n")


### The learning journeys are stored in the dictionaries `study_data_we` and `ll_data_we`. Their keys are the subjects for which they contain journeys.

In [13]:
# Subjects that have a learning journey in study_data_we.
study_data_we.keys()

dict_keys(['Betriebswirtschaftslehre', 'Informatik', 'Medizin', 'Maschinenbau', 'Elektrotechnik', 'Psychologie', 'Jura/Rechtswissenschaften', 'Architektur', 'Chemie', 'Biologie', 'Geschichte', 'Soziologie', 'Volkswirtschaftslehre', 'Mathematik', 'Physik', 'Politikwissenschaft', 'Medienwissenschaft', 'Sprachwissenschaft/Linguistik', 'Pädagogik', 'Philosophie', 'Kunstgeschichte', 'Sportwissenschaft', 'Geografie', 'Musikwissenschaft', 'Ethnologie'])

In [14]:
# Subjects that have a learning journey in ll_data_we.
ll_data_we.keys()

dict_keys(['Barista', 'Gärtner', 'Stricken', 'Basteln', 'Malen', 'Kommunizieren', 'Meditation', 'Campingmanagement', 'Scrum', 'Azure DevOps', 'Projektmanagement', 'Immobilien', 'Teamleitung'])

### `title_encoded` and `video_embeddings_values` are both valid embedding databases.
- `title_encoded`: only the titles of the videos are encoded.
- `video_embeddings_values`: the title, uploader and transcript are concatenated, and only the first 128 tokens of this concatenation are encoded.

Set `database` to the variable you want to use as the embedding database.

In [15]:
# Embedding database that is searched for recommendations:
# embeddings of the concatenated title, uploader and transcript.
database = video_embeddings_values
# Alternative: title-only embeddings — uncomment the next line to use them instead.
#database = title_encoded

In [16]:
# Print the learning journey for "Informatik" using the selected embedding database.
get_top_n_videos(
    subject="Informatik",
    lj_data=study_data_we,
    database_embeddings=database,
    video_titles=merged_df["Title"],
    video_identifier=merged_df["identifier"],
)

Learning journey for  Informatik  is:
Grundlagen der Informatik
to learn  Grundlagen der Informatik  watch the following videos:
Video 1: IT Fundamentals - 1.1 - ICT and Computer Systems
https://www.youtube.com/watch?v=mYXuCeawhm8
Video 2: CS-224 Computer Organization Lecture   47
https://www.youtube.com/watch?v=OdKFOqtYEAs
Video 3: 1.  Introduction
https://www.youtube.com/watch?v=6WohmegNYAQ
____________________________

Programmiergrundlagen
to learn  Programmiergrundlagen  watch the following videos:
Video 1: #47 Assertions and Design by Contract, Part-1
https://www.youtube.com/watch?v=cnEhFwo4u5g
Video 2: MANUAL GUIDE i - Inserting a Start Program Fixed Form Sentence
https://www.youtube.com/watch?v=sULtGy0RLS0
Video 3: Programming Paradigms, Assembly, Procedural, Functional & OOP | Ep28
https://www.youtube.com/watch?v=AmS2-9KEeS0
____________________________

Datenstrukturen und Algorithmen
to learn  Datenstrukturen und Algorithmen  watch the following videos:
Video 1: Data Structu