In [10]:
import json
from sentence_transformers import SentenceTransformer


# Load the parsed source data. The encoding is pinned to UTF-8 because the
# content contains non-ASCII characters (German umlauts); without it, open()
# falls back to the platform locale encoding, which is not UTF-8 everywhere.
with open("data/lifelong_learning_parsed.json", "r", encoding="utf-8") as json_file:
    ll_data = json.load(json_file)

with open("data/study_subjects_parsed.json", "r", encoding="utf-8") as json_file:
    study_data = json.load(json_file)

In [11]:
# Sentence-embedding model; add_embeddings calls its encode() method on every skill field.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')


In [34]:
def add_embeddings(data, encoder=None):
    """Attach an embedding to every field of every skill in ``data``.

    Parameters
    ----------
    data : dict
        Mapping of topic key -> entry. Each entry must have a ``"Skills"``
        list of dicts, and every skill dict must contain a ``"name"`` key.
    encoder : object, optional
        Any object exposing ``encode(raw_content)``. When omitted, the
        notebook-level ``model`` is used, so existing one-argument calls
        behave exactly as before.

    Returns
    -------
    dict
        ``{topic: {skill_name: {field: {"raw_content": ..., "embedding": ...}}}}``
        where ``field`` ranges over every key of the skill dict (including
        ``"name"`` itself). Skills that share a name within one topic
        overwrite each other; the last one wins.

    Raises
    ------
    KeyError
        If an entry lacks ``"Skills"`` or a skill lacks ``"name"``.
    """
    if encoder is None:
        # Fall back to the global model so callers that pass only `data` keep working.
        encoder = model

    embedded = {}
    for topic, entry in data.items():
        print(topic)  # progress indicator: one line per topic
        embedded[topic] = {}
        for skill in entry["Skills"]:
            # Field values are handed to the encoder unchanged, one call per field.
            embedded[topic][skill["name"]] = {
                field: {
                    "raw_content": raw_content,
                    "embedding": encoder.encode(raw_content),
                }
                for field, raw_content in skill.items()
            }

    return embedded
                
        

In [35]:
# Embed every field of every lifelong-learning skill; add_embeddings prints each topic key as it is processed.
ll_data_we = add_embeddings(ll_data)

Barista
Gärtner
Stricken
Basteln
Malen
Kommunizieren
Meditation
Campingmanagement
Scrum
Azure DevOps
Projektmanagement
Immobilien
Teamleitung


In [36]:
# Embed every field of every study-subject skill; add_embeddings prints each subject key as it is processed.
study_data_we = add_embeddings(study_data)

Betriebswirtschaftslehre
Informatik
Medizin
Maschinenbau
Elektrotechnik
Psychologie
Jura/Rechtswissenschaften
Architektur
Chemie
Biologie
Geschichte
Soziologie
Volkswirtschaftslehre
Mathematik
Physik
Politikwissenschaft
Medienwissenschaft
Sprachwissenschaft/Linguistik
Pädagogik
Philosophie
Kunstgeschichte
Sportwissenschaft
Geografie
Musikwissenschaft
Ethnologie


In [37]:
# Show the top-level keys (study subjects) of the embedded data.
study_data_we.keys()

dict_keys(['Betriebswirtschaftslehre', 'Informatik', 'Medizin', 'Maschinenbau', 'Elektrotechnik', 'Psychologie', 'Jura/Rechtswissenschaften', 'Architektur', 'Chemie', 'Biologie', 'Geschichte', 'Soziologie', 'Volkswirtschaftslehre', 'Mathematik', 'Physik', 'Politikwissenschaft', 'Medienwissenschaft', 'Sprachwissenschaft/Linguistik', 'Pädagogik', 'Philosophie', 'Kunstgeschichte', 'Sportwissenschaft', 'Geografie', 'Musikwissenschaft', 'Ethnologie'])

In [33]:
# Inspect one Barista skill: each of its fields maps to its raw content and embedding.
ll_data_we["Barista"]["Kaffeebohnen auswählen und mahlen"]

{'name': {'raw_content': 'Kaffeebohnen auswählen und mahlen',
  'embedding': array([-1.57498606e-02, -8.16695467e-02, -1.57980919e-02,  6.01161681e-02,
          3.58368792e-02, -2.45419610e-02, -1.11461312e-01, -6.66545378e-03,
          3.10437996e-02,  1.04398757e-01,  5.13043553e-02,  2.22218111e-01,
         -6.19753450e-03, -1.51367718e-02, -7.61652365e-02, -5.34046553e-02,
          6.68487921e-02,  4.71840091e-02, -9.62478891e-02,  2.91998554e-02,
         -6.61870465e-02,  6.15111599e-03,  4.84740138e-02,  8.74144863e-03,
         -1.72364473e-01, -4.39406969e-02, -1.08735859e-02,  5.16339391e-02,
          1.11528091e-01,  6.53922036e-02,  7.27193058e-02,  1.03658468e-01,
         -9.32022333e-02,  2.81466041e-02,  6.98798224e-02, -1.17267855e-01,
          5.48255071e-03,  4.04218361e-02, -2.63045877e-02, -3.55635732e-02,
          7.64574185e-02,  1.34651229e-01, -1.69605967e-02,  4.54959124e-02,
         -1.43978849e-01,  1.17385060e-01, -9.89141595e-03, -7.01941699e-02,
 

In [38]:
def print_dict_keys(dictionary, indent=''):
    """Print the keys of a nested dict as an indented outline.

    A key whose value is a dict is printed with a trailing colon, followed by
    that dict's keys indented two further spaces. Any other key is printed on
    its own line; its value is never printed.

    Parameters
    ----------
    dictionary : dict
        The (possibly nested) mapping to outline.
    indent : str, optional
        Prefix placed before every key at the current nesting level.
    """
    for name, child in dictionary.items():
        is_branch = isinstance(child, dict)
        print(f"{indent}{name}:" if is_branch else f"{indent}{name}")
        if is_branch:
            # Descend one level, widening the prefix by two spaces.
            print_dict_keys(child, indent + '  ')

In [39]:
# Print the nested key structure of the embedded lifelong-learning data.
print_dict_keys(ll_data_we)

Barista:
  Kaffeebohnen auswählen und mahlen:
    name:
      raw_content
      embedding
    description:
      raw_content
      embedding
    why:
      raw_content
      embedding
    duration:
      raw_content
      embedding
  Espresso-Extraktion:
    name:
      raw_content
      embedding
    description:
      raw_content
      embedding
    why:
      raw_content
      embedding
    duration:
      raw_content
      embedding
  Milch aufschäumen und Latte Art:
    name:
      raw_content
      embedding
    description:
      raw_content
      embedding
    why:
      raw_content
      embedding
    duration:
      raw_content
      embedding
  Verschiedene Kaffeegetränke:
    name:
      raw_content
      embedding
    description:
      raw_content
      embedding
    why:
      raw_content
      embedding
    duration:
      raw_content
      embedding
Gärtner:
  Pflanzenbiologie:
    name:
      raw_content
      embedding
    description:
      raw_content
      embeddi

In [40]:
# The embeddings are NumPy arrays, which are not JSON-serializable,
# so the enriched dictionaries are persisted with pickle instead.
import pickle

# Written in this order: study subjects first, then lifelong learning.
for target_path, payload in (
    ("data/study_subjects_with_embeddings.pkl", study_data_we),
    ("data/lifelong_learning_with_embeddings.pkl", ll_data_we),
):
    with open(target_path, "wb") as pkl_file:
        pickle.dump(payload, pkl_file)