In [19]:
from collections import defaultdict
import json
import os
import sys

import numpy as np
import pandas as pd
from tqdm import tqdm

sys.path.append(os.path.abspath(".."))

from config import (
    FILTERED_DATA_PATH,
    TRAIN_DATA_PATH,
    VAL_DATA_PATH,
    TEST_DATA_PATH,
    METADATA_PATH,
    EMBEDDINGS_TITLE_PATH,
    EMBEDDINGS_DESCR_PATH,
    EMBEDDINGS_COMBI_PATH,
    RANDOM_STATE,
    N_RECOMMENDATIONS,
    N_EPOCHS,
    RECOMMENDATIONS_PATH,
    TRANSFORMED_DATA_PATH,
    SPLIT_DATE_TRAIN_VAL,
    SPLIT_DATE_VAL_TEST,
)
import utils.utils as utils

In [2]:
# Load every parquet input used by this notebook. All paths come from
# config and are resolved relative to the parent directory.

# Interaction splits: train / validation / test.
train_df = pd.read_parquet(f"../{TRAIN_DATA_PATH}")
val_df = pd.read_parquet(f"../{VAL_DATA_PATH}")
test_df = pd.read_parquet(f"../{TEST_DATA_PATH}")

# Metadata table (joined on prd_number further down).
meta_df = pd.read_parquet(f"../{METADATA_PATH}")

# Transformed and filtered versions of the data.
transformed_df = pd.read_parquet(f"../{TRANSFORMED_DATA_PATH}")
filtered_df = pd.read_parquet(f"../{FILTERED_DATA_PATH}")

# Title embeddings.
title_embeddings = pd.read_parquet(f"../{EMBEDDINGS_TITLE_PATH}")

In [6]:
# Attach metadata to the training interactions via a left join on
# prd_number, so every row of train_df is kept.
train_w_meta = train_df.merge(meta_df, how="left", on="prd_number")

# Display the distinct prd_number values in ascending order.
sorted(train_w_meta["prd_number"].drop_duplicates().tolist())

['11031452026',
 '11031452036',
 '11031452116',
 '11031452216',
 '11031557092',
 '11031557102',
 '11031557142',
 '11031557162',
 '11031557242',
 '11031557322',
 '11031557332',
 '11031557342',
 '11031557362',
 '11031557372',
 '11031557402',
 '11031557432',
 '11031557442',
 '11031557472',
 '11031557482',
 '11031557502',
 '11031557512',
 '11031559154',
 '11031559164',
 '11031559464',
 '11031559524',
 '11031659184',
 '11031659234',
 '11031659264',
 '11031659354',
 '11031659374',
 '11031659384',
 '11031659394',
 '11031659404',
 '11031659414',
 '11031659434',
 '11031659494',
 '11031711096',
 '11031711106',
 '11031711116',
 '11031711176',
 '11031711436',
 '11031711600',
 '11031716337',
 '11031716347',
 '11031716357',
 '11031716367',
 '11031759034',
 '11031759044',
 '11031759054',
 '11031759064',
 '11031759074',
 '11031759084',
 '11031759104',
 '11031759124',
 '11031759134',
 '11031759144',
 '11031759164',
 '11031759194',
 '11031759204',
 '11031759224',
 '11031759244',
 '11031759254',
 '110317

In [16]:
# Order the merged training rows by pub_date (ascending is the default).
# NOTE(review): the rendered output shows "-" placeholder values in pub_date
# sorting ahead of real dates -- confirm that this is intended.
train_w_meta_sorted = train_w_meta.sort_values("pub_date")
train_w_meta_sorted

Unnamed: 0,user_id,prd_number,completion_rate,series_title,unique_title,pub_date,episode_duration,genre,branding_channel,mother_channel,category,episode_description,episode_title
89981,12c913493f00b33ca6d90c4a7e59670b4cb47ce3cf6f66...,14602411366,1.000000,P4 Fyn regionale nyheder,Regionale nyheder_14602411366,-,120,Nyheder,P4 Fyn,P4 Fyn,Nyheder,Hør seneste regionale nyheder fra P4 Fyn.,Regionale nyheder
1024775,d681d65761c11fb75956b9ff2758522bb5b91718f857c1...,14102413361,0.950000,P4 Nordjylland regionale nyheder,Regionale nyheder_14102413361,-,180,Nyheder,P4 Nordjylland,P4 Nordjylland,Nyheder,Hør seneste regionale nyheder fra P4 Nordjylla...,Regionale nyheder
187173,26ffa6e7673b25c207ce6d5a023624b487f097a31b2e60...,14702419401,0.983333,P4 Sjælland regionale nyheder,Regionale nyheder_14702419401,-,180,Nyheder,P4 Sjælland,P4 Sjælland,Nyheder,Hør seneste regionale nyheder fra P4 Sjælland.,Regionale nyheder
187170,26ffa6e7673b25c207ce6d5a023624b487f097a31b2e60...,14702417401,0.955556,P4 Sjælland regionale nyheder,Regionale nyheder_14702417401,-,180,Nyheder,P4 Sjælland,P4 Sjælland,Nyheder,Hør seneste regionale nyheder fra P4 Sjælland.,Regionale nyheder
187167,26ffa6e7673b25c207ce6d5a023624b487f097a31b2e60...,14702416405,0.933333,P4 Sjælland regionale nyheder,Regionale nyheder_14702416405,-,120,Nyheder,P4 Sjælland,P4 Sjælland,Nyheder,Hør seneste regionale nyheder fra P4 Sjælland.,Regionale nyheder
...,...,...,...,...,...,...,...,...,...,...,...,...,...
160571,216d97a48925e6191af1bc0804b3e243a42e3c60451837...,11802450427,0.992206,Genstart,Genstart: Jehovas domstol_11802450427,2024-10-20,1668,Aktualitet,DR Lyd,-,Nyheder,"""Fik du en udløsning?"" som helt ung bliver en ...",Jehovas domstol
645887,8661728516c0426717832ca6c572639c04e4217de9b24f...,11802450427,0.999400,Genstart,Genstart: Jehovas domstol_11802450427,2024-10-20,1668,Aktualitet,DR Lyd,-,Nyheder,"""Fik du en udløsning?"" som helt ung bliver en ...",Jehovas domstol
1173590,f568102ad7a95797e0bec7f6c3c19904a81448a45652ad...,11042406427,0.093305,Pilgrim - på rejse i troens univers,Pilgrim i Søhøjlandet - på kant af tiden_11042...,2024-10-20,3301,Kultur,DR P1,DR P1,Oplysning og kultur,"Når Gud ikke virker. Hvad gør vi, når ord om G...",Pilgrim i Søhøjlandet - på kant af tiden
236697,3159c32db96b3c6016391f2f26b41fe1931ea3b842e840...,11031711106,1.000000,Klog på Sprog,Klog på Sprog: Sex og metaforer_11031711106,2024-11-17,2707,-,DR P1,DR P1,Oplysning og kultur,Hassan Preisler og Anna Grue er gæster i denne...,Sex og metaforer


In [22]:
# Nested mapping: user_id -> series_title -> list of prd_number values.
# Each list follows the row order of train_w_meta_sorted (sorted by pub_date
# in an earlier cell).
user_show_episodes_train = defaultdict(lambda: defaultdict(list))

# Walk the three needed columns in lockstep instead of DataFrame.iterrows():
# iterrows builds a full Series for every row, which is very slow on a frame
# of this size, while zipping the columns produces the same mapping.
for user_id, series_title, prd_number in zip(
    train_w_meta_sorted["user_id"],
    train_w_meta_sorted["series_title"],
    train_w_meta_sorted["prd_number"],
):
    user_show_episodes_train[user_id][series_title].append(prd_number)

In [34]:
# Scratch check: iterating dict.values() yields the values in insertion order.
my_dict = dict(DK=6, SE=8)
for val in my_dict.values():
    print(val)

6
8


In [29]:
# Scratch check: set difference keeps the elements of set_1 that are not in set_2.
set_1 = set(["pungrotte", "abe", "ål"])
set_2 = set(["ål"])
set_1.difference(set_2)

{'abe', 'pungrotte'}