In [2]:
import json
import os
import sys

import numpy as np
import pandas as pd
from tqdm import tqdm

sys.path.append(os.path.abspath(".."))

from config import (
    TRAIN_DATA_PATH,
    TEST_DATA_PATH,
    METADATA_PATH,
    EMBEDDINGS_TITLE_PATH,
    N_COMPONENTS,
    RANDOM_STATE,
    N_RECOMMENDATIONS,
    N_EPOCHS,
    EPSILON,
    RECOMMENDATIONS_PATH,
)
import utils



In [5]:
# importing training data
train_df = pd.read_parquet("../" + TRAIN_DATA_PATH)

# importing test data
test_df = pd.read_parquet("../" + TEST_DATA_PATH)

# importing metadata
meta_df = pd.read_parquet("../" + METADATA_PATH)

In [11]:
meta_df

Unnamed: 0,prd_number,series_title,unique_title,pub_date,episode_duration,genre,branding_channel,mother_channel,category,episode_description,episode_title
0,11031452026,Radiofortællinger,Radiofortællinger: Lykkelige ulykker_11031452026,2020-03-21,1604,-,DR P1,-,Oplysning og kultur,Hvordan er det at se hele ens professionelle l...,Lykkelige ulykker
1,11031452036,Radiofortællinger,Radiofortællinger: Kunsten at vælge et andet m...,2018-08-21,1621,-,DR P1,-,Oplysning og kultur,Alle mennesker må på et eller andet tidspunkt ...,Kunsten at vælge et andet menneske fra
2,11031452116,Radiofortællinger,Radiofortællinger: Ind og ud af troen_11031452116,2024-09-30,1626,Fakta og debat,DR P1,DR P1,Oplysning og kultur,"På trods af sin unge alder, 19 år, har Asta al...",Ind og ud af troen
3,11031452216,Radiofortællinger,Radiofortællinger: Flyskræk_11031452216,2024-06-06,1626,Fakta og debat,DR P1,DR P1,Oplysning og kultur,Er du bange for at flyve? Du er ikke den enest...,Flyskræk
4,11031557262,Radioklassikeren,Radioklassikeren: Monica - En radioklassiker t...,2020-09-06,3416,,,,Oplysning og kultur,Hun begyndte i radioens sprogudsendelser på fr...,Monica - En radioklassiker takker af
...,...,...,...,...,...,...,...,...,...,...,...
11913,19388840392,-,Krop og bevægelse_19388840392,2018-08-21,574,,,,Præsentation og services,"- med Inge Gotved,\nMargit Riis-Vestergaard og...",Krop og bevægelse
11914,19388840393,-,Krop og bevægelse_19388840393,2018-08-22,578,,,,Præsentation og services,"- med Inge Gotved,\nMargit Riis-Vestergaard og...",Krop og bevægelse
11915,19388840394,-,Krop og bevægelse_19388840394,2018-08-23,535,,,,Præsentation og services,"- med Inge Gotved,\nMargit Riis-Vestergaard og...",Krop og bevægelse
11916,19388840395,-,Krop og bevægelse_19388840395,2018-08-24,589,,,,Præsentation og services,"- med Inge Gotved,\nMargit Riis-Vestergaard og...",Krop og bevægelse


In [None]:
test_data = {"ep1": [2,3,4],
             "ep2": [8,4,6]}

utils.format_embedding_dict(test_data)

{'episode': ['ep1', 'ep2'],
 'feature1': [2, 8],
 'feature2': [3, 4],
 'feature3': [4, 6]}

In [13]:
item_features = pd.DataFrame({
    'prd_number': ["101", "102", "103", "104"],
    'feature1': [0.5, 0.45, 0.41, 0.9],
    'feature2': [0.2, 0.25, 0.89, 0.95],
    'feature3': [0.6, 0.6, 0.7, 0.85],
})
title_emb_df = item_features.rename(columns={"episode": "prd_number"})
title_emb_dict = title_emb_df.to_dict(orient="list")
title_emb_dict

{'prd_number': ['101', '102', '103', '104'],
 'feature1': [0.5, 0.45, 0.41, 0.9],
 'feature2': [0.2, 0.25, 0.89, 0.95],
 'feature3': [0.6, 0.6, 0.7, 0.85]}

In [16]:
list1 = ["ko", "kat"]
list2 = [2, 3]

for animal, number in tqdm(zip(list1, list2), total=len(list1)):
    print(f"There are {number} instances of {animal}.")

100%|██████████| 2/2 [00:00<?, ?it/s]

There are 2 instances of ko.
There are 3 instances of kat.





In [13]:
train_items = set(train_df["prd_number"])
print(len(train_items))
test_items = set(test_df["prd_number"])
print(len(test_items))
new_items = test_items.difference(train_items)
print(len(new_items))
all_items = train_items.union(test_items)
print(len(all_items))

10631
8852
1287
11918


In [None]:
sorted(list(all_items))

['11031452026',
 '11031452036',
 '11031452116',
 '11031452216',
 '11031557262',
 '11031557512',
 '11031559154',
 '11031559164',
 '11031559464',
 '11031559524',
 '11031659184',
 '11031659234',
 '11031659264',
 '11031659354',
 '11031659374',
 '11031659384',
 '11031659394',
 '11031659404',
 '11031659414',
 '11031659434',
 '11031659494',
 '11031716337',
 '11031716347',
 '11031716357',
 '11031716367',
 '11031759034',
 '11031759044',
 '11031759064',
 '11031759084',
 '11031759104',
 '11031759124',
 '11031759134',
 '11031759144',
 '11031759164',
 '11031759194',
 '11031759204',
 '11031759224',
 '11031759244',
 '11031759254',
 '11031759264',
 '11031759344',
 '11031759354',
 '11031759364',
 '11031759384',
 '11031759394',
 '11031759404',
 '11031759414',
 '11031759424',
 '11031759434',
 '11031759444',
 '11031759454',
 '11031759464',
 '11031759474',
 '11031759494',
 '11031759504',
 '11031759514',
 '11031759524',
 '11031762001',
 '11031762002',
 '11031762003',
 '11031762004',
 '11031762005',
 '110317

In [21]:
print("test")

test


In [16]:
sorted(train_df['prd_number'].unique().tolist())

['11031452026',
 '11031452036',
 '11031452116',
 '11031452216',
 '11031557262',
 '11031557512',
 '11031559154',
 '11031559164',
 '11031559464',
 '11031559524',
 '11031659184',
 '11031659234',
 '11031659264',
 '11031659354',
 '11031659374',
 '11031659384',
 '11031659394',
 '11031659404',
 '11031659414',
 '11031659434',
 '11031659494',
 '11031716337',
 '11031716347',
 '11031716357',
 '11031716367',
 '11031759034',
 '11031759044',
 '11031759064',
 '11031759084',
 '11031759104',
 '11031759124',
 '11031759134',
 '11031759144',
 '11031759164',
 '11031759194',
 '11031759204',
 '11031759224',
 '11031759244',
 '11031759254',
 '11031759264',
 '11031759344',
 '11031759354',
 '11031759364',
 '11031759384',
 '11031759394',
 '11031759404',
 '11031759414',
 '11031759424',
 '11031759434',
 '11031759444',
 '11031759454',
 '11031759464',
 '11031759474',
 '11031759494',
 '11031759504',
 '11031759514',
 '11031759524',
 '11031762001',
 '11031762002',
 '11031762003',
 '11031762004',
 '11031762005',
 '110317

In [3]:
np.zeros((2,3))

array([[0., 0., 0.],
       [0., 0., 0.]])

In [25]:
title_emb_df = pd.read_parquet("../" + EMBEDDINGS_TITLE_PATH)
# title_emb_df = title_emb_df.rename(columns={"episode": "prd_number"})
# title_emb_dict = title_emb_df.to_dict(orient="list")
# title_emb_dict
one_feature_df = title_emb_df[["episode", "feature1"]]
one_feature_df.to_parquet("../embeddings/one_feature.parquet")

In [17]:
title_emb_df = pd.read_parquet("../" + EMBEDDINGS_TITLE_PATH)
item_matrix = title_emb_df.drop(columns='prd_number').values
item_matrix_csr = csr_matrix(item_matrix)

KeyError: "['prd_number'] not found in axis"