In [1]:
import json
import os
import sys

import numpy as np
import pandas as pd
from tqdm import tqdm

sys.path.append(os.path.abspath(".."))

from config import (
    FILTERED_DATA_PATH,
    TRAIN_DATA_PATH,
    VAL_DATA_PATH,
    TEST_DATA_PATH,
    METADATA_PATH,
    EMBEDDINGS_TITLE_PATH,
    EMBEDDINGS_DESCR_PATH,
    EMBEDDINGS_COMBI_PATH,
    N_COMPONENTS,
    RANDOM_STATE,
    N_RECOMMENDATIONS,
    N_EPOCHS,
    EPSILON,
    RECOMMENDATIONS_PATH,
    TRANSFORMED_DATA_PATH,
    SPLIT_DATE_TRAIN_VAL,
    SPLIT_DATE_VAL_TEST,
)
import utils



In [2]:
# Every path constant from config is prefixed with "../" before reading
# (presumably because the paths are relative to the parent directory that
# also holds `config` — confirm against config.py).
parent_prefix = "../"

# Train / validation / test splits
train_df = pd.read_parquet(parent_prefix + TRAIN_DATA_PATH)
val_df = pd.read_parquet(parent_prefix + VAL_DATA_PATH)
test_df = pd.read_parquet(parent_prefix + TEST_DATA_PATH)

# Metadata table (merged on "prd_number" further down)
meta_df = pd.read_parquet(parent_prefix + METADATA_PATH)

# Transformed and filtered datasets
transformed_df = pd.read_parquet(parent_prefix + TRANSFORMED_DATA_PATH)
filtered_df = pd.read_parquet(parent_prefix + FILTERED_DATA_PATH)

In [4]:
# Left-join the metadata onto the training rows via "prd_number" so that each
# training row carries its "series_title".
train_w_meta = train_df.merge(meta_df, on="prd_number", how="left")

# Count training rows per series title.
show_counts = (
    train_w_meta
    .groupby("series_title")
    .size()
    .reset_index(name="count")
)

# Keep the titles of the ten series with the highest row counts.
top_10_shows = (
    show_counts
    .nlargest(n=10, columns="count")
    .loc[:, "series_title"]
    .tolist()
)
top_10_shows

['Genstart',
 'Tiden',
 'Djævlen i detaljen',
 'Stjerner og striber',
 'Radioavisen',
 'Sara & Monopolet - podcast',
 'Tyran',
 'Lyssky',
 'Ubegribeligt',
 'Bakspejl']

In [8]:
# For each of the top shows, recommend the earliest-published metadata row
# dated on/after the cutoff; if a show has no such row, fall back to the
# earliest row of that show overall.
# NOTE(review): the cutoff is a literal even though SPLIT_DATE_TRAIN_VAL and
# SPLIT_DATE_VAL_TEST are imported from config — confirm whether one of them
# should be used here instead.
cutoff_date = "2024-11-11"

recommendations = []
for show in top_10_shows:
    belongs_to_show = meta_df["series_title"] == show
    candidates = meta_df[belongs_to_show & (meta_df["pub_date"] >= cutoff_date)]

    # Nothing published on/after the cutoff: consider every row of the show.
    if candidates.empty:
        candidates = meta_df[belongs_to_show]

    # Ascending sort by publication date puts the earliest row first.
    earliest_row = candidates.sort_values(by="pub_date").iloc[0]
    recommendations.append(earliest_row["prd_number"])
recommendations

['11802450461',
 '11802451184',
 '16122492640',
 '11802437046',
 '14842401457',
 '15452448047',
 '16122493792',
 '11802465031',
 '11162405467',
 '11162411030']

In [12]:
# Title embeddings: read the file and discard the "episode" column so that
# only the remaining columns take part in the min/max below.
emb_df = pd.read_parquet("../" + EMBEDDINGS_TITLE_PATH).drop(columns="episode")

# Per-column extremes reduced once more to a single overall min and max.
min_value, max_value = emb_df.min().min(), emb_df.max().max()

print(f"Minimum value: {min_value}")
print(f"Maximum value: {max_value}")

Minimum value: -0.3516335799441893
Maximum value: 0.6239668024905588


In [13]:
# Description embeddings: read the file and discard the "episode" column so
# that only the remaining columns take part in the min/max below.
emb_df = pd.read_parquet("../" + EMBEDDINGS_DESCR_PATH).drop(columns="episode")

# Per-column extremes reduced once more to a single overall min and max.
min_value, max_value = emb_df.min().min(), emb_df.max().max()

print(f"Minimum value: {min_value}")
print(f"Maximum value: {max_value}")

Minimum value: -0.3516335799441893
Maximum value: 0.6149092034145577


In [15]:
# Combined embeddings: read the file and discard the "episode" column so that
# only the remaining columns take part in the min/max below.
emb_df = pd.read_parquet("../" + EMBEDDINGS_COMBI_PATH).drop(columns="episode")

# Per-column extremes reduced once more to a single overall min and max.
min_value, max_value = emb_df.min().min(), emb_df.max().max()

print(f"Minimum value: {min_value}")
print(f"Maximum value: {max_value}")

Minimum value: -0.3516335799441893
Maximum value: 0.6149092034145577
