In [12]:
import json
import os
import sys

import numpy as np
import pandas as pd
from tqdm import tqdm

sys.path.append(os.path.abspath(".."))

from config import (
    FILTERED_DATA_PATH,
    TRAIN_DATA_PATH,
    TEST_DATA_PATH,
    METADATA_PATH,
    EMBEDDINGS_TITLE_PATH,
    EMBEDDINGS_DESCR_PATH,
    EMBEDDINGS_COMBI_PATH,
    N_COMPONENTS,
    RANDOM_STATE,
    N_RECOMMENDATIONS,
    N_EPOCHS,
    EPSILON,
    RECOMMENDATIONS_PATH,
    TRANSFORMED_DATA_PATH,
)
import utils

In [13]:
# Load every prepared parquet file in one pass. The paths from config are
# resolved against the parent directory of this notebook ("../").
train_df, test_df, meta_df, transformed_df, filtered_df = (
    pd.read_parquet(f"../{relative_path}")
    for relative_path in (
        TRAIN_DATA_PATH,        # training data
        TEST_DATA_PATH,         # test data
        METADATA_PATH,          # metadata
        TRANSFORMED_DATA_PATH,  # transformed data
        FILTERED_DATA_PATH,     # filtered data
    )
)

In [15]:
# Show the rows whose date_time is strictly earlier than 2024-11-11 00:00:00.
filtered_df.loc[filtered_df["date_time"].lt("2024-11-11 00:00:00")]

Unnamed: 0,user_id,prd_number,date_time,series_title,unique_title,platform,device_type,pub_date,episode_duration,genre,branding_channel,mother_channel,category,content_time_spent
0,000065a7ec329b0fc01a779ead0e8d38d987b070300113...,11032421443,2024-11-01 08:56:00,Brinkmanns briks,Brinkmanns briks: Vi skal tale om pillerne_110...,web,Other,2024-10-30,3422.0,Fakta og debat,DR P1,DR P1,Oplysning og kultur,3423
1,000065a7ec329b0fc01a779ead0e8d38d987b070300113...,11032422442,2024-11-01 11:19:00,Hjernekassen på P1,Hjernekassen på P1: Forebyggelse_11032422442,web,Other,2024-10-29,3363.0,Fakta og debat,DR P1,DR P1,Oplysning og kultur,359
2,000065a7ec329b0fc01a779ead0e8d38d987b070300113...,11162405447,2024-11-01 09:53:00,Ubegribeligt,Ubegribeligt: Vand_11162405447,web,Other,2024-10-31,3417.0,Fakta og debat,DR P1,DR P1,Aktualitet og debat,5160
3,000065a7ec329b0fc01a779ead0e8d38d987b070300113...,11802437044,2024-11-01 08:40:00,Stjerner og striber,"Stjerner og striber: Joken, der ikke vil dø_11...",web,Other,2024-11-01,2847.0,Aktualitet,DR P1,-,Nyheder,2847
4,000065a7ec329b0fc01a779ead0e8d38d987b070300113...,11802451178,2024-11-01 09:27:00,Tiden,"Tiden: Skraldemanden Trump, spansk oversvømmel...",web,Other,2024-11-01,947.0,Nyheder,DR Lyd,-,Nyheder,610
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2951490,fffe6e1be686be369dfe0943282e377f7d1a51c0eae35a...,16122413392,2024-09-24 12:52:00,Guld og grønne skove,Guld og grønne skove: Døde batterier_16122413392,app,Mobile Phone,2024-09-24,3247.0,Fakta og debat,DR P1,DR P1,Oplysning og kultur,3246
2951492,fffe6e1be686be369dfe0943282e377f7d1a51c0eae35a...,16122415366,2024-09-07 14:55:00,Verdens bedste film,Verdens bedste film: Freaks med Michael Noer_1...,app,Mobile Phone,2024-09-06,3415.0,Kultur,DR Lyd,DR P1,Oplysning og kultur,366
2951494,fffe6e1be686be369dfe0943282e377f7d1a51c0eae35a...,16122494406,2024-09-09 20:43:00,Gift ved første blik podcasten,Gift ved første blik podcasten: Et frø der spi...,app,Mobile Phone,2024-09-05,3417.0,Livsstil,DR Lyd,-,Oplysning og kultur,3408
2951495,fffe8620003aa2cac24d0864dc477f4603180281a2e1d9...,13332410434,2024-10-27 09:28:00,Go' Morgen P3,Go' Morgen P3_13332410434,web,Other,2024-10-24,6000.0,Unge,P3,P3,Aktualitet og debat,367


In [9]:
# Scratch check: set.intersection accepts any iterable, so a pandas Series can
# be passed directly without converting it to a set first.
#
# The toy frame is deliberately NOT named `test_df`: that name holds the real
# test split read from TEST_DATA_PATH and is concatenated with `train_df` in a
# later cell, so rebinding it here would silently swap the real data for this
# toy frame on a top-to-bottom run. pandas is already imported in the first
# cell, so no import is repeated here.
test_dict = {"user1": [1, 2, 3, 4, 5, 6],
             "user2": [9, 6, 4, 2, 3, 5]}
toy_df = pd.DataFrame(test_dict)

test_set = {1, 2, 3}
test_set.intersection(toy_df["user1"])

{1, 2, 3}

In [29]:
# loading embeddings
emb_df = pd.read_parquet("../" + EMBEDDINGS_TITLE_PATH)
emb_df.drop(columns="episode", inplace=True)
min_value = emb_df.min().min()
max_value = emb_df.max().max()

print(f"Minimum value: {min_value}")
print(f"Maximum value: {max_value}")

Minimum value: -1.4321266412734985
Maximum value: 2.9307823181152344


In [30]:
# loading embeddings
emb_df = pd.read_parquet("../" + EMBEDDINGS_DESCR_PATH)
emb_df.drop(columns="episode", inplace=True)
min_value = emb_df.min().min()
max_value = emb_df.max().max()

print(f"Minimum value: {min_value}")
print(f"Maximum value: {max_value}")

Minimum value: -1.4321266412734985
Maximum value: 2.5043904781341553


In [31]:
# loading embeddings
emb_df = pd.read_parquet("../" + EMBEDDINGS_COMBI_PATH)
emb_df.drop(columns="episode", inplace=True)
min_value = emb_df.min().min()
max_value = emb_df.max().max()

print(f"Minimum value: {min_value}")
print(f"Maximum value: {max_value}")

Minimum value: -1.4321266412734985
Maximum value: 2.5043904781341553


In [10]:
# Stack the train and test rows, count how many rows each prd_number has, and
# keep the ten prd_numbers with the highest counts.
train_and_test_df = pd.concat([train_df, test_df])
episode_counts = (
    train_and_test_df
    .groupby("prd_number")
    .size()
    .reset_index(name="count")
)
top_10_episodes_tt = (
    episode_counts
    .nlargest(n=10, columns="count")
    .loc[:, "prd_number"]
    .tolist()
)
top_10_episodes_tt

['11802450402',
 '16122492630',
 '16122492637',
 '16122493781',
 '16122492636',
 '16122493791',
 '16122492631',
 '16122492639',
 '16122492633',
 '16122492632']

In [20]:
# Scratch check of container truthiness.
# `{}` is the empty *dict* literal; an empty set has to be built with set().
my_set = set()
my_list = ["hund", "høne"]

# An empty container is falsy, so this evaluates to False.
bool(my_set)

False

In [11]:
# Count how many rows each prd_number has in the transformed data and keep the
# ten prd_numbers with the highest counts.
episode_counts = (
    transformed_df
    .groupby("prd_number")
    .size()
    .reset_index(name="count")
)
top_10_episodes_tr = (
    episode_counts
    .nlargest(n=10, columns="count")
    .loc[:, "prd_number"]
    .tolist()
)
top_10_episodes_tr

['11802450402',
 '16122492630',
 '16122493781',
 '16122493791',
 '16122492631',
 '16122492636',
 '16122492637',
 '16122492632',
 '16122492633',
 '16122492639']