In [None]:
import os
import json
import numpy as np
import nltk
from tqdm import tqdm

from collections import defaultdict

from util.constants import Topic
from util.helpers import extend_dicts, sort_dict

Read data

In [None]:
# Load the video-info file of every topic and merge them into a single
# channel -> list-of-videos mapping.
# NOTE(review): dict.update overwrites existing keys, so a channel present in
# more than one topic file keeps only the entry from the file read last.
channel_videos_dict = {}
for cat in Topic._member_names_:
    info_path = os.path.join("..", "data", "info_videos", F"videos-info_{cat}.json")
    with open(info_path, "r") as f:
        videos_by_channel = json.load(f)
    channel_videos_dict.update(videos_by_channel)

In [None]:
# Root directory of the title-token outputs.
RESULTS_DIR = os.path.join("..", "data", "title-tokens")

# Directory that receives one token-statistics JSON file per channel.
channel_results_dir = os.path.join(RESULTS_DIR, "channels")


def get_done_list(dir):
    """Return the entry names found in ``dir`` with a trailing ``.json`` removed.

    Only the extension at the very end of a name is stripped. A ``.json``
    occurring elsewhere in the name is part of the identifier and is kept, so
    ``"a.json.json"`` yields ``"a.json"`` rather than ``"a"``. Entries that do
    not end in ``.json`` are returned unchanged.

    Args:
        dir: Path of the directory to list. The name shadows the builtin
            ``dir`` but is kept so existing keyword callers keep working.

    Returns:
        list[str]: One name per directory entry, in ``os.listdir`` order
        (which is arbitrary).

    Raises:
        OSError: Propagated from ``os.listdir`` (e.g. ``FileNotFoundError``
            when ``dir`` does not exist).
    """
    suffix = ".json"
    return [
        nm[:-len(suffix)] if nm.endswith(suffix) else nm
        for nm in os.listdir(dir)
    ]

Make channel inverted index

In [None]:
# Build one inverted index per channel: title token -> ids of the videos whose
# title contains that token. Each index is saved as <channel>.json.
for channel,videos in tqdm(channel_videos_dict.items()):
    inv_idx = defaultdict(list)
    for vid in videos:
        tokens = nltk.word_tokenize(vid["title"])
        # Deduplicate so a token repeated inside one title lists the video once.
        # Iterating the bare set would follow string-hash order, which is
        # randomized per interpreter start; sorting makes the key order of the
        # saved index (and any order-dependent tie-breaking done on it later)
        # reproducible across kernel restarts.
        for t in sorted(set(tokens)):
            inv_idx[t].append(vid["id"])
    with open(os.path.join(RESULTS_DIR, "channels_inv_index", f"{channel}.json"), "w") as f:
        json.dump(inv_idx, f)

Channel results

In [None]:
# Derive per-channel token statistics from the saved channel indices
for channel,videos in tqdm(channel_videos_dict.items()):
    # Lookup table: video id -> full video record
    video_info_dict = dict((vid["id"], vid) for vid in videos)

    index_path = os.path.join(RESULTS_DIR, "channels_inv_index", f"{channel}.json")
    with open(index_path, "r") as f:
        inv_idx = json.load(f)

    # Summed "views" of the videos listed under each token
    token_views = {
        token: sum(video_info_dict[vid_id]["views"] for vid_id in vid_ids)
        for token, vid_ids in inv_idx.items()
    }
    # Number of videos listed under each token
    token_counts = {token: len(vid_ids) for token, vid_ids in inv_idx.items()}

    channel_result = {
        "token_views": sort_dict(token_views),
        "token_counts": sort_dict(token_counts),
    }

    filepath = os.path.join(channel_results_dir, f"{channel}.json")
    with open(filepath, "w") as f:
        json.dump(channel_result, f)

Category results

In [None]:
# Load channel2category.json
# (presumably maps each channel to its category, going by the file name - confirm)
channel2cat_path = os.path.join("..", "data", "channel2category.json")
with open(channel2cat_path, "r") as f:
    channel2cat = json.load(f)

In [None]:
# For every category, collect the channels found in its video-info file
category_channels_dict = {}
for cat in tqdm(Topic._member_names_):
    info_path = os.path.join("..", "data", "info_videos", F"videos-info_{cat}.json")
    with open(info_path, "r") as f:
        videos_by_channel = json.load(f)
    # The top-level keys of the file are the channels
    category_channels_dict[cat] = [*videos_by_channel.keys()]

In [None]:
# Build one inverted index per category out of the indices of its channels
channels_index_dir = os.path.join(RESULTS_DIR, "channels_inv_index")
categories_index_dir = os.path.join(RESULTS_DIR, "categories_inv_index")

for cat,channels in category_channels_dict.items():
    print(cat)

    # Load the saved index of every channel in this category
    inv_indices = []
    for channel in tqdm(channels):
        with open(os.path.join(channels_index_dir, f"{channel}.json"), "r") as f:
            channel_index = json.load(f)
        inv_indices.append(channel_index)

    # Combine them with util.helpers.extend_dicts
    # (presumably concatenates the id lists of tokens shared by several
    # channels - confirm against the helper)
    inv_idx = extend_dicts(inv_indices)

    with open(os.path.join(categories_index_dir, f"{cat}.json"), "w") as f:
        json.dump(inv_idx, f)

In [None]:
# Derive per-category token statistics from the saved category indices
categories_index_dir = os.path.join(RESULTS_DIR, "categories_inv_index")
categories_results_dir = os.path.join(RESULTS_DIR, "categories")

for cat,channels in tqdm(category_channels_dict.items()):
    # Lookup table over every video of every channel in the category: id -> record
    video_info_dict = {
        vid["id"]: vid
        for channel in channels
        for vid in channel_videos_dict[channel]
    }

    with open(os.path.join(categories_index_dir, f"{cat}.json"), "r") as f:
        inv_idx = json.load(f)

    # Summed "views" of the videos listed under each token
    token_views = {
        token: sum(video_info_dict[vid_id]["views"] for vid_id in vid_ids)
        for token, vid_ids in inv_idx.items()
    }
    # Number of videos listed under each token
    token_counts = {token: len(vid_ids) for token, vid_ids in inv_idx.items()}

    category_result = {
        "token_views": sort_dict(token_views),
        "token_counts": sort_dict(token_counts),
    }

    filepath = os.path.join(categories_results_dir, f"{cat}.json")
    with open(filepath, "w") as f:
        json.dump(category_result, f)

TF*IDF for channels

In [None]:
# Per-category document frequency: for each token, the number of channels of
# the category whose saved index contains it
channels_index_dir = os.path.join(RESULTS_DIR, "channels_inv_index")
doc_freq_dir = os.path.join(RESULTS_DIR, "categories_doc_freq")

for cat,channels in category_channels_dict.items():
    print(cat)

    df = defaultdict(int)
    for channel in tqdm(channels):
        with open(os.path.join(channels_index_dir, f"{channel}.json"), "r") as f:
            inv_idx = json.load(f)
        # Every key of a channel index counts once for that channel
        for token in inv_idx.keys():
            df[token] = df[token] + 1

    df = sort_dict(df)

    with open(os.path.join(doc_freq_dir, f"{cat}.json"), "w") as f:
        json.dump(df, f)

In [None]:
# tf*idf per channel, with idf taken over the channels of the same category
doc_freq_dir = os.path.join(RESULTS_DIR, "categories_doc_freq")
channel_counts_dir = os.path.join(RESULTS_DIR, "channels")
channel_tf_idf_dir = os.path.join(RESULTS_DIR, "channels_tf_idf")

for cat,channels in category_channels_dict.items():
    print(cat)

    with open(os.path.join(doc_freq_dir, f"{cat}.json"), "r") as f:
        df = json.load(f)
    # idf(token) = log(number of channels in the category / document frequency)
    n_channels = len(channels)
    idf = {token: np.log(n_channels/doc_count) for token, doc_count in df.items()}

    for channel in tqdm(channels):
        with open(os.path.join(channel_counts_dir, f"{channel}.json"), "r") as f:
            channel_stats = json.load(f)
        # Term frequency = the channel's "token_counts" entry
        tf = channel_stats["token_counts"]

        # Align counts and idf values on the same token order
        terms = list(tf)
        freqs = np.array(list(tf.values()))
        channel_idf = np.array([idf[term] for term in terms])

        tf_idf = freqs*channel_idf
        tf_idf = sort_dict(dict(zip(terms, tf_idf)))

        with open(os.path.join(channel_tf_idf_dir, f"{channel}.json"), "w") as f:
            json.dump(tf_idf, f)

TF*IDF for categories

In [None]:
# Global document frequency: for each token, the number of channel indices
# (over all categories) that contain it
channels_index_dir = os.path.join(RESULTS_DIR, "channels_inv_index")

df = defaultdict(int)
for cat,channels in category_channels_dict.items():
    print(cat)

    for channel in tqdm(channels):
        with open(os.path.join(channels_index_dir, f"{channel}.json"), "r") as f:
            inv_idx = json.load(f)
        # Every key of a channel index counts once for that channel
        for token in inv_idx.keys():
            df[token] = df[token] + 1

df = sort_dict(df)

doc_freq_all_path = os.path.join(RESULTS_DIR, f"doc_freq_all.json")
with open(doc_freq_all_path, "w") as f:
    json.dump(df, f)

In [None]:
# tf*idf per category, with idf taken from the global document frequencies
doc_freq_all_path = os.path.join(RESULTS_DIR, f"doc_freq_all.json")
with open(doc_freq_all_path, "r") as f:
    df = json.load(f)
# idf(token) = log(number of entries in channel2cat / document frequency)
# NOTE(review): the document count comes from channel2cat, while the
# frequencies in doc_freq_all.json were counted over the channel index files;
# this assumes both cover the same set of channels - confirm.
n_documents = len(channel2cat)
idf = {token: np.log(n_documents/doc_count) for token, doc_count in df.items()}

category_counts_dir = os.path.join(RESULTS_DIR, "categories")
category_tf_idf_dir = os.path.join(RESULTS_DIR, "categories_tf_idf")

for cat,channels in category_channels_dict.items():
    print(cat)

    with open(os.path.join(category_counts_dir, f"{cat}.json"), "r") as f:
        category_stats = json.load(f)
    # Term frequency = the category's "token_counts" entry
    tf = category_stats["token_counts"]

    # Align counts and idf values on the same token order
    terms = list(tf)
    freqs = np.array(list(tf.values()))
    cat_idf = np.array([idf[term] for term in terms])

    tf_idf = freqs*cat_idf
    tf_idf = sort_dict(dict(zip(terms, tf_idf)))

    with open(os.path.join(category_tf_idf_dir, f"{cat}.json"), "w") as f:
        json.dump(tf_idf, f)