In [None]:
import os
import json
from json import JSONDecodeError
import numpy as np
import nltk
from tqdm import tqdm

from collections import defaultdict

from util.constants import Topic

In [None]:
# Models
from common_words import get_views_counts, get_avg_term_views, get_avg_symbol_views

def sum_dicts(dics):
    """Add up the values of several dictionaries key by key.

    Args:
        dics: Iterable of mappings whose values can be added to an int
            starting value (e.g. counts or view totals). A one-shot iterable
            such as a generator is fine because it is traversed only once.

    Returns:
        A ``defaultdict(int)`` mapping every key found in any input to the
        sum of that key's values across all inputs. The inputs are not
        modified.
    """
    totals = defaultdict(int)
    # Single pass over every (key, value) pair instead of probing each
    # dictionary once per distinct key.
    for dic in dics:
        for key, value in dic.items():
            totals[key] += value
    return totals

def sort_dict(dic):
    """Return a new dict with the entries of ``dic`` ordered by ascending value.

    Entries with equal values keep their original relative order because the
    underlying sort is stable.
    """
    ordered_pairs = sorted(dic.items(), key=lambda pair: pair[1])
    return dict(ordered_pairs)

Channel stats

In [None]:
# Read data: merge the per-topic video-info files into one mapping.
channel_videos_dict = {}
info_videos_dir = os.path.join("..", "data", "info_videos")
for cat in Topic._member_names_:
    info_path = os.path.join(info_videos_dir, F"videos-info_{cat}.json")
    with open(info_path, "r") as f:
        topic_videos = json.load(f)
    # Later files overwrite earlier entries that share the same key.
    channel_videos_dict.update(topic_videos)

In [None]:
# Root folder for the title-token outputs; the per-channel and per-category
# JSON files are stored in subfolders of it.
RESULTS_DIR = os.path.join("..", "data", "title-tokens")

# Folder holding one JSON results file per channel.
channel_results_dir = os.path.join(RESULTS_DIR, "channels")
def get_done_list(dir):
    """List the names of the results already saved as JSON files in ``dir``.

    Args:
        dir: Path of the directory to scan.

    Returns:
        A list with the file name minus its ``.json`` extension for every
        entry in ``dir`` that ends in ``.json``. Other entries (hidden files,
        checkpoint folders, backups, ...) are ignored, so every returned name
        can be turned back into an existing path with ``f"{name}.json"``.
    """
    done = []
    for entry in os.listdir(dir):
        # Strip only a trailing ".json"; a ".json" elsewhere in the name is
        # part of the name and must be preserved.
        stem, ext = os.path.splitext(entry)
        if ext == ".json":
            done.append(stem)
    return done

In [None]:
# Calculate channel results: per channel, the views/counts dictionaries
# returned by get_views_counts, each sorted by ascending value.
channel_results = {}
for channel, videos in tqdm(channel_videos_dict.items()):
    token_views, token_counts = get_views_counts(videos)
    term_views, term_counts = get_views_counts(videos, "term")

    labelled_stats = (
        ("token_views", token_views),
        ("token_counts", token_counts),
        ("term_views", term_views),
        ("term_counts", term_counts),
    )
    channel_results[channel] = {
        label: sort_dict(stats) for label, stats in labelled_stats
    }

In [None]:
# Save channel stats
# Create the output folder first so that a run on a fresh checkout does not
# fail with FileNotFoundError on the first write.
os.makedirs(channel_results_dir, exist_ok=True)
for channel,results in channel_results.items():
    # One JSON file per channel, named after the channel key.
    filepath = os.path.join(channel_results_dir, f"{channel}.json")
    with open(filepath, "w") as f:
        json.dump(results, f)

Category stats

In [None]:
# Read data: the mapping used below to look up each channel's category.
channel2cat_path = os.path.join("..", "data", "channel2category.json")
with open(channel2cat_path, "r") as f:
    channel2cat = json.load(f)

In [None]:
# Make list of results per channel for each category
category_results_list = defaultdict(list)
for channel in tqdm(get_done_list(channel_results_dir)):
    cat = channel2cat[channel]
    filepath = os.path.join(channel_results_dir, f"{channel}.json")
    try:
        with open(filepath, "r") as f:
            results = json.load(f)
    except JSONDecodeError:
        # The saved file is not valid JSON: remove it and leave the channel out.
        print(f"couldn't open {channel}; deleting file")
        os.remove(filepath)
        # Skip the append below: `results` is either undefined (first
        # iteration) or still holds the previous channel's data, which would
        # be counted again under this channel's category.
        continue
    category_results_list[cat].append(results)

In [None]:
# Calculate category results: for each category, sum every statistic over
# the category's channels and sort the totals by ascending value.
category_results = {}
for cat, results_list in category_results_list.items():
    category_results[cat] = {
        stat_name: sort_dict(sum_dicts([res[stat_name] for res in results_list]))
        for stat_name in ["token_views", "token_counts", "term_views", "term_counts"]
    }

In [None]:
# Save category results
# Create the output folder first so that a run on a fresh checkout does not
# fail with FileNotFoundError on the first write.
category_results_dir = os.path.join(RESULTS_DIR, "categories")
os.makedirs(category_results_dir, exist_ok=True)
for cat,stats in category_results.items():
    # One JSON file per category, named after the category key.
    filepath = os.path.join(category_results_dir, f"{cat}.json")
    with open(filepath, "w") as f:
        json.dump(stats, f)