In [None]:
import os
import json
from json import JSONDecodeError
import numpy as np
import torch
import gc
from tqdm import tqdm

from collections import defaultdict

from util.constants import Topic, ThumbnailURL, thumbnail_URL
import requests

from transformers import YolosFeatureExtractor, YolosForObjectDetection
from PIL import Image
from matplotlib import pyplot as plt
import matplotlib.patches as patches
import torch.nn.functional as F

In [None]:
# Helper functions
def sum_dicts(dics):
    """Combine several dicts by adding up the values found under each key.

    Returns a ``defaultdict(int)`` holding, for every key seen in ``dics``,
    the sum of all values stored under that key.
    """
    totals = defaultdict(int)
    for mapping in dics:
        for key in mapping:
            totals[key] += mapping[key]
    return totals

def extend_dicts(dics):
    """Combine several dicts of lists by concatenating the lists per key.

    Returns a ``defaultdict(list)``; the lists of the input dicts are read
    but never modified.
    """
    merged = defaultdict(list)
    for mapping in dics:
        for key in mapping:
            merged[key].extend(mapping[key])
    return merged

def sort_dict(dic):
    """Return a new dict with the items of ``dic`` ordered by value, largest first."""
    ranked_items = sorted(dic.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked_items)

In [None]:
# Models

# Class names are read one per line; index 0 is reserved for "unknown".
with open(os.path.join("..", "data", "coco_classes.txt"), "r") as classes_file:
    coco_classes = ["unknown"] + [line.rstrip("\n") for line in classes_file]

# Use the GPU when CUDA is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"using device: {device}")

def load_model():
    """Load the pretrained ``hustvl/yolos-tiny`` detector and move it to ``device``."""
    detector = YolosForObjectDetection.from_pretrained("hustvl/yolos-tiny")
    return detector.to(device)


# Feature extractor loaded from the same checkpoint as the detector.
feature_extractor = YolosFeatureExtractor.from_pretrained("hustvl/yolos-tiny")
YOLOS_model = load_model()

# Report the model footprint: bytes held by its parameters plus its buffers.
mem_params = sum(p.nelement() * p.element_size() for p in YOLOS_model.parameters())
mem_bufs = sum(b.nelement() * b.element_size() for b in YOLOS_model.buffers())
print(f"Memory used by model: {mem_params + mem_bufs} bytes")


def model(images, conf_threshold=0.75):
    """Detect objects in a batch of images with the YOLOS detector.

    Args:
        images: Sequence of images accepted by ``feature_extractor``
            (the notebook passes PIL images).
        conf_threshold: Softmax probability (0-1) that a query's top class
            must exceed for the detection to be kept. Defaults to 0.75.

    Returns:
        A list with one entry per input image. Each entry is a list of dicts
        with the keys ``"pred_class"`` (name looked up in ``coco_classes``),
        ``"conf"`` (confidence in percent, rounded to one decimal) and
        ``"bbox"`` (the model's ``pred_boxes`` values rounded to three
        decimals). Detections are ordered by query index. An empty batch
        yields ``[]``.
    """
    # Nothing to detect; also avoids handing the feature extractor an empty batch.
    if len(images) == 0:
        return []

    inputs = feature_extractor(images=images, return_tensors="pt").to(device)

    # Inference only: without autograd no graph/activations are kept alive.
    with torch.no_grad():
        outputs = YOLOS_model(**inputs)

    # The model predicts bounding boxes and class logits for every query.
    logits = outputs.logits.detach().cpu()
    bboxes = outputs.pred_boxes.detach().cpu()

    probs = F.softmax(logits, dim=-1)      # [B, queries, classes]
    confidence, preds = probs.max(-1)      # both [B, queries]

    # Keep queries whose top class is not index 91 (excluded by the original
    # code as the "unknown"/no-object slot) and whose confidence is high enough.
    keep = (preds != 91) & (confidence > conf_threshold)

    confidence_pct = np.round((confidence * 100).tolist(), 1)

    results = []
    for i in range(len(images)):
        detections = []
        # nonzero() lists the kept query indices in ascending order, which
        # makes the output order deterministic.
        for query_idx in keep[i].nonzero()[:, 0].tolist():
            detections.append({
                "pred_class": coco_classes[int(preds[i, query_idx])],
                "conf": float(confidence_pct[i][query_idx]),
                "bbox": np.round(bboxes[i, query_idx].tolist(), 3).tolist(),
            })
        results.append(detections)

    return results

def resize(img, new_width=100):
    """Scale an image to ``new_width`` pixels wide, keeping its aspect ratio.

    Args:
        img: Image object exposing ``size`` as ``(width, height)`` and a
            ``resize`` method (a PIL image in this notebook).
        new_width: Target width in pixels.

    Returns:
        Whatever ``img.resize`` returns for the new ``(width, height)``.
    """
    orig_width, orig_height = img.size
    scale = new_width / float(orig_width)
    # Clamp to 1 so extremely wide images never get a zero-pixel height.
    new_height = max(1, int(float(orig_height) * scale))

    # ``Image.ANTIALIAS`` was removed in Pillow 10; ``LANCZOS`` is the same
    # filter under its current name. Fall back only for very old releases.
    resample = getattr(Image, "LANCZOS", None)
    if resample is None:
        resample = Image.ANTIALIAS
    return img.resize((new_width, new_height), resample)

Get objects for all videos

In [None]:
# Read one metadata file per ``Topic`` member and flatten all channels'
# videos into a single list.
videos = []
for cat in Topic._member_names_:
    info_path = os.path.join("..", "data", "info_videos", f"videos-info_{cat}.json")
    with open(info_path, "r") as f:
        videos_info = json.load(f)
    for channel_vids in videos_info.values():
        videos.extend(channel_vids)

In [None]:
# Output locations for the thumbnail object-detection results.
RESULTS_DIR = os.path.join("..", "data", "thumbnail-objects")

# NOTE(review): per-video results live under ../../DATA rather than under
# RESULTS_DIR -- confirm this is intentional.
video_results_dir = os.path.join("..", "..", "DATA", "thumbnail-objects", "videos")
channel_results_dir = os.path.join(RESULTS_DIR, "channels")
def get_done_list(dir):
    """List the names of the ``*.json`` files in ``dir`` without the extension.

    Only a trailing ``.json`` is stripped, and entries that are not JSON
    files (e.g. ``.DS_Store`` or checkpoint folders) are ignored so they are
    not mistaken for finished items.

    Args:
        dir: Directory to scan.

    Returns:
        List of file names with the ``.json`` suffix removed.

    Raises:
        FileNotFoundError: If ``dir`` does not exist (from ``os.listdir``).
    """
    suffix = ".json"
    return [name[:-len(suffix)] for name in os.listdir(dir) if name.endswith(suffix)]

In [None]:
# Drop the notebook's reference to the detector, then display the CUDA memory
# PyTorch reports as allocated, divided by 1e6 (i.e. in MB if the value is bytes).
# Raises NameError when ``YOLOS_model`` has already been deleted.
del YOLOS_model
torch.cuda.memory_allocated() / 1e6

In [None]:
# Toggle cell: unload the detector if it is loaded, otherwise load it again,
# then display the CUDA memory PyTorch reports as allocated (divided by 1e6).
try:
    del YOLOS_model
except NameError:
    # ``del`` on an unbound name raises NameError, i.e. the model is not loaded.
    YOLOS_model = load_model()
torch.cuda.memory_allocated() / 1e6

In [None]:
# Run Python's garbage collector, ask PyTorch to release its cached CUDA
# memory, then display the allocated CUDA memory again (divided by 1e6).
gc.collect()
torch.cuda.empty_cache()
torch.cuda.memory_allocated() / 1e6

In [None]:
# Run the detector over all videos in batches and write one JSON result file
# per video. Videos that already have a result file are skipped.
batch_size = 4
# Ceiling division so the final, partially filled batch is processed too
# (the previous int(len/batch) comparison never rounded up).
batch_num = (len(videos) + batch_size - 1) // batch_size

quality = ThumbnailURL.high
request_timeout = 30  # seconds; keeps one stalled download from hanging the run

os.makedirs(video_results_dir, exist_ok=True)
done_list = get_done_list(video_results_dir)
# Set for O(1) membership tests; kept up to date as results are written.
done_ids = set(done_list)

for batch in tqdm(range(batch_num)):

    vid_batch = videos[batch*batch_size:(batch+1)*batch_size]

    ids = [vid["id"] for vid in vid_batch if vid["id"] not in done_ids]

    if not ids:
        continue

    images = []
    fetched_ids = []
    for vid_id in ids:
        url = thumbnail_URL(vid_id, quality)
        try:
            with requests.get(url, stream=True, timeout=request_timeout) as response:
                # Turn 4xx/5xx answers into an exception instead of trying to
                # decode an error page as an image.
                response.raise_for_status()
                response.raw.decode_content = True
                # Decode inside the ``try`` so one broken thumbnail is skipped
                # rather than aborting the whole run; ``convert`` forces the
                # pixel data to be read while the response is still open.
                image = Image.open(response.raw).convert("RGB")
        except (requests.RequestException, OSError) as err:
            # OSError covers PIL's unreadable/truncated image errors.
            tqdm.write(f"skipping {vid_id}: {err}")
            continue
        images.append(image)
        fetched_ids.append(vid_id)

    # Every download in this batch failed; nothing to feed the model.
    if not images:
        continue

    results = model(images)

    for vid_id, result in zip(fetched_ids, results):
        path = os.path.join(video_results_dir, f"{vid_id}.json")
        with open(path, "w") as f:
            json.dump(result, f)
        done_ids.add(vid_id)

Make channel inverted index

In [None]:
# Merge the per-topic metadata files into one channel -> videos mapping.
channel_videos_dict = {}
for cat in Topic._member_names_:
    topic_path = os.path.join("..", "data", "info_videos", f"videos-info_{cat}.json")
    with open(topic_path, "r") as f:
        topic_channels = json.load(f)
    channel_videos_dict.update(topic_channels)

In [None]:
# Make, for every channel, an inverted index: detected object class -> ids of
# the channel's videos whose result file contains that class.
channel_index_dir = os.path.join(RESULTS_DIR, "channels_inv_index")
os.makedirs(channel_index_dir, exist_ok=True)

# The loop variable is deliberately not named ``videos``: that name holds the
# global list of all videos and must survive a re-run of earlier cells.
for channel, channel_videos in tqdm(channel_videos_dict.items()):
    inv_idx = defaultdict(list)
    for vid in channel_videos:
        try:
            with open(os.path.join(video_results_dir, vid["id"]+".json"), "r") as f:
                objects = [o["pred_class"] for o in json.load(f)]
        except FileNotFoundError:
            # No result file for this video; leave it out of the index.
            continue
        # Each class counts once per video; sorting keeps the key order of
        # the written file reproducible between runs.
        for o in sorted(set(objects)):
            inv_idx[o].append(vid["id"])

    with open(os.path.join(channel_index_dir, f"{channel}.json"), "w") as f:
        json.dump(inv_idx, f)

Channel results

In [None]:
# Calculate channel results: for every object class in a channel's inverted
# index, the summed "views" of the listed videos and the number of videos.
os.makedirs(channel_results_dir, exist_ok=True)

# The loop variable is deliberately not named ``videos`` so the global list
# of all videos is not overwritten.
for channel, channel_videos in tqdm(channel_videos_dict.items()):
    video_info_dict = {vid["id"]: vid for vid in channel_videos}

    with open(os.path.join(RESULTS_DIR, "channels_inv_index", f"{channel}.json"), "r") as f:
        inv_idx = json.load(f)

    channel_result = {
        "object_views": sort_dict({
            obj: sum(video_info_dict[vid_id]["views"] for vid_id in vid_ids)
            for obj, vid_ids in inv_idx.items()
        }),
        "object_counts": sort_dict({obj: len(vid_ids) for obj, vid_ids in inv_idx.items()}),
    }

    filepath = os.path.join(channel_results_dir, f"{channel}.json")
    with open(filepath, "w") as f:
        json.dump(channel_result, f)

Category results

In [None]:
# Load the channel -> category lookup table.
channel2cat_path = os.path.join("..", "data", "channel2category.json")
with open(channel2cat_path, "r") as f:
    channel2cat = json.load(f)

In [None]:
# For each category, collect the channel keys of its metadata file.
category_channels_dict = {}
for cat in tqdm(Topic._member_names_):
    cat_path = os.path.join("..", "data", "info_videos", f"videos-info_{cat}.json")
    with open(cat_path, "r") as f:
        category_channels_dict[cat] = list(json.load(f))

In [None]:
# Build each category's inverted index by concatenating, per object class,
# the video-id lists of all its channels' inverted indices.
for cat, channels in category_channels_dict.items():
    print(cat)

    inv_idx = defaultdict(list)
    for channel in tqdm(channels):
        channel_index_path = os.path.join(RESULTS_DIR, "channels_inv_index", f"{channel}.json")
        with open(channel_index_path, "r") as f:
            channel_idx = json.load(f)
        for obj, vid_ids in channel_idx.items():
            inv_idx[obj].extend(vid_ids)

    category_index_path = os.path.join(RESULTS_DIR, "categories_inv_index", f"{cat}.json")
    with open(category_index_path, "w") as f:
        json.dump(inv_idx, f)

In [None]:
# Per category and object class: summed "views" of the indexed videos and
# the number of indexed videos, both sorted from largest to smallest.
for cat, channels in tqdm(category_channels_dict.items()):
    # Look-up table from video id to its metadata, over all channels of the category.
    video_info_dict = {}
    for channel in channels:
        for vid in channel_videos_dict[channel]:
            video_info_dict[vid["id"]] = vid

    index_path = os.path.join(RESULTS_DIR, "categories_inv_index", f"{cat}.json")
    with open(index_path, "r") as f:
        inv_idx = json.load(f)

    object_views = {}
    object_counts = {}
    for obj, vid_ids in inv_idx.items():
        object_views[obj] = sum([video_info_dict[vid_id]["views"] for vid_id in vid_ids])
        object_counts[obj] = len(vid_ids)

    category_result = {
        "object_views": sort_dict(object_views),
        "object_counts": sort_dict(object_counts),
    }

    filepath = os.path.join(RESULTS_DIR, "categories", f"{cat}.json")
    with open(filepath, "w") as f:
        json.dump(category_result, f)

TF*IDF for channels

In [None]:
# Document frequency per category: for each object class, the number of the
# category's channels whose inverted index contains it.
for cat, channels in category_channels_dict.items():
    print(cat)

    df = defaultdict(int)
    for channel in tqdm(channels):
        index_path = os.path.join(RESULTS_DIR, "channels_inv_index", f"{channel}.json")
        with open(index_path, "r") as f:
            inv_idx = json.load(f)
        for token in inv_idx.keys():
            df[token] = df[token] + 1

    df = sort_dict(df)

    doc_freq_path = os.path.join(RESULTS_DIR, "categories_doc_freq", f"{cat}.json")
    with open(doc_freq_path, "w") as f:
        json.dump(df, f)

In [None]:
# tf*idf per channel: term frequency is the channel's "object_counts", idf is
# log(number of channels in the category / document frequency of the class).
for cat, channels in category_channels_dict.items():
    print(cat)

    doc_freq_path = os.path.join(RESULTS_DIR, "categories_doc_freq", f"{cat}.json")
    with open(doc_freq_path, "r") as f:
        df = json.load(f)

    n_channels = len(channels)
    idf = {}
    for token, doc_count in df.items():
        idf[token] = np.log(n_channels / doc_count)

    for channel in tqdm(channels):
        counts_path = os.path.join(RESULTS_DIR, "channels", f"{channel}.json")
        with open(counts_path, "r") as f:
            tf = json.load(f)["object_counts"]

        terms = list(tf.keys())
        freqs = np.array([tf[term] for term in terms])
        channel_idf = np.array([idf[term] for term in terms])

        # Element-wise product, then back to a term -> score dict sorted by score.
        tf_idf = sort_dict(dict(zip(terms, freqs * channel_idf)))

        out_path = os.path.join(RESULTS_DIR, "channels_tf_idf", f"{channel}.json")
        with open(out_path, "w") as f:
            json.dump(tf_idf, f)

TF*IDF for categories

In [None]:
# Document frequency over all categories: for each object class, how many
# channel inverted indices contain it.
df = defaultdict(int)
for cat, channels in category_channels_dict.items():
    print(cat)

    for channel in tqdm(channels):
        index_path = os.path.join(RESULTS_DIR, "channels_inv_index", f"{channel}.json")
        with open(index_path, "r") as f:
            inv_idx = json.load(f)
        for token in inv_idx.keys():
            df[token] = df[token] + 1

df = sort_dict(df)

doc_freq_all_path = os.path.join(RESULTS_DIR, "doc_freq_all.json")
with open(doc_freq_all_path, "w") as f:
    json.dump(df, f)

In [None]:
# tf*idf per category: term frequency is the category's "object_counts", idf
# is log(number of entries in channel2cat / global document frequency).
doc_freq_all_path = os.path.join(RESULTS_DIR, "doc_freq_all.json")
with open(doc_freq_all_path, "r") as f:
    df = json.load(f)

n_all_channels = len(channel2cat)
idf = {}
for token, doc_count in df.items():
    idf[token] = np.log(n_all_channels / doc_count)

for cat, channels in category_channels_dict.items():
    print(cat)

    counts_path = os.path.join(RESULTS_DIR, "categories", f"{cat}.json")
    with open(counts_path, "r") as f:
        tf = json.load(f)["object_counts"]

    terms = list(tf.keys())
    freqs = np.array([tf[term] for term in terms])
    cat_idf = np.array([idf[term] for term in terms])

    # Element-wise product, then back to a term -> score dict sorted by score.
    tf_idf = sort_dict(dict(zip(terms, freqs * cat_idf)))

    out_path = os.path.join(RESULTS_DIR, "categories_tf_idf", f"{cat}.json")
    with open(out_path, "w") as f:
        json.dump(tf_idf, f)

Miscellaneous

In [None]:
# Get a broad idea of what is finished: per category, the number of channels
# that already have at least ``min_done_videos`` result files.
min_done_videos = 30

# Map every known video id to its channel. ``vid2channel`` is not defined in
# any visible cell of this notebook, so it is built here from the metadata to
# keep the cell runnable on a fresh kernel.
vid2channel = {
    vid["id"]: channel_name
    for channel_name, channel_vids in channel_videos_dict.items()
    for vid in channel_vids
}

done_list = get_done_list(video_results_dir)
channel_counts = defaultdict(int)
for vid_id in done_list:
    owner = vid2channel.get(vid_id)
    if owner is None:
        # Result file for a video that is not in the loaded metadata; ignore it.
        continue
    channel_counts[owner] += 1

cat_counts = defaultdict(int)
for chan, count in channel_counts.items():
    if count >= min_done_videos:
        cat_counts[channel2cat[chan]] += 1
cat_counts