# Imports

In [1]:
import os

import pandas as pd
import torch

In [2]:
# Paths: the data folder is resolved as "<parent of the working directory>/data".
cwd = os.getcwd()
parent_dir = os.path.abspath(
    os.path.join(cwd, os.pardir)
)
data_dir = os.path.join(parent_dir, "data")

# Compute device: CUDA when torch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Data Loading

In [3]:
# Read the combined NER file from the data directory into a DataFrame.
ner_json_path = os.path.join(data_dir, "combined_ner.json")
df_ner = pd.read_json(ner_json_path)

# Analysis

In [4]:
# Collapse each row's entity list into a single {entity type: entity text} dict.
# NOTE: when a row has several entities of the same type, later ones overwrite
# earlier ones in the dict, so only the last text per type is kept.
ner_by_type = df_ner["ner"].apply(
    lambda entities: {entity["type"]: entity["text"] for entity in entities}
)

# One column per entity type; rows lacking a type get NaN in that column.
df_ner_normalized = pd.json_normalize(ner_by_type.tolist())
# The normalized frame is numbered 0..n-1, but concat(axis=1) aligns on index
# labels. Reuse df_ner's index so rows are paired one-to-one regardless of how
# df_ner is indexed.
df_ner_normalized.index = df_ner.index

# Drop entity columns left over from an earlier run of this cell so that
# re-running it does not append duplicate columns (which would turn
# df_ner[<type>] into a DataFrame and break later .str / value_counts calls).
df_ner = pd.concat(
    [
        df_ner.drop(columns=df_ner_normalized.columns, errors="ignore"),
        df_ner_normalized,
    ],
    axis=1,
)

In [5]:
# Number of missing values per column, listed from most to least missing.
missing_per_column = df_ner.isna().sum()
missing_per_column.sort_values(ascending=False)

LANGUAGE       2300171
PERCENT        2300089
LAW            2300027
MONEY          2298404
QUANTITY       2297263
NORP           2295368
FAC            2293754
EVENT          2292930
LOC            2290612
PRODUCT        2290450
GPE            2278748
TIME           2278573
ORG            2255985
ORDINAL        2240877
WORK_OF_ART    2191385
PERSON         2082416
DATE           2049449
CARDINAL       2024569
parent_id      1779152
genre           169059
author             225
ner                  0
emoji                0
channel              0
is_reply             0
video_id             0
comment              0
date                 0
comment_id           0
dtype: int64

In [6]:
# Entity type to inspect; keep only the rows whose column for that type is non-null.
category = "LOC"
df_ner_category = df_ner.loc[~df_ner[category].isna()]

In [7]:
# Frequency of each distinct entity text for the selected category.
df_ner_category.loc[:, category].value_counts()

LOC
West               1926
South               735
Earth               703
Anthrax             546
North               455
                   ... 
Multiverse            1
Viridian Forest       1
the Safari Zone       1
Celestial             1
Strawberry            1
Name: count, Length: 1981, dtype: int64

In [8]:
# Rows whose entity text for the selected category contains "moon"
# (case-insensitive); rows with no entity of that type count as non-matches.
mentions_moon = df_ner[category].str.contains("moon", case=False, na=False)
df_ner[mentions_moon]

Unnamed: 0,comment_id,author,date,comment,video_id,is_reply,parent_id,channel,genre,emoji,...,ORG,TIME,PERCENT,FAC,LOC,EVENT,QUANTITY,MONEY,LAW,LANGUAGE
12071,UgwJTIOVYQOuGtrIsnx4AaABAg,Pegita Mg,2022-08-30 07:32:07+00:00,Do these ever do a video inside I 39 m a vo...,oCi0RHLrauU,False,,kaleo,blues,[],...,,,,,moon,,39,,,
14567,UgwKs75O4aiAw1uzINF4AaABAg,Narek Hakobyan,2022-03-11 18:09:54+00:00,Can 39 t wait to see their live performance ...,bv5RuxhBEqk,False,,kaleo,blues,[],...,,,,,Moon,,,,,
14568,UgwKs75O4aiAw1uzINF4AaABAg.9ZRmdCuaVWH9vNi4K7N4lL,Khalid Saleh,2023-10-02 20:15:09+00:00,Can 39 t wait to see their live performance ...,bv5RuxhBEqk,True,UgwKs75O4aiAw1uzINF4AaABAg,kaleo,blues,[],...,,,,,Moon,,,,,
21553,Ugwz_hh3RBaflOMtdG94AaABAg,Vitor Pinheiro,2021-04-14 19:26:14+00:00,next up live performance on the Moon,6Vs6tExC-Go,False,,kaleo,blues,[],...,,,,,Moon,,,,,
26839,Ugyn6qoGLy9w27gC1qx4AaABAg,gunnar þ björnsson,2020-12-06 01:17:07+00:00,I with you are you from Moon you are male or...,ABx6Zdl1aIQ,False,,kaleo,blues,[],...,,,,,Moon,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1981089,UgwXF8Cgra0AVDQCkPJ4AaABAg,Cassandra Skanen,2022-07-12 19:57:08+00:00,The Algorithm me this 39 s then this of a ...,hOTFcnexJBg,False,,queen,rock,"['://www.youtube.com/watch?v=Oz4FU991k1s"">http...",...,,,,,Moon,,,,,
1997784,UgyoLpPg4biYFSTKTEN4AaABAg,Tore Aurstad,2022-03-31 13:29:51+00:00,Dis video was shot before May a in astrophysic...,yI8lrvKLzg0,False,,queen,rock,[],...,,11 s,,,Moon,,,,,
2015695,Ugwt5TDPjWmHser17D94AaABAg,youngalpaca,2023-02-26 00:24:08+00:00,I saw the title and said quot What in the bl...,0stc0duKiHs,False,,skrillex,,[],...,,,,,blue moon,,,,,
2139852,UgxlfH1IgVmZn4vUtEd4AaABAg,mariana_gl,2023-10-31 21:16:22+00:00,Love you to the Moon and to this vault are Ama...,vaRGgiGfsw4,False,,taylor_swift,pop,"['💙', '💙', '💙', '💙']",...,,,,,Moon,,,,,
