# Classify your YouTube history

## Prepare data

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns

from preparation import prepare_data
from select_by_date_interval import select_by_date_interval
from plotting import h_bar_plot, get_videos_per_h, month_bar_plot, get_videos_per_month, stacked_bar_with_labels

In [None]:
# Raise the figure resolution so the graphs render bigger.
plt.rcParams['figure.dpi'] = 150

# Apply a seaborn-like style; letters in pie charts are hard to read if the style isn't suitable.
# Newer Matplotlib releases ship this style as 'seaborn-v0_8' and no longer accept the
# bare 'seaborn' name, so use the new name when available and fall back otherwise.
style.use('seaborn-v0_8' if 'seaborn-v0_8' in style.available else 'seaborn')

In [None]:
# Machine-specific location of watch-history.json inside a Takeout folder; adjust to your own export.
PATH = "C:/Users/San/Documents/CS projects/yt_activity_analysis/data/Takeout/YouTube and YouTube Music/history/watch-history.json"
# prepare_data is a project helper; presumably it parses the JSON into a DataFrame — confirm in preparation.py.
df = prepare_data(PATH)

# Only a start bound is passed; presumably this keeps entries from start_date onward — confirm in select_by_date_interval.py.
start_date = "2023-01-01"
# end_date = "2022-11-01"
df = select_by_date_interval(df, start=start_date)
# df = df.loc[df["app"] == "YouTube Music"] # if you want to work only with YT Music data
df

Unnamed: 0,video_title,channel_name,time,app
1,–ñ–±—É—Ä–ª—è—é,–•–∞—Ä—Ü–∏–∑–∏,2023-04-04 14:59:28.805000+03:00,YouTube Music
2,–õ—ñ—Ö—Ç–∞—Ä,Rohata Zhaba,2023-04-04 14:25:11.177000+03:00,YouTube
3,Drinker's Chasers - ANOTHER Rey Skywalker Movie?!,Critical Drinker After Hours,2023-04-04 14:12:17.193000+03:00,YouTube
4,Test Your English Vocabulary: SHAPES & PATTERNS,Learn English with Gill ¬∑ engVid,2023-04-04 14:06:00.179000+03:00,YouTube
5,Finland joins NATO in historic shift prompted ...,FRANCE 24 English,2023-04-04 14:02:41.452000+03:00,YouTube
...,...,...,...,...
4391,Assassin's Creed Odyssey - Before You Buy,gameranx,2023-01-01 10:02:05.355000+02:00,YouTube
4392,#Ukraine's spy chief tells ABC News there will...,ABC News,2023-01-01 09:44:58.841000+02:00,YouTube
4393,Russian missile zooms over Kyiv before being s...,The Sun,2023-01-01 09:43:07.927000+02:00,YouTube
4394,YARMAK FT. TOF - –ú–û–Ø –ö–†–ê–á–ù–ê,Yarmak Music,2023-01-01 00:07:14.838000+02:00,YouTube


Let's think about possible categories:
- music - it's the easiest one because I can do it without any classification algorithm by treating every video in YouTube Music as a song. However, you'll need to handle songs you listened to in the YouTube app. Maybe just check whether the video_title or channel_name has already appeared in YouTube Music. If so, then it's a song as well
- entertainment - games, movies, books, and other not so productive stuff
- enlightenment/learning - English, math, CS, books, and other subjects
- politics - war, inner policies, and so on

However, what if some videos combine different categories? For example, a history video can be both entertaining and enlightening

In [None]:
# Fetch the NLTK 'toolbox' corpus into the local nltk_data directory.
nltk.download('toolbox')

[nltk_data] Downloading package toolbox to
[nltk_data]     C:\Users\San\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\toolbox.zip.


True

In [None]:
def categorize():
    """Placeholder for the video categorizer; does nothing and returns None."""

Let's start with deciding whether the video is a song or not

In [None]:
# Count how many history entries came from each app.
df["app"].value_counts()

YouTube          2507
YouTube Music    1888
Name: app, dtype: int64

In [None]:
def initial_classification(row):
    """Label a history row by the app it was watched in.

    Returns "Music" when the row's "app" value is "YouTube Music" and
    "Not Music" for anything else.
    """
    return "Music" if row["app"] == "YouTube Music" else "Not Music"

In [None]:
# Run the app-based rule on every row and store the result in a new "label" column.
df["label"] = df.apply(initial_classification, axis=1)
df.head(3)

Unnamed: 0,video_title,channel_name,time,app,label
1,–ñ–±—É—Ä–ª—è—é,–•–∞—Ä—Ü–∏–∑–∏,2023-04-04 14:59:28.805000+03:00,YouTube Music,Music
2,–õ—ñ—Ö—Ç–∞—Ä,Rohata Zhaba,2023-04-04 14:25:11.177000+03:00,YouTube,Not Music
3,Drinker's Chasers - ANOTHER Rey Skywalker Movie?!,Critical Drinker After Hours,2023-04-04 14:12:17.193000+03:00,YouTube,Not Music


In [None]:
# Rows watched in YouTube Music; used below as the reference set of known music channels and titles.
music_df = df.loc[df["app"] == "YouTube Music"]

In [None]:
"Antytila" in music_df["channel_name"].values

True

In [None]:
def classification(row):
    """Promote a "Not Music" row to "Music" when it matches known music.

    Rows with any other label are returned unchanged. A "Not Music" row
    becomes "Music" if its channel name, or failing that its video title,
    occurs in the module-level ``music_df`` frame.
    """
    label = row["label"]
    if label != "Not Music":
        return label

    # The channel is checked first; the title is only consulted when the channel is unknown.
    if (row["channel_name"] in music_df["channel_name"].values
            or row["video_title"] in music_df["video_title"].values):
        return "Music"
    return label

In [None]:
# Refine the labels: rows still marked "Not Music" are re-checked against music_df.
df["label"] = df.apply(classification, axis=1)
df

Unnamed: 0,video_title,channel_name,time,app,label
1,–ñ–±—É—Ä–ª—è—é,–•–∞—Ä—Ü–∏–∑–∏,2023-04-04 14:59:28.805000+03:00,YouTube Music,Music
2,–õ—ñ—Ö—Ç–∞—Ä,Rohata Zhaba,2023-04-04 14:25:11.177000+03:00,YouTube,Not Music
3,Drinker's Chasers - ANOTHER Rey Skywalker Movie?!,Critical Drinker After Hours,2023-04-04 14:12:17.193000+03:00,YouTube,Not Music
4,Test Your English Vocabulary: SHAPES & PATTERNS,Learn English with Gill ¬∑ engVid,2023-04-04 14:06:00.179000+03:00,YouTube,Not Music
5,Finland joins NATO in historic shift prompted ...,FRANCE 24 English,2023-04-04 14:02:41.452000+03:00,YouTube,Not Music
...,...,...,...,...,...
4391,Assassin's Creed Odyssey - Before You Buy,gameranx,2023-01-01 10:02:05.355000+02:00,YouTube,Not Music
4392,#Ukraine's spy chief tells ABC News there will...,ABC News,2023-01-01 09:44:58.841000+02:00,YouTube,Not Music
4393,Russian missile zooms over Kyiv before being s...,The Sun,2023-01-01 09:43:07.927000+02:00,YouTube,Not Music
4394,YARMAK FT. TOF - –ú–û–Ø –ö–†–ê–á–ù–ê,Yarmak Music,2023-01-01 00:07:14.838000+02:00,YouTube,Music


In [None]:
# Label distribution after the second rule.
df["label"].value_counts()

Not Music    2378
Music        2017
Name: label, dtype: int64

Well, with a couple of rules (i.e. a simple rule-based system), I got 129 more videos classified as songs. However, there are still game/movie soundtrack collections classified as 'Not Music'

Time to assign some labels

In [None]:
# after sampling, drop columns that are useless for classification,
# such as weekday and time
# subdf = df.sample(n=109, random_state=42).drop(["time", "weekday"], axis=1)
# subdf.head(5)

To assign labels, sample your df and save that sample to a .csv file. Then manually assign labels in the .csv file itself and import it for further processing

In [None]:
# commented out so I don't accidentally overwrite my labeled data with unlabeled data
# subdf.to_csv("labeled.csv", index=False)

In [None]:
from langdetect import detect
detect("–¢–∏—Ö–æ –ø—Ä–∏–π—à–æ–≤, —Ç–∏—Ö–æ –ø—ñ—à–æ–≤ –∞–±–æ –ø—ñ—Å–Ω—è —Å–ø–µ—Ü—ñ–∞–ª—å–Ω–æ–≥–æ...")

'uk'

In [None]:
from langdetect import detect_langs
detect_langs("–¢–∏—Ö–æ –ø—Ä–∏–π—à–æ–≤, —Ç–∏—Ö–æ –ø—ñ—à–æ–≤ –∞–±–æ –ø—ñ—Å–Ω—è —Å–ø–µ—Ü—ñ–∞–ª—å–Ω–æ–≥–æ...")

[uk:0.9999978276889379]

In [None]:
import nltk
nltk.download('punkt')      # tokenizer data needed by word_tokenize
nltk.download('stopwords')  # stop-word lists read by preprocess
nltk.download('words')      # word list read by is_english through nltk.corpus.words (was missing)
nltk.download('toolbox')    # corpus read by is_ukrainian through nltk.corpus.toolbox
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def preprocess(text):
    """Lowercase *text*, keep only alphabetic tokens and drop stop words.

    The stop-word list depends on the detected language: English when
    ``is_english`` accepts the text, Ukrainian when ``is_ukrainian`` does,
    and no stop words otherwise. The surviving tokens are returned joined
    by single spaces.
    """
    # NOTE(review): confirm the installed NLTK stopwords corpus ships a
    # 'ukrainian' list; the lookup below raises if it does not.
    if is_english(text):
        language = 'english'
    elif is_ukrainian(text):
        language = 'ukrainian'
    else:
        language = None
    stop_words = set(stopwords.words(language)) if language is not None else set()

    lowered = (token.lower() for token in word_tokenize(text) if token.isalpha())
    return ' '.join(token for token in lowered if token not in stop_words)

_ENGLISH_VOCAB = None  # filled on first use by _english_vocab()


def _english_vocab():
    """Return the lowercased NLTK English word list, building it only once."""
    global _ENGLISH_VOCAB
    if _ENGLISH_VOCAB is None:
        _ENGLISH_VOCAB = frozenset(w.lower() for w in nltk.corpus.words.words())
    return _ENGLISH_VOCAB


def is_english(text):
    """Guess whether *text* is English from its share of known words.

    The text is tokenized and lowercased; it counts as English when fewer
    than 20% of its distinct tokens are missing from the NLTK ``words``
    corpus. Every token is counted, including punctuation tokens.

    Args:
        text: String to inspect.

    Returns:
        True if the text looks English, False otherwise. Text that yields
        no tokens (e.g. an empty string) returns False.
    """
    text_vocab = {w.lower() for w in word_tokenize(text)}
    if not text_vocab:
        # Nothing to judge; also avoids dividing by zero below.
        return False
    uncommon = text_vocab - _english_vocab()
    return len(uncommon) / len(text_vocab) < 0.2

_UKRAINIAN_VOCAB = None  # filled on first use by _ukrainian_vocab()


def _ukrainian_vocab():
    """Return the lowercased words of 'ukrainian.txt', building the set only once."""
    global _UKRAINIAN_VOCAB
    if _UKRAINIAN_VOCAB is None:
        # NOTE(review): verify that 'ukrainian.txt' actually exists in the
        # downloaded toolbox corpus; this lookup raises if it does not.
        _UKRAINIAN_VOCAB = frozenset(
            w.lower() for w in nltk.corpus.toolbox.words('ukrainian.txt')
        )
    return _UKRAINIAN_VOCAB


def is_ukrainian(text):
    """Guess whether *text* is Ukrainian from its share of known words.

    The text is tokenized and lowercased; it counts as Ukrainian when fewer
    than 20% of its distinct tokens are missing from the reference word
    list. Every token is counted, including punctuation tokens.

    Args:
        text: String to inspect.

    Returns:
        True if the text looks Ukrainian, False otherwise. Text that yields
        no tokens (e.g. an empty string) returns False.
    """
    text_vocab = {w.lower() for w in word_tokenize(text)}
    if not text_vocab:
        # Nothing to judge; also avoids dividing by zero below.
        return False
    uncommon = text_vocab - _ukrainian_vocab()
    return len(uncommon) / len(text_vocab) < 0.2


Possible labels:
- music
- rec (short form of recreation)/entertainment - games, movies, fictional books, and other not so productive stuff
- studies (enlightenment/learning/study) - English, math, CS, books, and other subjects
- politics - war, inner politics, news, and so on
- sport - workouts and so on

However, what if some videos combine different categories? For example, a history video can be both entertaining and enlightening

In [None]:
# Load the manually labeled sample; header = 0 takes the column names from the first row of the file.
labeled_df = pd.read_csv('labeled.csv', header = 0)
labeled_df

Unnamed: 0,video_title,channel_name,app,category
0,Valhalla (Extended Mix),Miss Monique,YouTube Music,Music
1,Why You Should Read The Stormlight Archive - B...,Daniel Greene,YouTube,Rec
2,"–¢–∏—Ö–æ –ø—Ä–∏–π—à–æ–≤, —Ç–∏—Ö–æ –ø—ñ—à–æ–≤ –∞–±–æ –ø—ñ—Å–Ω—è —Å–ø–µ—Ü—ñ–∞–ª—å–Ω–æ–≥...",Riffmaster,YouTube Music,Music
3,"üéôÔ∏è [SingingMarch] ‚ôØ23 MARIAH CAREY ‚Äì ""Whenever...",Mioune,YouTube,Music
4,Ukraine frontline: the battle for Bakhmut - B...,BBC News,YouTube,Politics
...,...,...,...,...
104,–ë—Ä–∞—Ç—Ç—è —É–∫—Ä–∞—ó–Ω—Ü—ñ,Shablya,YouTube Music,Music
105,Tran,Miss Monique,YouTube Music,Music
106,How South Koreans got so much taller,Vox,YouTube,Studies
107,Deep Rock Galactic - Playstation Launch Traile...,PlayStation,YouTube,Rec


In [None]:
# Class balance of the hand-labeled sample.
labeled_df["category"].value_counts()

Music       52
Rec         21
Politics    20
Studies     14
Sport        2
Name: category, dtype: int64

In [None]:
# Encode each category name as an integer class id (its position in the tuple);
# category values not listed here become NaN under Series.map.
labeled_df["target_label"] = labeled_df["category"].map(
    {
        name: class_id
        for class_id, name in enumerate(
            ("Music", "Rec", "Politics", "Studies", "Sport")
        )
    }
)
labeled_df

Unnamed: 0,video_title,channel_name,app,category,target_label
0,Valhalla (Extended Mix),Miss Monique,YouTube Music,Music,0
1,Why You Should Read The Stormlight Archive - B...,Daniel Greene,YouTube,Rec,1
2,"–¢–∏—Ö–æ –ø—Ä–∏–π—à–æ–≤, —Ç–∏—Ö–æ –ø—ñ—à–æ–≤ –∞–±–æ –ø—ñ—Å–Ω—è —Å–ø–µ—Ü—ñ–∞–ª—å–Ω–æ–≥...",Riffmaster,YouTube Music,Music,0
3,"üéôÔ∏è [SingingMarch] ‚ôØ23 MARIAH CAREY ‚Äì ""Whenever...",Mioune,YouTube,Music,0
4,Ukraine frontline: the battle for Bakhmut - B...,BBC News,YouTube,Politics,2
...,...,...,...,...,...
104,–ë—Ä–∞—Ç—Ç—è —É–∫—Ä–∞—ó–Ω—Ü—ñ,Shablya,YouTube Music,Music,0
105,Tran,Miss Monique,YouTube Music,Music,0
106,How South Koreans got so much taller,Vox,YouTube,Studies,3
107,Deep Rock Galactic - Playstation Launch Traile...,PlayStation,YouTube,Rec,1


In [None]:
def preprocess(text):
    """Normalize *text* into lowercase ASCII tokens without English stop words.

    The text is split on newlines and tabs. In each non-empty segment every
    character other than ASCII letters, digits and whitespace is removed,
    then the segment is lowercased, stripped, tokenized and filtered against
    the NLTK English stop-word list. Text written entirely in a non-Latin
    script (e.g. a Cyrillic title) therefore comes back as an empty string.

    Args:
        text: Raw string to clean.

    Returns:
        The cleaned segments joined by single spaces.
    """
    wpt = nltk.WordPunctTokenizer()
    # A set gives constant-time membership tests in the filter below.
    stop_words = set(nltk.corpus.stopwords.words("english"))
    output = []
    for sentence in re.split(r"[\n\t]", text):
        if not sentence:
            continue  # skip empty segments produced by adjacent separators
        # The flags must be passed by keyword: the fourth positional argument
        # of re.sub is ``count``, so passing re.I | re.A there capped the
        # number of replacements at 258 and applied no flags at all.
        cleaned = re.sub(r"[^0-9a-zA-Z\s]", "", sentence, flags=re.I | re.A)
        tokens = wpt.tokenize(cleaned.lower().strip())
        output.append(" ".join(token for token in tokens if token not in stop_words))
    return " ".join(output)

# Clean every video title and keep the result in a new "prepped" column.
df["prepped"] = df["video_title"].apply(preprocess)
df.head(5)

Unnamed: 0,video_title,channel_name,time,app,prepped
1,–ñ–±—É—Ä–ª—è—é,–•–∞—Ä—Ü–∏–∑–∏,2023-04-04 14:59:28.805000+03:00,YouTube Music,
2,–õ—ñ—Ö—Ç–∞—Ä,Rohata Zhaba,2023-04-04 14:25:11.177000+03:00,YouTube,
3,Drinker's Chasers - ANOTHER Rey Skywalker Movie?!,Critical Drinker After Hours,2023-04-04 14:12:17.193000+03:00,YouTube,drinkers chasers another rey skywalker movie
4,Test Your English Vocabulary: SHAPES & PATTERNS,Learn English with Gill ¬∑ engVid,2023-04-04 14:06:00.179000+03:00,YouTube,test english vocabulary shapes patterns
5,Finland joins NATO in historic shift prompted ...,FRANCE 24 English,2023-04-04 14:02:41.452000+03:00,YouTube,finland joins nato historic shift prompted ukr...
