# Classify your YouTube history

## Prepare data

In [2]:
import pandas as pd
import numpy as np
import re
import nltk

from preparation import prepare_data
from select_by_date_interval import select_by_date_interval

In [28]:
PATH = "C:/Users/San/Documents/CS projects/yt_activity_analysis/data/Takeout/YouTube and YouTube Music/history/watch-history.json"
df = prepare_data(PATH)

start_date = "2023-01-01"
# end_date = "2022-11-01"
df = select_by_date_interval(df, start=start_date)
# df = df.loc[df["app"] == "YouTube Music"] # if you wanna work only with YT Music data
df

Unnamed: 0,video_title,channel_name,time,app
1,Жбурляю,Харцизи,2023-04-04 14:59:28.805000+03:00,YouTube Music
2,Ліхтар,Rohata Zhaba,2023-04-04 14:25:11.177000+03:00,YouTube
3,Drinker's Chasers - ANOTHER Rey Skywalker Movie?!,Critical Drinker After Hours,2023-04-04 14:12:17.193000+03:00,YouTube
4,Test Your English Vocabulary: SHAPES & PATTERNS,Learn English with Gill · engVid,2023-04-04 14:06:00.179000+03:00,YouTube
5,Finland joins NATO in historic shift prompted ...,FRANCE 24 English,2023-04-04 14:02:41.452000+03:00,YouTube
...,...,...,...,...
4391,Assassin's Creed Odyssey - Before You Buy,gameranx,2023-01-01 10:02:05.355000+02:00,YouTube
4392,#Ukraine's spy chief tells ABC News there will...,ABC News,2023-01-01 09:44:58.841000+02:00,YouTube
4393,Russian missile zooms over Kyiv before being s...,The Sun,2023-01-01 09:43:07.927000+02:00,YouTube
4394,YARMAK FT. TOF - МОЯ КРАЇНА,Yarmak Music,2023-01-01 00:07:14.838000+02:00,YouTube


Let's think about possible categories:
- music - it's the easiest one because I can do it without any classification algo by treating every video in YouTube Music as a song. However, you'll need to handle songs you listened in YouTube app. Maybe just check if video_title or channel_name happened already in YouTube Music. If so, then it's a song as well
- entertainment - games, movies, books, and other not so productive stuff
- enlightment/learning - english, math, cs, books, and other subjects
- politics - war, inner policies, and so on

However, what if some videos combine different categories? For example, a history video can be both entertaining and enlightning

Let's start with deciding whether the video is a song or not

In [4]:
df["app"].value_counts()

YouTube          2507
YouTube Music    1888
Name: app, dtype: int64

In [29]:
def if_YT_Music(row):
    if row["app"] == "YouTube Music":
        category = "Music"
    else:
        category = "Not Music"
    return category

def if_in_YT_Music(row):
    # if the channel_name or video_title were encountered in
    # YT Music app before, assign "Music" category to the video
    category = row["category"]
    if row["category"] == "Not Music":
        if row["channel_name"] in music_df["channel_name"].values:
            category = "Music"
        elif row["video_title"] in music_df["video_title"].values:
            category = "Music"
    return category

df["category"] = df.apply(if_YT_Music, axis=1)
music_df = df.loc[df["app"] == "YouTube Music"]
df["category"] = df.apply(if_in_YT_Music, axis=1)
df.head(3)

Unnamed: 0,video_title,channel_name,time,app,category
1,Жбурляю,Харцизи,2023-04-04 14:59:28.805000+03:00,YouTube Music,Music
2,Ліхтар,Rohata Zhaba,2023-04-04 14:25:11.177000+03:00,YouTube,Not Music
3,Drinker's Chasers - ANOTHER Rey Skywalker Movie?!,Critical Drinker After Hours,2023-04-04 14:12:17.193000+03:00,YouTube,Not Music


Well, with several rules (that is a Rule-based system or smt), I got 129 more videos classified as songs. However, there are game/movie soundtrack collections classified as 'Not Music'

In [30]:
df["category"].value_counts()

Not Music    2378
Music        2017
Name: category, dtype: int64

In [34]:
def preprocess(text):
    sentences = re.split("[\n\t]", text)
    # remove empty lines
    sentences = [sentence for sentence in sentences if sentence]
    # further cleaning
    sentences = [re.sub(r"[^0-9a-zA-Z\s]", "", sentence, re.I|re.A).lower() for sentence in sentences]
    sentences = [sentence.lower().strip() for sentence in sentences]
    wpt = nltk.WordPunctTokenizer()
    stop_words = nltk.corpus.stopwords.words("english")
    output = []
    for sentence in sentences:
        tokens = wpt.tokenize(sentence)
        filtered_tokens = [token for token in tokens if token not in stop_words]
        output.append(" ".join(filtered_tokens))
    return " ".join(output)

df["preproc"] = df["video_title"].apply(preprocess)
# if we have print 'preproc' col, we'll see that non-english text is not properly handled
# After several failed attempts at handling non-english text, it was decided to drop rows
# where 'preproc' col has fewer than 4 chars
df = df[df['preproc'].str.len() >= 4]
df.head(3)

Unnamed: 0,video_title,channel_name,time,app,category,preproc
3,Drinker's Chasers - ANOTHER Rey Skywalker Movie?!,Critical Drinker After Hours,2023-04-04 14:12:17.193000+03:00,YouTube,Not Music,drinkers chasers another rey skywalker movie
4,Test Your English Vocabulary: SHAPES & PATTERNS,Learn English with Gill · engVid,2023-04-04 14:06:00.179000+03:00,YouTube,Not Music,test english vocabulary shapes patterns
5,Finland joins NATO in historic shift prompted ...,FRANCE 24 English,2023-04-04 14:02:41.452000+03:00,YouTube,Not Music,finland joins nato historic shift prompted ukr...


# CLASSIFICATION WITH ALREADY PREPARED SAMPLE

In [25]:
df = pd.read_csv('preproc_01Jan23_to_04Apr23_data.csv', index_col=0)
df.tail(3)

FileNotFoundError: [Errno 2] No such file or directory: 'sample_1000.csv'

In [None]:
df["category"].value_counts()

Music       497
Rec         195
Politics    158
Studies     120
Sport        30
Name: category, dtype: int64

In [None]:
# df['target_label'] = pd.factorize(df['category'])[0]
df["target_label"] = df["category"].map({
    "Music": 0,
    "Rec": 1,
    "Politics": 2,
    "Studies": 3,
    "Sport": 4
})
df.tail(3)

Unnamed: 0,video_title,channel_name,app,category,prepped,time,target_label
997,Would I Like to Visit Ukraine? - April Q&A,Jake Broe,YouTube,Politics,would like visit ukraine april qa,2023-04-04 10:47:03.150000+03:00,2
998,Deep Rock Galactic - 5th Anniversary Space Rig...,Thai,YouTube,Music,deep rock galactic 5th anniversary space rig m...,2023-04-04 12:47:36.411000+03:00,0
999,Drinker's Chasers - ANOTHER Rey Skywalker Movie?!,Critical Drinker After Hours,YouTube,Rec,drinkers chasers another rey skywalker movie,2023-04-04 14:12:17.193000+03:00,1


In [None]:
df["target_label"].value_counts()

0    497
1    195
2    158
3    120
4     30
Name: target_label, dtype: int64

At last, time to try classification

In [None]:
from sklearn.model_selection import train_test_split

train_corpus, test_corpus, train_label_nums, test_label_nums, train_label_names, test_label_names = train_test_split(
    np.array(df['preproc']), np.array(df['target_label']), np.array(df['category']), test_size=0.2, random_state=0)
train_corpus.shape, test_corpus.shape

((800,), (200,))

In [None]:
from sklearn.model_selection import cross_val_score

from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(min_df=0., max_df=1., norm='l2', use_idf=True, smooth_idf=True)
tv_train_features = tv.fit_transform(train_corpus)
tv_test_features = tv.transform(test_corpus)

In [None]:
from sklearn.svm import LinearSVC
svm = LinearSVC(penalty='l2', C=1, random_state=0)
svm.fit(tv_train_features, train_label_names)
svm_bow_tv_scores = cross_val_score(svm, tv_train_features, train_label_names, cv=5)
svm_bow_tv_mean_score = np.mean(svm_bow_tv_scores)

print('CV Accuracy (5-fold):', svm_bow_tv_scores)
print('Mean CV Accuracy:', svm_bow_tv_mean_score)
svm_bow_test_score = svm.score(tv_test_features, test_label_names)
print('Test Accuracy:', svm_bow_test_score)

CV Accuracy (5-fold): [0.84375 0.81875 0.80625 0.7875  0.79375]
Mean CV Accuracy: 0.8099999999999999
Test Accuracy: 0.795


In [None]:
from sklearn.metrics import classification_report
# predict labels for the test set
svm_predictions = svm.predict(tv_test_features)
# get the unique classes
unique_classes = list(set(test_label_names))
# print the classification report
print(classification_report(test_label_names, svm_predictions, labels=unique_classes))

              precision    recall  f1-score   support

     Studies       0.69      0.48      0.56        23
    Politics       0.80      0.67      0.73        48
       Sport       1.00      0.67      0.80         6
         Rec       0.80      0.88      0.84        42
       Music       0.80      0.93      0.86        81

    accuracy                           0.80       200
   macro avg       0.82      0.72      0.76       200
weighted avg       0.79      0.80      0.79       200



In [None]:
from sklearn.metrics import classification_report
# predict labels for the test set
svm_predictions = svm.predict(tv_test_features)
# get the unique classes
unique_classes = list(set(test_label_names))
# print the classification report
print(classification_report(test_label_names, svm_predictions, labels=unique_classes))

              precision    recall  f1-score   support

     Studies       0.82      0.90      0.86        20
    Politics       0.88      0.67      0.76        21
       Sport       1.00      1.00      1.00         4
         Rec       0.89      0.83      0.86        29
       Music       0.93      1.00      0.96        66

    accuracy                           0.90       140
   macro avg       0.90      0.88      0.89       140
weighted avg       0.90      0.90      0.90       140



Now let's see what labels were assigned in df format

In [None]:
# Assuming you have already trained your model and obtained the predicted labels
predicted_labels = svm.predict(tv_test_features)

# Create a new dataframe that contains the test data and the predicted labels
test_df = pd.DataFrame({'video_title': test_corpus, 'category': test_label_names, 'actual_label': test_label_nums, 'predicted_label': predicted_labels})

# Print the first 10 rows of the new dataframe
test_df.head(10)

Unnamed: 0,video_title,category,actual_label,predicted_label
0,happens arthur sawedoff shotgun instead revolver,Rec,1,Music
1,hero ages brandon sanderson stick landing part,Rec,1,Rec
2,let go frozensoundtrack version,Music,0,Music
3,kids song,Music,0,Music
4,one final effort,Music,0,Music
5,bad boys theme cops,Music,0,Music
6,sector,Music,0,Music
7,day 4 best full body yoga stretch 30 days yoga,Sport,4,Sport
8,containers vs vms whats difference,Studies,3,Studies
9,sonne,Music,0,Music


Select 100 rows from the big df of like 3-4k rows to test how the model performs in production or whatever it's called

In [None]:
# new rows were gotten with df.sample(n=600, random_state=42)
# random_state=42 because I want to get the same sample
# get the features for the new videos using the trained vectorizer
new_features = tv.transform(new_rows["preproc"])
# predict the category of the new videos using the trained LinearSVC
predicted_labels = svm.predict(new_features)
# add the predicted labels to the new_videos dataframe
new_rows['category'] = predicted_labels
new_rows

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_rows['category'] = predicted_labels


Unnamed: 0,video_title,channel_name,app,category,prepped
600,Sector,Daniel Deluxe,YouTube Music,Music,sector
601,"Let's talk about the tech layoffs, objectively.",Karolina Sowinska,YouTube,Politics,lets talk tech layoffs objectively
602,Defence strategy for small nations - force des...,Perun,YouTube,Politics,defence strategy small nations force design fr...
603,Berserkir,Danheim,YouTube Music,Music,berserkir
604,SadSvit - Силуети (feat. СТРУКТУРА ЩАСТЯ) Lyri...,SadSvit,YouTube Music,Music,sadsvit feat lyric video
...,...,...,...,...,...
695,Berserkir,Danheim,YouTube Music,Music,berserkir
696,Let's talk about Abrams approval and timelines...,Beau of the Fifth Column,YouTube,Politics,lets talk abrams approval timelines
697,Covert Operations,Adam Schneider,YouTube Music,Music,covert operations
698,What is NLP (Natural Language Processing)?,IBM Technology,YouTube,Studies,nlp natural language processing


In [None]:
# new_rows.to_csv("tried_600-700.csv")