# Classify your YouTube history

## Prepare data

In [1]:
import pandas as pd
import numpy as np
import re
import nltk

from preparation import prepare_data
from select_by_date_interval import select_by_date_interval

In [2]:
PATH = "C:/Users/San/Documents/CS projects/yt_activity_analysis/data/Takeout/YouTube and YouTube Music/history/watch-history.json"
df = prepare_data(PATH)

start_date = "2023-01-01"
# end_date = "2022-11-01"
df = select_by_date_interval(df, start=start_date)
# df = df.loc[df["app"] == "YouTube Music"] # if you wanna work only with YT Music data
df

Unnamed: 0,video_title,channel_name,time,app
1,Жбурляю,Харцизи,2023-04-04 14:59:28.805000+03:00,YouTube Music
2,Ліхтар,Rohata Zhaba,2023-04-04 14:25:11.177000+03:00,YouTube
3,Drinker's Chasers - ANOTHER Rey Skywalker Movie?!,Critical Drinker After Hours,2023-04-04 14:12:17.193000+03:00,YouTube
4,Test Your English Vocabulary: SHAPES & PATTERNS,Learn English with Gill · engVid,2023-04-04 14:06:00.179000+03:00,YouTube
5,Finland joins NATO in historic shift prompted ...,FRANCE 24 English,2023-04-04 14:02:41.452000+03:00,YouTube
...,...,...,...,...
4391,Assassin's Creed Odyssey - Before You Buy,gameranx,2023-01-01 10:02:05.355000+02:00,YouTube
4392,#Ukraine's spy chief tells ABC News there will...,ABC News,2023-01-01 09:44:58.841000+02:00,YouTube
4393,Russian missile zooms over Kyiv before being s...,The Sun,2023-01-01 09:43:07.927000+02:00,YouTube
4394,YARMAK FT. TOF - МОЯ КРАЇНА,Yarmak Music,2023-01-01 00:07:14.838000+02:00,YouTube


Let's start with deciding whether the video is a song or not

In [46]:
df["app"].value_counts()

YouTube          2507
YouTube Music    1888
Name: app, dtype: int64

In [3]:
def if_YT_Music(row):
    if row["app"] == "YouTube Music":
        category = "Music"
    else:
        category = "Not Music"
    return category

def if_in_YT_Music(row):
    # if the channel_name or video_title were encountered in
    # YT Music app before, assign "Music" category to the video
    category = row["category"]
    if row["category"] == "Not Music":
        if row["channel_name"] in music_df["channel_name"].values:
            category = "Music"
        elif row["video_title"] in music_df["video_title"].values:
            category = "Music"
    return category

df["category"] = df.apply(if_YT_Music, axis=1)
music_df = df.loc[df["app"] == "YouTube Music"]
df["category"] = df.apply(if_in_YT_Music, axis=1)
df.head(3)

Unnamed: 0,video_title,channel_name,time,app,category
1,Жбурляю,Харцизи,2023-04-04 14:59:28.805000+03:00,YouTube Music,Music
2,Ліхтар,Rohata Zhaba,2023-04-04 14:25:11.177000+03:00,YouTube,Not Music
3,Drinker's Chasers - ANOTHER Rey Skywalker Movie?!,Critical Drinker After Hours,2023-04-04 14:12:17.193000+03:00,YouTube,Not Music


Well, with several rules (that is a Rule-based system or smt), I got 129 more videos classified as songs. However, there are game/movie soundtrack collections classified as 'Not Music'

In [48]:
df["category"].value_counts()

Not Music    2378
Music        2017
Name: category, dtype: int64

In [4]:
def preprocess(text):
    sentences = re.split("[\n\t]", text)
    # remove empty lines
    sentences = [sentence for sentence in sentences if sentence]
    # further cleaning
    sentences = [re.sub(r"[^0-9a-zA-Z\s]", "", sentence, re.I|re.A).lower() for sentence in sentences]
    sentences = [sentence.lower().strip() for sentence in sentences]
    wpt = nltk.WordPunctTokenizer()
    stop_words = nltk.corpus.stopwords.words("english")
    output = []
    for sentence in sentences:
        tokens = wpt.tokenize(sentence)
        filtered_tokens = [token for token in tokens if token not in stop_words]
        output.append(" ".join(filtered_tokens))
    return " ".join(output)

df["preproc"] = df["video_title"].apply(preprocess)
# if we have print 'preproc' col, we'll see that non-english text is not properly handled
# After several failed attempts at handling non-english text, it was decided to drop rows
# where 'preproc' col has fewer than 4 chars
print(f"Before dropping: {len(df)}")
empty_rows = df[df['preproc'].str.len() < 4]
df = df[df['preproc'].str.len() >= 4]
print(f"After: {len(df)}")
df.head(3)

Before dropping: 4395
After: 3404


Unnamed: 0,video_title,channel_name,time,app,category,preproc
3,Drinker's Chasers - ANOTHER Rey Skywalker Movie?!,Critical Drinker After Hours,2023-04-04 14:12:17.193000+03:00,YouTube,Not Music,drinkers chasers another rey skywalker movie
4,Test Your English Vocabulary: SHAPES & PATTERNS,Learn English with Gill · engVid,2023-04-04 14:06:00.179000+03:00,YouTube,Not Music,test english vocabulary shapes patterns
5,Finland joins NATO in historic shift prompted ...,FRANCE 24 English,2023-04-04 14:02:41.452000+03:00,YouTube,Not Music,finland joins nato historic shift prompted ukr...


In [6]:
empty_rows.head(3)

Unnamed: 0,video_title,channel_name,time,app,category,preproc
1,Жбурляю,Харцизи,2023-04-04 14:59:28.805000+03:00,YouTube Music,Music,
2,Ліхтар,Rohata Zhaba,2023-04-04 14:25:11.177000+03:00,YouTube,Not Music,
9,Мавка,Authentix,2023-04-04 12:57:54.594000+03:00,YouTube,Not Music,


In [8]:
empty_rows["combined"] = empty_rows["video_title"] + " " + empty_rows["channel_name"]
empty_rows["combined_preproc"] = empty_rows["combined"].apply(preprocess)
empty_rows.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  empty_rows["combined"] = empty_rows["video_title"] + " " + empty_rows["channel_name"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  empty_rows["combined_preproc"] = empty_rows["combined"].apply(preprocess)


Unnamed: 0,video_title,channel_name,time,app,category,preproc,combined,combined_preproc
1,Жбурляю,Харцизи,2023-04-04 14:59:28.805000+03:00,YouTube Music,Music,,Жбурляю Харцизи,
2,Ліхтар,Rohata Zhaba,2023-04-04 14:25:11.177000+03:00,YouTube,Not Music,,Ліхтар Rohata Zhaba,rohata zhaba
9,Мавка,Authentix,2023-04-04 12:57:54.594000+03:00,YouTube,Not Music,,Мавка Authentix,authentix
10,Харцизи - Забуті боги,Харцизи,2023-04-04 12:50:32.692000+03:00,YouTube,Music,,Харцизи - Забуті боги Харцизи,
13,Злива,Харцизи,2023-04-04 12:44:35.400000+03:00,YouTube,Music,,Злива Харцизи,


In [10]:
len(empty_rows[empty_rows['combined_preproc'].str.len() >= 4])

610

In [26]:
sec_failure = empty_rows[empty_rows['combined_preproc'].str.len() < 4].drop(["preproc"], axis=1)
sec_failure

Unnamed: 0,video_title,channel_name,time,app,category,combined,combined_preproc
1,Жбурляю,Харцизи,2023-04-04 14:59:28.805000+03:00,YouTube Music,Music,Жбурляю Харцизи,
10,Харцизи - Забуті боги,Харцизи,2023-04-04 12:50:32.692000+03:00,YouTube,Music,Харцизи - Забуті боги Харцизи,
13,Злива,Харцизи,2023-04-04 12:44:35.400000+03:00,YouTube,Music,Злива Харцизи,
14,На самоті,Харцизи,2023-04-04 12:41:26.109000+03:00,YouTube,Music,На самоті Харцизи,
15,Глибоке небо,Харцизи,2023-04-04 12:37:56.562000+03:00,YouTube,Music,Глибоке небо Харцизи,
...,...,...,...,...,...,...,...
4173,"Консерваторія чи ""консерва"" імені Чайковського...",ШОУБІСИКИ,2023-01-04 23:28:24.540000+02:00,YouTube,Not Music,"Консерваторія чи ""консерва"" імені Чайковського...",
4238,Ніколи? Знову!,ХАС,2023-01-04 09:25:02.756000+02:00,YouTube Music,Music,Ніколи? Знову! ХАС,
4251,Ніколи? Знову!,ХАС,2023-01-03 18:00:17.111000+02:00,YouTube Music,Music,Ніколи? Знову! ХАС,
4310,Сухпай Збройних сил Республіки Корея 한국군 배급,ХЛОПЦІ З ЛІСУ,2023-01-02 21:39:10.283000+02:00,YouTube,Not Music,Сухпай Збройних сил Республіки Корея 한국군 배급 ХЛ...,


In [27]:
sec_failure["category"].value_counts()

Not Music    270
Music        111
Name: category, dtype: int64

In [29]:
sec_failure.to_csv("381_rows_2nd_preproc_failure.csv")

In [30]:
sec_failure["channel_name"].value_counts()

Цензор.НЕТ          62
Харцизи             46
ШОУБІСИКИ           36
Тартак              28
ХАС                 28
                    ..
Дети Фристайла       1
АНТИТІЛА             1
Андрій Павленко      1
Конь в пальто        1
Сметанин Василий     1
Name: channel_name, Length: 69, dtype: Int64

From those 381 rows, 1/3 is music, and the rest is mostly relating to 'Politics' category with tiny bit connected to 'Rec' category

In [13]:
new = empty_rows[empty_rows['combined_preproc'].str.len() >= 4].drop(["preproc"], axis=1)
new

Unnamed: 0,video_title,channel_name,time,app,category,combined,combined_preproc
2,Ліхтар,Rohata Zhaba,2023-04-04 14:25:11.177000+03:00,YouTube,Not Music,Ліхтар Rohata Zhaba,rohata zhaba
9,Мавка,Authentix,2023-04-04 12:57:54.594000+03:00,YouTube,Not Music,Мавка Authentix,authentix
23,Фортеця Бахмут,Antytila,2023-04-04 11:52:49.239000+03:00,YouTube Music,Music,Фортеця Бахмут Antytila,antytila
29,ВЛАДЛЕН ТАТАРСЬКИЙ ЗАЖМУРИВСЯ - ОЛДСКУЛЬНИЙ РО...,Old School,2023-04-03 23:42:34.126000+03:00,YouTube,Not Music,ВЛАДЛЕН ТАТАРСЬКИЙ ЗАЖМУРИВСЯ - ОЛДСКУЛЬНИЙ РО...,old school
50,"""Ось наші Леопарди!"": танкісти і піхота 92-ї О...",BIHUS Info,2023-04-03 22:03:11.147000+03:00,YouTube,Not Music,"""Ось наші Леопарди!"": танкісти і піхота 92-ї О...",92 bihus info
...,...,...,...,...,...,...,...
4250,Силуети,SadSvit,2023-01-03 18:03:09.811000+02:00,YouTube Music,Music,Силуети SadSvit,sadsvit
4256,#КОЗАЦЬКОМУ_РОДУ,Jerry Heil,2023-01-03 17:34:33.059000+02:00,YouTube Music,Music,#КОЗАЦЬКОМУ_РОДУ Jerry Heil,jerry heil
4257,"Тихо прийшов, тихо пішов або пісня спеціальног...",Riffmaster,2023-01-03 17:29:21.578000+02:00,YouTube Music,Music,"Тихо прийшов, тихо пішов або пісня спеціальног...",riffmaster
4346,Як військові відреагували на звернення Зеленсь...,Ukrainian Witness,2023-01-02 08:53:54.148000+02:00,YouTube,Not Music,Як військові відреагували на звернення Зеленсь...,ukrainian witness


In [16]:
new["category"].value_counts()

Music        477
Not Music    133
Name: category, dtype: int64

In [14]:
new.to_csv("610_additional.csv")

In [18]:
maj_ua_610 = pd.read_csv("610_additional.csv", index_col=0)
maj_ua_610

Unnamed: 0,video_title,channel_name,time,app,category,combined,combined_preproc
2,Ліхтар,Rohata Zhaba,2023-04-04 14:25:11.177000+03:00,YouTube,Music,Ліхтар Rohata Zhaba,rohata zhaba
9,Мавка,Authentix,2023-04-04 12:57:54.594000+03:00,YouTube,Music,Мавка Authentix,authentix
23,Фортеця Бахмут,Antytila,2023-04-04 11:52:49.239000+03:00,YouTube Music,Music,Фортеця Бахмут Antytila,antytila
29,ВЛАДЛЕН ТАТАРСЬКИЙ ЗАЖМУРИВСЯ - ОЛДСКУЛЬНИЙ РО...,Old School,2023-04-03 23:42:34.126000+03:00,YouTube,Politics,ВЛАДЛЕН ТАТАРСЬКИЙ ЗАЖМУРИВСЯ - ОЛДСКУЛЬНИЙ РО...,old school
50,"""Ось наші Леопарди!"": танкісти і піхота 92-ї О...",BIHUS Info,2023-04-03 22:03:11.147000+03:00,YouTube,Politics,"""Ось наші Леопарди!"": танкісти і піхота 92-ї О...",92 bihus info
...,...,...,...,...,...,...,...
4250,Силуети,SadSvit,2023-01-03 18:03:09.811000+02:00,YouTube Music,Music,Силуети SadSvit,sadsvit
4256,#КОЗАЦЬКОМУ_РОДУ,Jerry Heil,2023-01-03 17:34:33.059000+02:00,YouTube Music,Music,#КОЗАЦЬКОМУ_РОДУ Jerry Heil,jerry heil
4257,"Тихо прийшов, тихо пішов або пісня спеціальног...",Riffmaster,2023-01-03 17:29:21.578000+02:00,YouTube Music,Music,"Тихо прийшов, тихо пішов або пісня спеціальног...",riffmaster
4346,Як військові відреагували на звернення Зеленсь...,Ukrainian Witness,2023-01-02 08:53:54.148000+02:00,YouTube,Politics,Як військові відреагували на звернення Зеленсь...,ukrainian witness


In [20]:
maj_ua_610["category"].value_counts()

Music       497
Politics     86
Rec          23
Studies       3
Sport         1
Name: category, dtype: int64

In [21]:
maj_ua_610["category"].value_counts(normalize=True)

Music       0.814754
Politics    0.140984
Rec         0.037705
Studies     0.004918
Sport       0.001639
Name: category, dtype: float64

In [25]:
df["category"].value_counts()

Music       497
Rec         195
Politics    158
Studies     120
Sport        30
Name: category, dtype: int64

In [24]:
df["category"].value_counts(normalize=True)

Music       0.497
Rec         0.195
Politics    0.158
Studies     0.120
Sport       0.030
Name: category, dtype: float64

In [11]:
len(empty_rows[empty_rows['combined_preproc'].str.len() < 4])

381

Without combining 'video_title' and 'channel_name' cols, we lose 991 rows. Let's see how many rows we're gonna lose when these cols are combined

In [62]:
df["combined"] = df["video_title"] + " " + df["channel_name"]
df["combined_preproc"] = df["combined"].apply(preprocess)
df.head(3)

Unnamed: 0,video_title,channel_name,time,app,category,combined,combined_preproc
1,Жбурляю,Харцизи,2023-04-04 14:59:28.805000+03:00,YouTube Music,Music,Жбурляю Харцизи,
2,Ліхтар,Rohata Zhaba,2023-04-04 14:25:11.177000+03:00,YouTube,Not Music,Ліхтар Rohata Zhaba,rohata zhaba
3,Drinker's Chasers - ANOTHER Rey Skywalker Movie?!,Critical Drinker After Hours,2023-04-04 14:12:17.193000+03:00,YouTube,Not Music,Drinker's Chasers - ANOTHER Rey Skywalker Movie?! Critical Drinker After Hours,drinkers chasers another rey skywalker movie critical drinker hours


In [63]:
print(f"Before dropping: {len(df)}")
df = df[df['combined_preproc'].str.len() >= 4]
print(f"After: {len(df)}")

Before dropping: 4395
After: 4014


In [65]:
df.to_csv("unlabeled_combined_len_4014.csv")

As you can see, if we combine those two cols, we lose significantly fewer rows. Only 381 to be precise

# CLASSIFICATION WITH ALREADY PREPARED SAMPLE

In [22]:
df = pd.read_csv('labeled_sample_1000_elem.csv', index_col=0)
df.tail(3)

Unnamed: 0,video_title,channel_name,app,category,preproc,time
997,Would I Like to Visit Ukraine? - April Q&A,Jake Broe,YouTube,Politics,would like visit ukraine april qa,2023-04-04 10:47:03.150000+03:00
998,Deep Rock Galactic - 5th Anniversary Space Rig...,Thai,YouTube,Music,deep rock galactic 5th anniversary space rig m...,2023-04-04 12:47:36.411000+03:00
999,Drinker's Chasers - ANOTHER Rey Skywalker Movie?!,Critical Drinker After Hours,YouTube,Rec,drinkers chasers another rey skywalker movie,2023-04-04 14:12:17.193000+03:00


Let's think about possible categories:
- music - it's the easiest one because I can do it without any classification algo by treating every video in YouTube Music as a song. However, you'll need to handle songs you listened in YouTube app. Maybe just check if video_title or channel_name happened already in YouTube Music. If so, then it's a song as well
- entertainment - games, movies, books, and other not so productive stuff
- enlightment/learning - english, math, cs, books, and other subjects
- politics - war, inner policies, and so on

However, what if some videos combine different categories? For example, a history video can be both entertaining and enlightning

In [23]:
df["category"].value_counts()

Music       497
Rec         195
Politics    158
Studies     120
Sport        30
Name: category, dtype: int64

In [52]:
# df['target_label'] = pd.factorize(df['category'])[0]
df["target_label"] = df["category"].map({
    "Music": 0,
    "Rec": 1,
    "Politics": 2,
    "Studies": 3,
    "Sport": 4
})
df.tail(3)

Unnamed: 0,video_title,channel_name,app,category,preproc,time,target_label
997,Would I Like to Visit Ukraine? - April Q&A,Jake Broe,YouTube,Politics,would like visit ukraine april qa,2023-04-04 10:47:03.150000+03:00,2
998,Deep Rock Galactic - 5th Anniversary Space Rig Music,Thai,YouTube,Music,deep rock galactic 5th anniversary space rig music,2023-04-04 12:47:36.411000+03:00,0
999,Drinker's Chasers - ANOTHER Rey Skywalker Movie?!,Critical Drinker After Hours,YouTube,Rec,drinkers chasers another rey skywalker movie,2023-04-04 14:12:17.193000+03:00,1


In [53]:
df["target_label"].value_counts()

0    497
1    195
2    158
3    120
4     30
Name: target_label, dtype: int64

At last, time to try classification

In [54]:
from sklearn.model_selection import train_test_split

train_corpus, test_corpus, train_label_nums, test_label_nums, train_label_names, test_label_names = train_test_split(
    np.array(df['preproc']), np.array(df['target_label']), np.array(df['category']), test_size=0.2, random_state=0)
train_corpus.shape, test_corpus.shape

((800,), (200,))

In [55]:
from sklearn.model_selection import cross_val_score

from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(min_df=0., max_df=1., norm='l2', use_idf=True, smooth_idf=True)
tv_train_features = tv.fit_transform(train_corpus)
tv_test_features = tv.transform(test_corpus)

In [56]:
from sklearn.svm import LinearSVC
svm = LinearSVC(penalty='l2', C=1, random_state=0)
svm.fit(tv_train_features, train_label_names)
svm_bow_tv_scores = cross_val_score(svm, tv_train_features, train_label_names, cv=5)
svm_bow_tv_mean_score = np.mean(svm_bow_tv_scores)

print('CV Accuracy (5-fold):', svm_bow_tv_scores)
print('Mean CV Accuracy:', svm_bow_tv_mean_score)
svm_bow_test_score = svm.score(tv_test_features, test_label_names)
print('Test Accuracy:', svm_bow_test_score)

CV Accuracy (5-fold): [0.84375 0.81875 0.80625 0.7875  0.79375]
Mean CV Accuracy: 0.8099999999999999
Test Accuracy: 0.795


In [57]:
from sklearn.metrics import classification_report
# predict labels for the test set
svm_predictions = svm.predict(tv_test_features)
# get the unique classes
unique_classes = list(set(test_label_names))
# print the classification report
print(classification_report(test_label_names, svm_predictions, labels=unique_classes))

              precision    recall  f1-score   support

    Politics       0.80      0.67      0.73        48
       Sport       1.00      0.67      0.80         6
     Studies       0.69      0.48      0.56        23
       Music       0.80      0.93      0.86        81
         Rec       0.80      0.88      0.84        42

    accuracy                           0.80       200
   macro avg       0.82      0.72      0.76       200
weighted avg       0.79      0.80      0.79       200



Now let's see what labels were assigned in df format

In [58]:
# Assuming you have already trained your model and obtained the predicted labels
predicted_labels = svm.predict(tv_test_features)

# Create a new dataframe that contains the test data and the predicted labels
test_df = pd.DataFrame({'video_title': test_corpus, 'category': test_label_names, 'actual_label': test_label_nums, 'predicted_label': predicted_labels})

# Print the first 10 rows of the new dataframe
test_df.head(10)

Unnamed: 0,video_title,category,actual_label,predicted_label
0,ost disposal unitimperium mix,Music,0,Music
1,endless space 2 riftborn prologue,Rec,1,Rec
2,russia cant beat us final episode,Politics,2,Politics
3,mavka forest song official trailer,Rec,1,Rec
4,capybara found human friend introduces whole family,Rec,1,Rec
5,cossack song,Music,0,Music
6,cruel world,Music,0,Music
7,hearts iron iv der machtigste konig im luftrevier german march,Music,0,Music
8,ukraine wins war,Politics,2,Politics
9,yakuza ost baka mitai kiryu full version,Music,0,Music


Select 100 rows from the big df of like 3-4k rows to test how the model performs in production or whatever it's called

In [59]:
# new rows were gotten with df.sample(n=600, random_state=42)
# random_state=42 because I want to get the same sample
# get the features for the new videos using the trained vectorizer
new_features = tv.transform(new_rows["preproc"])
# predict the category of the new videos using the trained LinearSVC
predicted_labels = svm.predict(new_features)
# add the predicted labels to the new_videos dataframe
new_rows['category'] = predicted_labels
new_rows

NameError: name 'new_rows' is not defined

In [None]:
# new_rows.to_csv("tried_600-700.csv")

Let's try to improve model accuracy by combining 'video_title' and 'channel_name'. This way, the model will have more information to work with. Also,  there'll be a higher chance that Ukrainian and other non-english videos won't be ignored. However, this'll be more useful with later because the current dataset has mostly English videos

In [None]:
# video_title_AND_channel_name is too long. Maybe try to use 'info' or 'combined' instead?
df["combined"] = df["video_title"] + " " + df["channel_name"]
df["combined_preproc"] = df["combined"].apply(preprocess)
df.head(3)

Unnamed: 0,video_title,channel_name,app,category,preproc,time,target_label,combined
0,YARMAK FT. TOF - МОЯ КРАЇНА,Yarmak Music,YouTube,Music,yarmak ft tof,2023-01-01 00:07:14.838000+02:00,0,YARMAK FT. TOF - МОЯ КРАЇНА Yarmak Music
1,Russian missile zooms over Kyiv before being shot down just in time,The Sun,YouTube,Politics,russian missile zooms kyiv shot time,2023-01-01 09:43:07.927000+02:00,2,Russian missile zooms over Kyiv before being shot down just in time The Sun
2,Pathfinder: Kingmaker Review,MandaloreGaming,YouTube,Rec,pathfinder kingmaker review,2023-01-01 10:15:11.845000+02:00,1,Pathfinder: Kingmaker Review MandaloreGaming


In [None]:
from sklearn.model_selection import train_test_split

train_corpus, test_corpus, train_label_nums, test_label_nums, train_label_names, test_label_names = train_test_split(
    np.array(df['combined_preproc']), np.array(df['target_label']), np.array(df['category']), test_size=0.2, random_state=0)
train_corpus.shape, test_corpus.shape

((800,), (200,))

In [None]:
from sklearn.model_selection import cross_val_score

from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(min_df=0., max_df=1., norm='l2', use_idf=True, smooth_idf=True)
tv_train_features = tv.fit_transform(train_corpus)
tv_test_features = tv.transform(test_corpus)

In [None]:
from sklearn.svm import LinearSVC
svm = LinearSVC(penalty='l2', C=1, random_state=0)
svm.fit(tv_train_features, train_label_names)
svm_bow_tv_scores = cross_val_score(svm, tv_train_features, train_label_names, cv=5)
svm_bow_tv_mean_score = np.mean(svm_bow_tv_scores)

print('CV Accuracy (5-fold):', svm_bow_tv_scores)
print('Mean CV Accuracy:', svm_bow_tv_mean_score)
svm_bow_test_score = svm.score(tv_test_features, test_label_names)
print('Test Accuracy:', svm_bow_test_score)

CV Accuracy (5-fold): [0.8875  0.85625 0.8375  0.85    0.83125]
Mean CV Accuracy: 0.8525
Test Accuracy: 0.845


In [None]:
from sklearn.metrics import classification_report
# predict labels for the test set
svm_predictions = svm.predict(tv_test_features)
# get the unique classes
unique_classes = list(set(test_label_names))
# print the classification report
print(classification_report(test_label_names, svm_predictions, labels=unique_classes))

              precision    recall  f1-score   support

    Politics       0.84      0.79      0.82        48
       Sport       1.00      0.83      0.91         6
     Studies       0.81      0.57      0.67        23
       Music       0.83      0.93      0.88        81
         Rec       0.86      0.90      0.88        42

    accuracy                           0.84       200
   macro avg       0.87      0.80      0.83       200
weighted avg       0.84      0.84      0.84       200

