In [24]:
import pandas as pd
import re
import nltk
# Make sure the NLTK stop-word corpus is available locally before it is imported below.
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package stopwords to /home/pooya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Load the video statistics CSV; the path is relative to the notebook's working directory.
video_stats_data = pd.read_csv("videos-stats.csv")
# Preview the first rows.
# NOTE(review): the "Unnamed: 0.1" / "Unnamed: 0" columns in the preview look like
# row indexes written out by earlier saves — confirm. Only "Title" and "Keyword"
# are used by the cells below.
video_stats_data.head()

Unnamed: 0.1,Unnamed: 0,Title,Video ID,Published At,Keyword,Likes,Comments,Views
0,0,Apple Pay Is Killing the Physical Wallet After...,wAZZ-UWGVHI,2022-08-23,tech,3407.0,672.0,135612.0
1,1,The most EXPENSIVE thing I own.,b3x28s61q3c,2022-08-24,tech,76779.0,4306.0,1758063.0
2,2,My New House Gaming Setup is SICK!,4mgePWWCAmA,2022-08-23,tech,63825.0,3338.0,1564007.0
3,3,Petrol Vs Liquid Nitrogen | Freezing Experimen...,kXiYSI7H2b0,2022-08-23,tech,71566.0,1426.0,922918.0
4,4,Best Back to School Tech 2022!,ErMwWXQxHp0,2022-08-08,tech,96513.0,5155.0,1855644.0


In [4]:
# Pull the raw title text and its keyword (the classification target) out of
# the frame as plain Python lists.
videos_titles = video_stats_data["Title"].tolist()
keywords = video_stats_data["Keyword"].tolist()

In [5]:
# Keep only the text before the first '|' as the actual video title; whatever
# follows the separator is treated as publisher metadata and dropped.
titles = [raw_title.partition('|')[0] for raw_title in videos_titles]

In [6]:
# Turn every title into a space-separated string of stemmed tokens:
# 1- get all English stop words
# 2- replace non-alphabetic characters with spaces
# 3- make everything lowercase
# 4- remove stop words
# 5- stem the remaining tokens (Porter stemmer)

eng_stopwords = stopwords.words('english')
# Built once, outside the loop: the stop-word set and the stemmer do not change
# between titles, and set membership avoids rescanning the list for every token.
stopword_set = set(eng_stopwords)
stemmer = PorterStemmer()

corpus = []
for vt in titles:
    # Steps 2-3, then split on whitespace to obtain the tokens.
    cleaned_title = re.sub('[^a-zA-Z]', ' ', vt).lower().split()
    # Steps 4-5.
    cleaned_title = [stemmer.stem(token) for token in cleaned_title if token not in stopword_set]
    corpus.append(' '.join(cleaned_title))

In [15]:
# Target: map each keyword string to an integer class id.
lbe = LabelEncoder()
y = lbe.fit_transform(keywords)

# Features: bag-of-words counts over the cleaned titles, with the vocabulary
# capped at 1500 terms, converted to a dense array.
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()

In [21]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Fit a multinomial Naive Bayes classifier (default settings) on the training
# bag-of-words counts and their encoded keyword labels.
mNB = MultinomialNB()
mNB.fit(X_train, y_train)

In [23]:
# Predict the encoded keyword class for every held-out sample.
y_pred = mNB.predict(X_test)

In [25]:
# Accuracy on the held-out set: share of test samples whose predicted class
# equals the true class.
accuracy_score(y_test, y_pred)

0.7851458885941645

In [27]:
# Show the predicted class ids (LabelEncoder codes, not keyword strings) for the test set.
y_pred

array([22, 17, 20, 21, 29, 18,  6, 32, 39, 33,  2, 26, 35, 18, 26,  1, 39,
       15, 23,  4,  6, 20,  4, 32,  5,  2, 15, 11, 33, 15, 27,  5, 29,  4,
       35, 31, 40,  8, 24, 24, 25, 27, 18, 27,  5,  3, 38,  2,  8,  8, 27,
       33, 26, 12,  6, 17, 11, 37, 10, 13, 19, 14, 32,  2, 17, 31, 22, 29,
        1, 37,  7,  1,  1,  8, 21, 23, 22,  2, 11, 39, 26, 15,  9, 26, 20,
       38, 38, 36, 29, 11, 27,  7, 22,  8, 20,  3,  9, 33,  7, 10, 40, 10,
        2, 28, 18, 17,  3,  7, 21, 23, 18,  6, 29,  1, 14, 32,  8,  4, 15,
       31,  8,  7,  9, 24,  5, 19,  4, 23,  9, 26, 40, 15, 18,  9,  3, 20,
       17, 30, 39, 30, 32, 40, 18, 34, 37, 17, 15,  6, 26, 40, 21,  9, 38,
       30, 10, 36, 15, 15,  5, 29, 26, 28, 19, 14,  1, 24, 16,  1, 14,  9,
       17, 33,  5, 10, 22,  1, 14, 34, 34, 26, 18, 33, 33, 14, 12, 37,  2,
       31, 13, 37, 35, 36, 14, 38, 34, 10, 31,  6,  8, 38,  3,  5, 15, 40,
       32,  4, 40, 39, 22, 24,  3, 23,  3, 35,  7, 35, 21, 18, 21, 19, 35,
       18, 30,  3,  2, 11

In [28]:
# Show the true class ids for the test set, for side-by-side comparison with y_pred.
y_test

array([22, 17, 20, 21, 29, 28,  6, 32, 39, 33,  2, 19, 35, 13, 26,  1, 39,
       15, 23,  4,  6, 20,  4, 32,  5,  2, 15, 11, 33, 15, 27,  5, 27,  4,
       35, 13, 19,  8, 24, 24, 25, 27, 18, 34,  5,  3, 16, 16,  8,  8, 27,
       33, 26, 12,  6, 33, 11,  9, 10, 13, 19, 14, 32,  2, 17, 31, 22, 29,
        1, 37,  7,  1,  1, 25, 21, 23, 22,  2, 11, 39, 26, 15,  9, 26, 20,
       16, 38, 36, 29, 11, 27,  7, 22,  8, 20,  3,  9, 13,  7, 10, 40, 10,
        2, 28, 18, 13, 12,  7, 21, 23, 28,  6, 29,  1, 39, 32,  8,  4, 15,
       31,  8,  7,  9, 24,  5, 35,  4, 23,  9, 26, 18, 15, 19,  9,  3, 20,
       17, 34, 39, 30, 32, 40, 18, 34, 37, 17, 31,  6, 12, 40, 21,  9, 38,
       30, 10,  6, 15, 15,  5, 14, 17, 28, 19, 28,  1, 24, 16,  1, 34,  9,
       17, 33,  5, 10, 30,  1, 14, 36, 24, 26, 18, 33, 33, 14, 12, 37,  2,
       20, 13, 16, 35, 36, 14, 38, 27, 10, 31,  6,  8, 38,  3,  5, 15, 40,
       32, 24, 40, 39, 22,  7,  3, 23,  3, 31, 31, 35, 21, 31, 21,  5, 19,
       31, 36,  3,  2, 23

In [29]:
# Bag-of-words count vector of the test sample at index 5.
# NOTE(review): presumably inspected because this sample is misclassified
# (the printed arrays show predicted 18 vs. actual 28 at index 5) — confirm intent.
X_test[5]

array([0, 0, 0, ..., 0, 0, 0])