# Youtube vidoes: Tag (keyword) Generator 

- This notebook aims to generate a tag (keyword) for a youtube video based in its title.
- Vectorization: Bag-Of-Words
- Model: Logistic Regression

In [18]:
import pandas as pd
import re
import numpy as np
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

[nltk_data] Downloading package stopwords to /home/pooya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Data loading

In [19]:
video_stats_data = pd.read_csv("videos-stats.csv")
video_stats_data.head()

Unnamed: 0.1,Unnamed: 0,Title,Video ID,Published At,Keyword,Likes,Comments,Views
0,0,Apple Pay Is Killing the Physical Wallet After...,wAZZ-UWGVHI,2022-08-23,tech,3407.0,672.0,135612.0
1,1,The most EXPENSIVE thing I own.,b3x28s61q3c,2022-08-24,tech,76779.0,4306.0,1758063.0
2,2,My New House Gaming Setup is SICK!,4mgePWWCAmA,2022-08-23,tech,63825.0,3338.0,1564007.0
3,3,Petrol Vs Liquid Nitrogen | Freezing Experimen...,kXiYSI7H2b0,2022-08-23,tech,71566.0,1426.0,922918.0
4,4,Best Back to School Tech 2022!,ErMwWXQxHp0,2022-08-08,tech,96513.0,5155.0,1855644.0


In [20]:
videos_titles = list(video_stats_data["Title"].values)
keywords = list(video_stats_data["Keyword"].values)

#### Get the actual title. The first part of string (before | ) is the video's title. The rest are the publisher metadata that we do not need.

In [21]:
titles = [v_t.split('|')[0] for v_t in videos_titles]

### Cleaning
- Get all english stop words
- Remove non-alphabetic characters
- Make all characters lowercase
- Remove stop words
- Stemming

In [22]:
eng_stopwords = stopwords.words('english')
corpus = []
for vt in titles:
    cleaned_title = re.sub('[^a-zA-Z]', ' ', vt)
    cleaned_title = cleaned_title.lower()
    cleaned_title = cleaned_title.split()
    stemmer = PorterStemmer()
    cleaned_title = [stemmer.stem(token) for token in cleaned_title if not token in set(eng_stopwords)]
    corpus.append(' '.join(cleaned_title))

### Bag-Of-Words

In [23]:
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
lbe = LabelEncoder()
y = lbe.fit_transform(keywords)

### Split

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Logistic Regression Model

In [25]:
lr_reg = LogisticRegression(multi_class='ovr', solver='liblinear')
lr_reg.fit(X_train, y_train)
y_pred = lr_reg.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy is: {}".format(acc))

Accuracy is: 0.7904509283819628


### CAP analysis

In [26]:
model_y = [y for _, y in sorted(zip(y_pred, y_test), reverse=True)]
nb_y = np.append([0], np.cumsum(model_y))
half_x = int((50 * len(y_test) / 100))
cap = nb_y[half_x] * 100 / max(nb_y)
print("CAP: {} %".format(cap))

CAP: 70.97393689986282 %
