# Youtube Video Title Predictor for Dhruv Rathee
A **Machine Learning** `Model` to Predict Title of Youtube Videos of Specifically Dhruv Rathee With Input of `Description` and `No. of Views`.

## Fetch The Data

In [1]:
from fetch_data import fetch
# fetch()

## Load The Data

In [2]:
import pandas as pd
videos = pd.read_csv("dhruv_rathee_videos.csv")
videos.head()

Unnamed: 0,Title,Description,Views,Likes
0,The Dark Reality of World's Fattest Country | ...,The most obese country in the world might surp...,2793372,145552
1,Why Hitler Lost? | World War 2 | Dhruv Rathee,"In the last part of our World War II series, w...",3314998,167680
2,The Truth of Pulwama | Satyapal Malik Allegati...,"Satyapal Malik, a senior politician and ex-gov...",5896099,328429
3,Why World War 2 Happened? | The Real Reason | ...,World War II was a significant event in world ...,4141493,198121
4,Why was Donald Trump Arrested? | Full Case Exp...,Former US President Donald Trump was recently ...,3296577,131908


## Vectorize The Data

In [3]:
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     /home/praddyumnyadav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the data
videos = pd.read_csv("dhruv_rathee_videos.csv")

# Define a function to vectorize the text data using the TfidfVectorizer
def vectorize_text(text, max_length=100):
    # tokenize the text
    tokens = nltk.word_tokenize(text.lower())
    # create a dictionary of word frequencies
    word_freq = nltk.FreqDist(tokens)
    # create a bag-of-words vector
    vector = np.zeros(max_length)
    for i, token in enumerate(tokens):
        if i >= max_length:
            break
        if token in word_freq:
            vector[i] = word_freq[token]
    return vector

# Create a new dataframe that contains the vectorized text data
title_vectors = pd.DataFrame(videos["Title"].apply(lambda x: vectorize_text(x)).tolist())
description_vectors = pd.DataFrame(videos["Description"].apply(lambda x: vectorize_text(x)).tolist())

# Rename the columns of the new dataframes
title_vectors.columns = ["title_" + str(col) for col in title_vectors.columns]
description_vectors.columns = ["description_" + str(col) for col in description_vectors.columns]

# Concatenate the original dataframe with the new dataframes
videos = pd.concat([videos, title_vectors, description_vectors], axis=1)

# Drop the original text columns
videos = videos.drop(["Title", "Description"], axis=1)

# Convert the new columns to float64 data type
videos = videos.astype("float64")

# Print the data types of the dataframe
print(videos.dtypes)

Views             float64
Likes             float64
title_0           float64
title_1           float64
title_2           float64
                   ...   
description_95    float64
description_96    float64
description_97    float64
description_98    float64
description_99    float64
Length: 202, dtype: object


## Info of the DataSet

In [5]:
videos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371 entries, 0 to 370
Columns: 202 entries, Views to description_99
dtypes: float64(202)
memory usage: 585.6 KB


In [6]:
videos.describe()

Unnamed: 0,Views,Likes,title_0,title_1,title_2,title_3,title_4,title_5,title_6,title_7,...,description_90,description_91,description_92,description_93,description_94,description_95,description_96,description_97,description_98,description_99
count,371.0,371.0,371.0,371.0,371.0,371.0,371.0,371.0,371.0,371.0,...,371.0,371.0,371.0,371.0,371.0,371.0,371.0,371.0,371.0,371.0
mean,3335111.0,183855.299191,1.02965,1.048518,1.09973,1.188679,1.242588,1.161725,1.145553,1.105121,...,10.315364,10.504043,11.48248,11.121294,11.161725,10.552561,11.541779,11.407008,12.541779,12.142857
std,2364448.0,113729.54529,0.169848,0.227363,0.407066,0.489877,0.545664,0.4541,0.471169,0.517009,...,18.508004,17.919847,18.409504,18.371507,18.423567,18.204375,18.79765,18.976482,19.418304,18.871359
min,54558.0,1424.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1612798.0,101329.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
50%,2826966.0,166028.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,2.0,2.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
75%,4581922.0,239584.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,5.0,6.0,8.0,7.0,7.0,6.5,7.5,6.5,9.0,10.0
max,12864340.0,718797.0,2.0,3.0,4.0,3.0,4.0,4.0,4.0,4.0,...,80.0,62.0,73.0,79.0,67.0,79.0,65.0,71.0,67.0,79.0


## Split Input and Output from the DataSet

In [7]:
titles = []
for i in range(100):
    titles.append(f"title_{i}")

# Store the remaining columns in 'X'
X = videos.drop(titles, axis=1)

# Split the 'Title' column into a separate variable 'y'
y = videos[titles]

In [8]:
X.head()

Unnamed: 0,Views,Likes,description_0,description_1,description_2,description_3,description_4,description_5,description_6,description_7,...,description_90,description_91,description_92,description_93,description_94,description_95,description_96,description_97,description_98,description_99
0,2793372.0,145552.0,7.0,1.0,1.0,2.0,5.0,7.0,1.0,1.0,...,2.0,4.0,7.0,3.0,2.0,45.0,16.0,45.0,2.0,1.0
1,3314998.0,167680.0,3.0,5.0,1.0,2.0,3.0,1.0,3.0,3.0,...,16.0,43.0,1.0,1.0,1.0,6.0,1.0,1.0,1.0,6.0
2,5896099.0,328429.0,1.0,1.0,5.0,1.0,1.0,1.0,3.0,1.0,...,13.0,1.0,1.0,2.0,1.0,2.0,2.0,7.0,1.0,3.0
3,4141493.0,198121.0,3.0,4.0,1.0,1.0,1.0,1.0,1.0,5.0,...,1.0,4.0,1.0,1.0,1.0,4.0,1.0,1.0,1.0,1.0
4,3296577.0,131908.0,2.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,...,3.0,1.0,2.0,4.0,1.0,8.0,1.0,1.0,1.0,2.0


In [9]:
y.head()

Unnamed: 0,title_0,title_1,title_2,title_3,title_4,title_5,title_6,title_7,title_8,title_9,...,title_90,title_91,title_92,title_93,title_94,title_95,title_96,title_97,title_98,title_99
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Split `Train` and `Test` Set

In [10]:
from sklearn.model_selection import train_test_split

# Split Dataset Into Train and Test Set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Take Info of Both `Train` and `Test` Set

In [11]:
X_train.head()

Unnamed: 0,Views,Likes,description_0,description_1,description_2,description_3,description_4,description_5,description_6,description_7,...,description_90,description_91,description_92,description_93,description_94,description_95,description_96,description_97,description_98,description_99
192,1695648.0,117018.0,2.0,8.0,2.0,3.0,1.0,1.0,1.0,1.0,...,1.0,4.0,1.0,4.0,1.0,3.0,2.0,1.0,1.0,1.0
75,6060899.0,325096.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,...,1.0,2.0,4.0,2.0,2.0,1.0,3.0,1.0,3.0,2.0
84,2860754.0,137997.0,5.0,4.0,5.0,1.0,5.0,1.0,1.0,1.0,...,1.0,1.0,5.0,1.0,4.0,1.0,2.0,1.0,3.0,1.0
359,208711.0,13237.0,7.0,1.0,1.0,2.0,7.0,1.0,7.0,1.0,...,80.0,1.0,52.0,52.0,52.0,52.0,52.0,52.0,52.0,52.0
16,2035775.0,110762.0,1.0,3.0,2.0,2.0,5.0,1.0,4.0,1.0,...,52.0,52.0,52.0,52.0,52.0,52.0,52.0,52.0,52.0,52.0


In [12]:
y_train.head()

Unnamed: 0,title_0,title_1,title_2,title_3,title_4,title_5,title_6,title_7,title_8,title_9,...,title_90,title_91,title_92,title_93,title_94,title_95,title_96,title_97,title_98,title_99
192,1.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
359,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
X_test.head()

Unnamed: 0,Views,Likes,description_0,description_1,description_2,description_3,description_4,description_5,description_6,description_7,...,description_90,description_91,description_92,description_93,description_94,description_95,description_96,description_97,description_98,description_99
327,1138893.0,57851.0,2.0,2.0,1.0,2.0,1.0,1.0,4.0,4.0,...,1.0,1.0,3.0,1.0,7.0,4.0,1.0,4.0,1.0,1.0
33,5666776.0,213287.0,1.0,1.0,2.0,5.0,2.0,1.0,1.0,5.0,...,1.0,1.0,1.0,52.0,52.0,52.0,52.0,52.0,52.0,52.0
15,5914905.0,239606.0,1.0,7.0,5.0,1.0,4.0,1.0,1.0,1.0,...,3.0,2.0,2.0,1.0,1.0,1.0,5.0,1.0,1.0,1.0
314,816234.0,50920.0,7.0,1.0,1.0,2.0,3.0,1.0,2.0,9.0,...,3.0,1.0,1.0,8.0,1.0,8.0,1.0,1.0,16.0,1.0
57,7154900.0,365901.0,1.0,3.0,1.0,1.0,6.0,1.0,1.0,1.0,...,3.0,8.0,3.0,2.0,2.0,5.0,1.0,1.0,2.0,5.0


In [14]:
y_test.head()

Unnamed: 0,title_0,title_1,title_2,title_3,title_4,title_5,title_6,title_7,title_8,title_9,...,title_90,title_91,title_92,title_93,title_94,title_95,title_96,title_97,title_98,title_99
327,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
314,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
57,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Test Different **Machine Learning** Models.

## Try `Ridge Regression` Model

In [15]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# create a Ridge regression object with regularization parameter alpha=1.0
ridge = Ridge(alpha=1.0)

# fit the model on the training data
ridge.fit(X_train, y_train)

# predict the target values for the test data
y_pred = ridge.predict(X_test)

# calculate the mean squared error between the predicted and actual values
mse = mean_squared_error(y_test, y_pred)

# print the mean squared error
print("Mean Squared Error:", mse*100, "%")

Mean Squared Error: 8.427648348813209 %
