# Youtube Video Title Predictor for Dhruv Rathee
A **Machine Learning** `Model` to Predict Title of Youtube Videos of Specifically Dhruv Rathee With Input of `Description` and `No. of Views`.

## Fetch The Data

In [1]:
from fetch_data import fetch
# fetch()

## Load The Data

In [2]:
import pandas as pd
videos = pd.read_csv("dhruv_rathee_videos.csv")
videos.head()

Unnamed: 0,Title,Description,Views,Likes
0,Theory of Evolution | Fact vs Fiction | How Li...,Take Traya's free hair test:\nhttps://trayahea...,2721036,161595
1,The Dark Reality of World's Fattest Country | ...,"Learn how to manage time, battle procrastinati...",3714773,173012
2,Why Hitler Lost? | World War 2 | Dhruv Rathee,"In the last part of our World War II series, w...",3815011,178130
3,The Truth of Pulwama | Satyapal Malik Allegati...,"Satyapal Malik, a senior politician and ex-gov...",6328261,342651
4,Why World War 2 Happened? | The Real Reason | ...,World War II was a significant event in world ...,4355653,202866


## Vectorize The Data

In [3]:
import nltk
nltk.download("punkt")
nltk.download("words")

[nltk_data] Downloading package punkt to
[nltk_data]     /home/praddyumnyadav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /home/praddyumnyadav/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [4]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the data
videos = pd.read_csv("dhruv_rathee_videos.csv")

# Define a function to vectorize the text data using the TfidfVectorizer
def vectorize_text(text, max_length=100):
    # tokenize the text
    tokens = nltk.word_tokenize(text.lower())
    # create a dictionary of word frequencies
    word_freq = nltk.FreqDist(tokens)
    # create a bag-of-words vector
    vector = np.zeros(max_length)
    for i, token in enumerate(tokens):
        if i >= max_length:
            break
        if token in word_freq:
            vector[i] = word_freq[token]
    return vector

# Create a new dataframe that contains the vectorized text data
title_vectors = pd.DataFrame(videos["Title"].apply(lambda x: vectorize_text(x)).tolist())
description_vectors = pd.DataFrame(videos["Description"].apply(lambda x: vectorize_text(x)).tolist())

# Rename the columns of the new dataframes
title_vectors.columns = ["title_" + str(col) for col in title_vectors.columns]
description_vectors.columns = ["description_" + str(col) for col in description_vectors.columns]

# Concatenate the original dataframe with the new dataframes
videos = pd.concat([videos, title_vectors, description_vectors], axis=1)

# Drop the original text columns
videos = videos.drop(["Title", "Description"], axis=1)

# Convert the new columns to float64 data type
videos = videos.astype("float64")

# Print the data types of the dataframe
print(videos.dtypes)

Views             float64
Likes             float64
title_0           float64
title_1           float64
title_2           float64
                   ...   
description_95    float64
description_96    float64
description_97    float64
description_98    float64
description_99    float64
Length: 202, dtype: object


## Create a Function for **Devectorisation**

In [5]:
def devectorize_text(vector):
    # get the indices of the non-zero elements in the vector
    nonzero_indices = np.nonzero(vector)[0]
    # create a list of words corresponding to the non-zero indices
    words = [nltk.corpus.words.words()[i] for i in nonzero_indices]
    # concatenate the words to form the original text
    text = " ".join(words)
    return text

## Info of the DataSet

In [6]:
videos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 372 entries, 0 to 371
Columns: 202 entries, Views to description_99
dtypes: float64(202)
memory usage: 587.2 KB


In [7]:
videos.describe()

Unnamed: 0,Views,Likes,title_0,title_1,title_2,title_3,title_4,title_5,title_6,title_7,...,description_90,description_91,description_92,description_93,description_94,description_95,description_96,description_97,description_98,description_99
count,372.0,372.0,372.0,372.0,372.0,372.0,372.0,372.0,372.0,372.0,...,372.0,372.0,372.0,372.0,372.0,372.0,372.0,372.0,372.0,372.0
mean,3340067.0,183564.155914,1.02957,1.045699,1.099462,1.196237,1.241935,1.155914,1.145161,1.110215,...,10.47043,10.491935,11.185484,10.833333,10.943548,10.400538,11.352151,11.104839,12.024194,11.502688
std,2387519.0,114059.844338,0.169626,0.221628,0.40655,0.499797,0.545073,0.449489,0.470594,0.525575,...,18.541763,17.892685,18.167104,18.299667,18.173602,18.06885,18.619966,18.646153,19.051695,18.375464
min,54735.0,1429.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1588478.0,100749.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
50%,2846777.0,165705.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
75%,4591072.0,239758.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,5.0,6.0,8.0,7.0,7.0,6.0,7.0,6.0,8.25,9.0
max,12939650.0,723467.0,2.0,3.0,4.0,3.0,4.0,4.0,4.0,4.0,...,80.0,62.0,67.0,79.0,67.0,79.0,65.0,71.0,67.0,79.0


## Split Input and Output from the DataSet

In [8]:
titles = []
for i in range(100):
    titles.append(f"title_{i}")

# Store the remaining columns in 'X'
X = videos.drop(titles, axis=1)

# Split the 'Title' column into a separate variable 'y'
y = videos[titles]

In [9]:
X.head()

Unnamed: 0,Views,Likes,description_0,description_1,description_2,description_3,description_4,description_5,description_6,description_7,...,description_90,description_91,description_92,description_93,description_94,description_95,description_96,description_97,description_98,description_99
0,2721036.0,161595.0,1.0,1.0,3.0,1.0,1.0,1.0,43.0,15.0,...,1.0,3.0,9.0,1.0,2.0,1.0,5.0,5.0,1.0,1.0
1,3714773.0,173012.0,3.0,4.0,7.0,2.0,2.0,7.0,1.0,2.0,...,1.0,1.0,1.0,1.0,7.0,2.0,1.0,1.0,1.0,1.0
2,3815011.0,178130.0,3.0,5.0,1.0,2.0,3.0,1.0,3.0,3.0,...,16.0,43.0,1.0,1.0,2.0,6.0,1.0,1.0,1.0,6.0
3,6328261.0,342651.0,1.0,1.0,4.0,2.0,1.0,1.0,4.0,1.0,...,12.0,1.0,1.0,2.0,1.0,2.0,2.0,6.0,1.0,3.0
4,4355653.0,202866.0,3.0,4.0,1.0,1.0,2.0,1.0,1.0,5.0,...,2.0,4.0,1.0,1.0,1.0,4.0,1.0,1.0,2.0,1.0


In [10]:
y.head()

Unnamed: 0,title_0,title_1,title_2,title_3,title_4,title_5,title_6,title_7,title_8,title_9,...,title_90,title_91,title_92,title_93,title_94,title_95,title_96,title_97,title_98,title_99
0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,3.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Split `Train` and `Test` Set

In [11]:
from sklearn.model_selection import train_test_split

# Split Dataset Into Train and Test Set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Take Info of Both `Train` and `Test` Set

In [12]:
X_train.head()

Unnamed: 0,Views,Likes,description_0,description_1,description_2,description_3,description_4,description_5,description_6,description_7,...,description_90,description_91,description_92,description_93,description_94,description_95,description_96,description_97,description_98,description_99
192,1008023.0,60848.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,6.0,1.0,2.0,1.0,6.0,1.0,3.0,5.0,1.0
75,3909899.0,219424.0,1.0,2.0,1.0,5.0,3.0,5.0,1.0,1.0,...,4.0,2.0,5.0,1.0,2.0,1.0,1.0,1.0,5.0,4.0
84,3865090.0,216433.0,2.0,1.0,6.0,1.0,1.0,7.0,1.0,1.0,...,52.0,52.0,52.0,52.0,52.0,52.0,52.0,52.0,52.0,52.0
360,114148.0,5899.0,1.0,2.0,1.0,1.0,1.0,6.0,1.0,1.0,...,1.0,4.0,1.0,4.0,1.0,71.0,30.0,71.0,1.0,1.0
16,5954322.0,240162.0,1.0,6.0,5.0,1.0,4.0,1.0,1.0,1.0,...,4.0,2.0,2.0,1.0,1.0,1.0,5.0,1.0,1.0,1.0


In [13]:
y_train.head()

Unnamed: 0,title_0,title_1,title_2,title_3,title_4,title_5,title_6,title_7,title_8,title_9,...,title_90,title_91,title_92,title_93,title_94,title_95,title_96,title_97,title_98,title_99
192,1.0,2.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84,1.0,1.0,1.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
360,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
X_test.head()

Unnamed: 0,Views,Likes,description_0,description_1,description_2,description_3,description_4,description_5,description_6,description_7,...,description_90,description_91,description_92,description_93,description_94,description_95,description_96,description_97,description_98,description_99
327,2905095.0,166494.0,1.0,9.0,1.0,1.0,1.0,1.0,1.0,2.0,...,1.0,9.0,1.0,1.0,4.0,2.0,1.0,1.0,67.0,29.0
33,5110421.0,380996.0,1.0,1.0,1.0,9.0,1.0,5.0,1.0,1.0,...,1.0,1.0,9.0,2.0,1.0,3.0,1.0,5.0,1.0,11.0
15,6192369.0,377179.0,1.0,1.0,1.0,3.0,2.0,1.0,1.0,1.0,...,1.0,3.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
314,1572859.0,115742.0,2.0,1.0,5.0,1.0,7.0,1.0,2.0,1.0,...,52.0,52.0,52.0,52.0,52.0,52.0,52.0,1.0,3.0,1.0
57,7366204.0,370594.0,3.0,1.0,3.0,11.0,1.0,1.0,1.0,4.0,...,1.0,6.0,2.0,11.0,2.0,2.0,11.0,1.0,3.0,3.0


In [15]:
y_test.head()

Unnamed: 0,title_0,title_1,title_2,title_3,title_4,title_5,title_6,title_7,title_8,title_9,...,title_90,title_91,title_92,title_93,title_94,title_95,title_96,title_97,title_98,title_99
327,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
314,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
57,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Test Different **Machine Learning** Models.

## Try `Ridge Regression` Model

In [16]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# create a Ridge regression object with regularization parameter alpha=1.0
ridge = Ridge(alpha=1.0)

# fit the model on the training data
ridge.fit(X_train, y_train)

# predict the target values for the test data
y_pred = ridge.predict(X_test)

# calculate the mean squared error between the predicted and actual values
mse = mean_squared_error(y_test, y_pred)

# print the mean squared error
print("Mean Squared Error:", str(mse*100) + "%")

Mean Squared Error: 8.971893784195869%


In [17]:
description = """
Take Traya's free hair test:
https://trayahealth.com/DhruvR
Use my code 20DHRUV to get 20% off on your purchase.
*Offer ends in 5 days

If you think that we humans evolved from monkeys, gorillas, chimpanzees, or any other apes that we see today, you're wrong. Rather, all these apes and humans had a common ancestor millions of years ago. So it's not an evolution stream; rather, it's an evolution tree. But the question is, why did some of the species turn into chimpanzees and others into humans? What is the full story of evolution? And is it even true? Watch this video by Dhruv Rathee to find out, as he deep dives and explains the entire theory of evolution.

Link to the videos mentioned :
Mystery of Dinosaurs | How Did They Become Extinct? | Dhruv Rathee -   

 • Mystery of Dinosa...  
---------------------------------------------------- 
JOIN MY COURSE: 
✏️ Are you curious about how I manage to travel so much and still stay productive? Learn how to manage time and maximize productivity in my specialised online course. Join here: https://academy.dhruvrathee.com
Use GET20 for a straight up 20% discount!

LISTEN TO MY PODCAST: 
🎧 My Spotify exclusive podcast. Learn about how India works, only on Maha Bharat with Dhruv Rathee: https://spoti.fi/3IhBW51

FOLLOW ME ON:
📩 Telegram: https://t.me/dhruvrathee
📸 Instagram: http://www.instagram.com/dhruvrathee
🐦  Twitter: http://www.twitter.com/dhruv_rathee
▶️ Main Channel:   

 / dhruvrathee  
🎦 Vlog Channel:   

 / dhruvratheevlogs  
🩳 Shorts Channel:   

 / @drshorts  

MY VIDEO TOPICS:
🗺 Geopolitics:   

 • Geopolitics  
📘 History:   

 • History Education  
🔬 Science:   

 • Science  
☠️ Mystery:   

 • Mystery Videos  
💰Finance:   

 • Financial Education  
🇮🇳 Indian Current Affairs:   

 • Current Affairs (...  
🌍 International Current Affairs:   

 • Current Affairs (...  

DOWNLOAD MY APP: 
📱Android app: https://play.google.com/store/apps/de...
📱iOS App: https://apps.apple.com/ie/app/dhruv-r...
----------------------------------------------------
"""
exp_dict = {
    "Views": pd.Series(2721036),
    "Likes": pd.Series(161000)
}

test_videos = pd.DataFrame(exp_dict)
description_vectors = pd.DataFrame(pd.Series(description).apply(lambda x: vectorize_text(x)).tolist())
description_vectors.columns = ["description_" + str(col) for col in description_vectors.columns]
test_videos = pd.concat([test_videos, description_vectors], axis=1)
prediction = ridge.predict(test_videos)

# apply the devectorize_text() function to the title vectors
prediction = title_vectors.apply(lambda x: devectorize_text(x), axis=1)
prediction

0      A a aa aal aalii aam Aani aardvark aardwolf Aa...
1      A a aa aal aalii aam Aani aardvark aardwolf Aa...
2      A a aa aal aalii aam Aani aardvark aardwolf Aa...
3      A a aa aal aalii aam Aani aardvark aardwolf Aa...
4      A a aa aal aalii aam Aani aardvark aardwolf Aa...
                             ...                        
367    A a aa aal aalii aam Aani aardvark aardwolf Aa...
368    A a aa aal aalii aam Aani aardvark aardwolf Aa...
369    A a aa aal aalii aam Aani aardvark aardwolf Aa...
370    A a aa aal aalii aam Aani aardvark aardwolf Aa...
371    A a aa aal aalii aam Aani aardvark aardwolf Aa...
Length: 372, dtype: object

## Try `Elastic Net` Model

In [18]:
from sklearn.linear_model import ElasticNet

# create an Elastic Net regression object with regularization parameters alpha=1.0 and l1_ratio=0.5
elastic_net = ElasticNet(alpha=1.0, l1_ratio=0.5)

# fit the model on the training data
elastic_net.fit(X_train, y_train)

# predict the target values for the test data
y_pred = elastic_net.predict(X_test)

# calculate the mean squared error between the predicted and actual values
mse = mean_squared_error(y_test, y_pred)

# print the mean squared error
print("Mean Squared Error:", str(mse*100) + "%")

  model = cd_fast.enet_coordinate_descent(


Mean Squared Error: 4.860946857452168%


In [19]:
description = """
Take Traya's free hair test:
https://trayahealth.com/DhruvR
Use my code 20DHRUV to get 20% off on your purchase.
*Offer ends in 5 days

If you think that we humans evolved from monkeys, gorillas, chimpanzees, or any other apes that we see today, you're wrong. Rather, all these apes and humans had a common ancestor millions of years ago. So it's not an evolution stream; rather, it's an evolution tree. But the question is, why did some of the species turn into chimpanzees and others into humans? What is the full story of evolution? And is it even true? Watch this video by Dhruv Rathee to find out, as he deep dives and explains the entire theory of evolution.

Link to the videos mentioned :
Mystery of Dinosaurs | How Did They Become Extinct? | Dhruv Rathee -   

 • Mystery of Dinosa...  
---------------------------------------------------- 
JOIN MY COURSE: 
✏️ Are you curious about how I manage to travel so much and still stay productive? Learn how to manage time and maximize productivity in my specialised online course. Join here: https://academy.dhruvrathee.com
Use GET20 for a straight up 20% discount!

LISTEN TO MY PODCAST: 
🎧 My Spotify exclusive podcast. Learn about how India works, only on Maha Bharat with Dhruv Rathee: https://spoti.fi/3IhBW51

FOLLOW ME ON:
📩 Telegram: https://t.me/dhruvrathee
📸 Instagram: http://www.instagram.com/dhruvrathee
🐦  Twitter: http://www.twitter.com/dhruv_rathee
▶️ Main Channel:   

 / dhruvrathee  
🎦 Vlog Channel:   

 / dhruvratheevlogs  
🩳 Shorts Channel:   

 / @drshorts  

MY VIDEO TOPICS:
🗺 Geopolitics:   

 • Geopolitics  
📘 History:   

 • History Education  
🔬 Science:   

 • Science  
☠️ Mystery:   

 • Mystery Videos  
💰Finance:   

 • Financial Education  
🇮🇳 Indian Current Affairs:   

 • Current Affairs (...  
🌍 International Current Affairs:   

 • Current Affairs (...  

DOWNLOAD MY APP: 
📱Android app: https://play.google.com/store/apps/de...
📱iOS App: https://apps.apple.com/ie/app/dhruv-r...
----------------------------------------------------
"""
exp_dict = {
    "Views": pd.Series(2721036),
    "Likes": pd.Series(161000)
}

test_videos = pd.DataFrame(exp_dict)
description_vectors = pd.DataFrame(pd.Series(description).apply(lambda x: vectorize_text(x)).tolist())
description_vectors.columns = ["description_" + str(col) for col in description_vectors.columns]
test_videos = pd.concat([test_videos, description_vectors], axis=1)
prediction = elastic_net.predict(test_videos)

# apply the devectorize_text() function to the title vectors
prediction = title_vectors.apply(lambda x: devectorize_text(x), axis=1)
prediction

0      A a aa aal aalii aam Aani aardvark aardwolf Aa...
1      A a aa aal aalii aam Aani aardvark aardwolf Aa...
2      A a aa aal aalii aam Aani aardvark aardwolf Aa...
3      A a aa aal aalii aam Aani aardvark aardwolf Aa...
4      A a aa aal aalii aam Aani aardvark aardwolf Aa...
                             ...                        
367    A a aa aal aalii aam Aani aardvark aardwolf Aa...
368    A a aa aal aalii aam Aani aardvark aardwolf Aa...
369    A a aa aal aalii aam Aani aardvark aardwolf Aa...
370    A a aa aal aalii aam Aani aardvark aardwolf Aa...
371    A a aa aal aalii aam Aani aardvark aardwolf Aa...
Length: 372, dtype: object