# Youtube Video Title Predictor for Dhruv Rathee
A **Machine Learning** `Model` to Predict Title of Youtube Videos of Specifically Dhruv Rathee With Input of `Description` and `No. of Views`.

## Fetch The Data

In [1]:
from fetch_data import fetch
# fetch()

## Load The Data

In [2]:
import pandas as pd
videos = pd.read_csv("dhruv_rathee_videos.csv")
videos.head()

Unnamed: 0,Title,Description,Views,Likes
0,The Dark Reality of World's Fattest Country | ...,The most obese country in the world might surp...,2793372,145552
1,Why Hitler Lost? | World War 2 | Dhruv Rathee,"In the last part of our World War II series, w...",3314998,167680
2,The Truth of Pulwama | Satyapal Malik Allegati...,"Satyapal Malik, a senior politician and ex-gov...",5896099,328429
3,Why World War 2 Happened? | The Real Reason | ...,World War II was a significant event in world ...,4141493,198121
4,Why was Donald Trump Arrested? | Full Case Exp...,Former US President Donald Trump was recently ...,3296577,131908


## Vectorize The Data

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Define a function to vectorize the text data using the TfidfVectorizer
def vectorize_text(text):
    # Define the TfidfVectorizer with the desired parameters
    tfidf = TfidfVectorizer(max_features=1000, stop_words="english")

    # Fit the TfidfVectorizer to the text data and transform it to create the vector representation
    vector = tfidf.fit_transform([text])

    # Return the vector representation as a numpy array
    return vector.toarray()


videos["Title"] = videos["Title"].apply(lambda x: vectorize_text(x))
videos["Description"] = videos["Description"].apply(lambda x: vectorize_text(x))

## Info of the DataSet

In [4]:
videos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371 entries, 0 to 370
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Title        371 non-null    object
 1   Description  371 non-null    object
 2   Views        371 non-null    int64 
 3   Likes        371 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 11.7+ KB


In [5]:
videos.describe()

Unnamed: 0,Views,Likes
count,371.0,371.0
mean,3335111.0,183855.299191
std,2364448.0,113729.54529
min,54558.0,1424.0
25%,1612798.0,101329.0
50%,2826966.0,166028.0
75%,4581922.0,239584.5
max,12864340.0,718797.0


## Split Input and Output from the DataSet

In [6]:
# Store the remaining columns in 'X'
X = videos.drop('Title', axis=1)

# Split the 'Title' column into a separate variable 'y'
y = videos['Title']

In [7]:
X.head()

Unnamed: 0,Description,Views,Likes
0,"[[0.029411764705882353, 0.029411764705882353, ...",2793372,145552
1,"[[0.02982749931359468, 0.02982749931359468, 0....",3314998,167680
2,"[[0.03286203899503875, 0.03286203899503875, 0....",5896099,328429
3,"[[0.030414953233623677, 0.030414953233623677, ...",4141493,198121
4,"[[0.030233702757845024, 0.030233702757845024, ...",3296577,131908


In [8]:
y.head()

0    [[0.3333333333333333, 0.3333333333333333, 0.33...
1    [[0.4082482904638631, 0.4082482904638631, 0.40...
2    [[0.3779644730092272, 0.3779644730092272, 0.37...
3    [[0.3779644730092272, 0.3779644730092272, 0.37...
4    [[0.3779644730092272, 0.3779644730092272, 0.37...
Name: Title, dtype: object

# Split `Train` and `Test` Set

In [9]:
from sklearn.model_selection import train_test_split

# Split Dataset Into Train and Test Set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Take Info of Both `Train` and `Test` Set

In [10]:
X_train.head()

Unnamed: 0,Description,Views,Likes
192,"[[0.06119900613621046, 0.03059950306810523, 0....",1695648,117018
75,"[[0.03077287274483318, 0.03077287274483318, 0....",6060899,325096
84,"[[0.02786391062876764, 0.02786391062876764, 0....",2860754,137997
359,"[[0.01933834476267449, 0.01933834476267449, 0....",208711,13237
16,"[[0.03307706008278112, 0.06615412016556224, 0....",2035775,110762


In [11]:
y_train.head()

192    [[0.3333333333333333, 0.6666666666666666, 0.33...
75     [[0.35355339059327373, 0.35355339059327373, 0....
84     [[0.31622776601683794, 0.31622776601683794, 0....
359    [[0.30151134457776363, 0.30151134457776363, 0....
16     [[0.3333333333333333, 0.3333333333333333, 0.33...
Name: Title, dtype: object

In [12]:
X_test.head()

Unnamed: 0,Description,Views,Likes
327,"[[0.024485105343719588, 0.024485105343719588, ...",1138893,57851
33,"[[0.03205852818187507, 0.09617558454562522, 0....",5666776,213287
15,"[[0.031750031750047626, 0.031750031750047626, ...",5914905,239606
314,"[[0.02423219058363282, 0.02423219058363282, 0....",816234,50920
57,"[[0.03196013860502966, 0.03196013860502966, 0....",7154900,365901


In [13]:
y_test.head()

327    [[0.3333333333333333, 0.3333333333333333, 0.33...
33     [[0.3333333333333333, 0.3333333333333333, 0.33...
15     [[0.3779644730092272, 0.3779644730092272, 0.37...
314    [[0.3779644730092272, 0.3779644730092272, 0.37...
57     [[0.3333333333333333, 0.3333333333333333, 0.33...
Name: Title, dtype: object