In [1]:
import opendatasets as od
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import warnings
import numpy as np
import statistics as st

warnings.filterwarnings('ignore')

In [2]:
!pip install opendatasets



In [3]:
# Kaggle page of the YouTube trending dataset (the US CSV is pre-selected in the URL)
dataset = 'https://www.kaggle.com/datasets/rsrishav/youtube-trending-video-dataset?select=US_youtube_trending_data.csv'

# Download target: the parent of the current working directory.
# os.path.dirname handles the platform's separator; splitting on '\\' only works on Windows.
final_dir = os.path.dirname(os.getcwd())

In [4]:
# Fetch the dataset into final_dir (the recorded output shows the download is skipped when the files exist)
od.download(dataset, data_dir=final_dir)

# From here on final_dir is the path of the US trending CSV inside the downloaded folder.
# os.path.join picks the right separator instead of hard-coding '\\'.
final_dir = os.path.join(final_dir, 'youtube-trending-video-dataset', 'US_youtube_trending_data.csv')

Skipping, found downloaded files in "C:\Users\timbe\Final Project\youtube-trending-video-dataset" (use force=True to force download)


In [5]:
data = pd.read_csv(final_dir)

data.head(3)

Unnamed: 0,video_id,title,publishedAt,channelId,channelTitle,categoryId,trending_date,tags,view_count,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,description
0,3C66w5Z0ixs,I ASKED HER TO BE MY GIRLFRIEND...,2020-08-11T19:20:14Z,UCvtRTOMP2TqYqu51xNrqAzg,Brawadis,22,2020-08-12T00:00:00Z,brawadis|prank|basketball|skits|ghost|funny vi...,1514614,156908,5855,35313,https://i.ytimg.com/vi/3C66w5Z0ixs/default.jpg,False,False,SUBSCRIBE to BRAWADIS ▶ http://bit.ly/Subscrib...
1,M9Pmf9AB4Mo,Apex Legends | Stories from the Outlands – “Th...,2020-08-11T17:00:10Z,UC0ZV6M2THA81QT9hrVWJG3A,Apex Legends,20,2020-08-12T00:00:00Z,Apex Legends|Apex Legends characters|new Apex ...,2381688,146739,2794,16549,https://i.ytimg.com/vi/M9Pmf9AB4Mo/default.jpg,False,False,"While running her own modding shop, Ramya Pare..."
2,J78aPJ3VyNs,I left youtube for a month and THIS is what ha...,2020-08-11T16:34:06Z,UCYzPXprvl5Y-Sf0g4vX-m6g,jacksepticeye,24,2020-08-12T00:00:00Z,jacksepticeye|funny|funny meme|memes|jacksepti...,2038853,353787,2628,40221,https://i.ytimg.com/vi/J78aPJ3VyNs/default.jpg,False,False,I left youtube for a month and this is what ha...


In [6]:
# Checking the shape of the DataFrame
data.shape

(185990, 16)

In [7]:
data['video_id'].nunique()

34066

In [8]:
# Persist the distinct video ids as a single comma-separated line
id_csv = ','.join(data['video_id'].unique())
with open('VideoIDs.txt', 'w') as file:
    file.write(id_csv)

In [9]:
# Read the comma-separated ids back; the context manager closes the file handle,
# which a bare open(...).read() leaves to the garbage collector.
with open("VideoIDs.txt", 'r') as file:
    videos_str = file.read()
videos_list = videos_str.split(',')

In [10]:
len(videos_list)

34066

In [11]:
# Dropping channelID column as it is not necessary for the analysis and prediction
# errors='ignore' keeps the cell re-runnable: a second run finds no 'channelId' and is a no-op
# instead of raising KeyError.

data = data.drop(columns = ['channelId'], errors = 'ignore')

# Validating the above code

data.head(3)

Unnamed: 0,video_id,title,publishedAt,channelTitle,categoryId,trending_date,tags,view_count,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,description
0,3C66w5Z0ixs,I ASKED HER TO BE MY GIRLFRIEND...,2020-08-11T19:20:14Z,Brawadis,22,2020-08-12T00:00:00Z,brawadis|prank|basketball|skits|ghost|funny vi...,1514614,156908,5855,35313,https://i.ytimg.com/vi/3C66w5Z0ixs/default.jpg,False,False,SUBSCRIBE to BRAWADIS ▶ http://bit.ly/Subscrib...
1,M9Pmf9AB4Mo,Apex Legends | Stories from the Outlands – “Th...,2020-08-11T17:00:10Z,Apex Legends,20,2020-08-12T00:00:00Z,Apex Legends|Apex Legends characters|new Apex ...,2381688,146739,2794,16549,https://i.ytimg.com/vi/M9Pmf9AB4Mo/default.jpg,False,False,"While running her own modding shop, Ramya Pare..."
2,J78aPJ3VyNs,I left youtube for a month and THIS is what ha...,2020-08-11T16:34:06Z,jacksepticeye,24,2020-08-12T00:00:00Z,jacksepticeye|funny|funny meme|memes|jacksepti...,2038853,353787,2628,40221,https://i.ytimg.com/vi/J78aPJ3VyNs/default.jpg,False,False,I left youtube for a month and this is what ha...


In [12]:
# Checking null values

def null_values(df):
    """Summarise the missing values of every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with two columns: ``Count`` (number of
        missing values, kept as an integer) and ``Percentage`` (share of rows
        that are missing, rounded to 2 decimals), sorted by ``Count`` in
        descending order.
    """
    missing_count = df.isna().sum()
    missing_pct = (missing_count * 100 / df.shape[0]).round(2)

    # Building the frame column-wise keeps Count integral; stacking both Series as rows
    # and transposing would coerce the counts to float (e.g. 4048.0).
    summary = pd.DataFrame({'Count': missing_count, 'Percentage': missing_pct})

    return summary.sort_values('Count', ascending = False)


null_values(data)

Unnamed: 0,Count,Percentage
description,4048.0,2.18
video_id,0.0,0.0
title,0.0,0.0
publishedAt,0.0,0.0
channelTitle,0.0,0.0
categoryId,0.0,0.0
trending_date,0.0,0.0
tags,0.0,0.0
view_count,0.0,0.0
likes,0.0,0.0


We can see that about 2.18% of the rows (4,048) have a missing `description`, while none of the other columns listed above (including `channelTitle`) has missing values.
- We shall keep the `description` column as it is during the analysis and handle its null values during model building.
- `channelTitle` has no missing values in this version of the data, so no rows need to be deleted.

In [13]:
# Let's check the datatypes of each column in DataFrame

data.dtypes

video_id             object
title                object
publishedAt          object
channelTitle         object
categoryId            int64
trending_date        object
tags                 object
view_count            int64
likes                 int64
dislikes              int64
comment_count         int64
thumbnail_link       object
comments_disabled      bool
ratings_disabled       bool
description          object
dtype: object

- `publishedAt`, `trending_date` are object type. Let's convert it to Datetime format.

In [14]:
# Parse both timestamp columns from strings into pandas datetimes
for date_col in ('publishedAt', 'trending_date'):
    data[date_col] = pd.to_datetime(data[date_col])

# Confirm the new dtypes of the two converted columns

data[['publishedAt', 'trending_date']].dtypes

publishedAt      datetime64[ns, UTC]
trending_date    datetime64[ns, UTC]
dtype: object

In [15]:
data['categoryId'].unique()


array([22, 20, 24, 10, 26, 27, 23, 28,  1, 25, 17, 19, 15,  2, 29],
      dtype=int64)

We can see that the `categoryId` column holds the ids of the respective categories. The category name for each id can be looked up in the `US_category_id.json` file.

In [16]:
# Let's import US_category_id.json file and map the category id's respectively

# The JSON file is expected next to the trending CSV that final_dir points to;
# os.path handles the platform's separator instead of splitting on '\\'.
category_path = os.path.join(os.path.dirname(final_dir), 'US_category_id.json')

with open(category_path, 'r') as file:
    json_data = json.load(file)

# Creating a dictionary object which stores the category id and its respective category
category_dict = {int(item['id']): item['snippet']['title'] for item in json_data['items']}

# .get(x, x) leaves unknown ids and already-mapped names untouched, so re-running this cell
# (or meeting an id missing from the JSON) no longer raises KeyError.
data['categoryId'] = data['categoryId'].map(lambda x: category_dict.get(x, x))

# Validating the above code
data['categoryId'].head()

0    People & Blogs
1            Gaming
2     Entertainment
3             Music
4     Howto & Style
Name: categoryId, dtype: object

<b> Note: </b>
- Although we were using IN data for the analysis, the `IN_category_id.json` file was missing some data.
- Upon research, we found out that the `id` and `title` are same irrespective of the country.
- Hence, we have used `US_category_id.json` in the above case.

In [17]:
# Checking the Dataframe after the changes

data.head()

Unnamed: 0,video_id,title,publishedAt,channelTitle,categoryId,trending_date,tags,view_count,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,description
0,3C66w5Z0ixs,I ASKED HER TO BE MY GIRLFRIEND...,2020-08-11 19:20:14+00:00,Brawadis,People & Blogs,2020-08-12 00:00:00+00:00,brawadis|prank|basketball|skits|ghost|funny vi...,1514614,156908,5855,35313,https://i.ytimg.com/vi/3C66w5Z0ixs/default.jpg,False,False,SUBSCRIBE to BRAWADIS ▶ http://bit.ly/Subscrib...
1,M9Pmf9AB4Mo,Apex Legends | Stories from the Outlands – “Th...,2020-08-11 17:00:10+00:00,Apex Legends,Gaming,2020-08-12 00:00:00+00:00,Apex Legends|Apex Legends characters|new Apex ...,2381688,146739,2794,16549,https://i.ytimg.com/vi/M9Pmf9AB4Mo/default.jpg,False,False,"While running her own modding shop, Ramya Pare..."
2,J78aPJ3VyNs,I left youtube for a month and THIS is what ha...,2020-08-11 16:34:06+00:00,jacksepticeye,Entertainment,2020-08-12 00:00:00+00:00,jacksepticeye|funny|funny meme|memes|jacksepti...,2038853,353787,2628,40221,https://i.ytimg.com/vi/J78aPJ3VyNs/default.jpg,False,False,I left youtube for a month and this is what ha...
3,kXLn3HkpjaA,XXL 2020 Freshman Class Revealed - Official An...,2020-08-11 16:38:55+00:00,XXL,Music,2020-08-12 00:00:00+00:00,xxl freshman|xxl freshmen|2020 xxl freshman|20...,496771,23251,1856,7647,https://i.ytimg.com/vi/kXLn3HkpjaA/default.jpg,False,False,Subscribe to XXL → http://bit.ly/subscribe-xxl...
4,VIUo6yapDbc,Ultimate DIY Home Movie Theater for The LaBran...,2020-08-11 15:10:05+00:00,Mr. Kate,Howto & Style,2020-08-12 00:00:00+00:00,The LaBrant Family|DIY|Interior Design|Makeove...,1123889,45802,964,2196,https://i.ytimg.com/vi/VIUo6yapDbc/default.jpg,False,False,Transforming The LaBrant Family's empty white ...


In [19]:
def column_start_end(x):
    """Return the smallest and the largest element of ``x`` as ``[lowest, highest]``."""
    lowest = min(x)
    highest = max(x)
    return [lowest, highest]

In [20]:
# Collapse the daily trending rows into one row per video.
# - descriptive fields: the most frequent value seen for the video (statistics.mode)
# - publishedAt: earliest value; the string 'min' replaces the np.min callable, which
#   recent pandas versions deprecate inside agg
# - trending_date / likes / dislikes: [min, max] pair via column_start_end; `dislikes` is
#   included because the frame displayed below shows a dislikes pair column
video_aggregations = {
    'title': st.mode,
    'publishedAt': 'min',
    'channelTitle': st.mode,
    'categoryId': st.mode,
    'trending_date': column_start_end,
    'tags': st.mode,
    'likes': column_start_end,
    'dislikes': column_start_end,
    'comments_disabled': st.mode,
    'ratings_disabled': st.mode,
}

df = data.groupby('video_id').agg(video_aggregations).reset_index()

In [21]:
df.head()

Unnamed: 0,video_id,title,publishedAt,channelTitle,categoryId,trending_date,tags,likes,dislikes,comments_disabled,ratings_disabled
0,--14w5SOEUs,Migos - Avalanche (Official Video),2021-06-10 16:00:00+00:00,MigosVEVO,Music,"[2021-06-11 00:00:00+00:00, 2021-06-15 00:00:0...",Migos|Avalanche|Quality|Control|Music/Motown|R...,"[122830, 262692]","[867, 4107]",False,False
1,--2O86Z0hsM,MY TESLA PAYS FOR ITSELF,2022-03-09 23:19:08+00:00,jf.okay,Entertainment,"[2022-03-11 00:00:00+00:00, 2022-03-15 00:00:0...",[None],"[16481, 17290]","[0, 0]",False,False
2,--40TEbZ9Is,Supporting Actress in a Comedy: 73rd Emmys,2021-09-20 01:03:32+00:00,Television Academy,Entertainment,"[2021-09-21 00:00:00+00:00, 2021-09-25 00:00:0...",[None],"[6299, 8029]","[286, 369]",False,False
3,--5-brQiQFg,Washington Commanders vs. San Francisco 49ers ...,2022-12-25 00:30:17+00:00,NFL,Sports,"[2022-12-26 00:00:00+00:00, 2022-12-26 00:00:0...",[None],"[14603, 14603]","[0, 0]",False,False
4,--DKkzWVh-E,Why Retaining Walls Collapse,2021-12-07 13:00:00+00:00,Practical Engineering,Education,"[2021-12-08 00:00:00+00:00, 2021-12-11 00:00:0...",retaining wall|New Jersey highway|Direct Conne...,"[18445, 29991]","[147, 320]",False,False


In [22]:
# Split the [first, last] trending-date pair of every video into two scalar columns,
# then discard the pair column
trending_span = df['trending_date']
df = df.assign(
    trending_date_start = trending_span.apply(lambda dates: min(dates)),
    trending_date_end = trending_span.apply(lambda dates: max(dates)),
).drop(columns = 'trending_date')

In [23]:
df.head()

Unnamed: 0,video_id,title,publishedAt,channelTitle,categoryId,tags,likes,dislikes,comments_disabled,ratings_disabled,trending_date_start,trending_date_end
0,--14w5SOEUs,Migos - Avalanche (Official Video),2021-06-10 16:00:00+00:00,MigosVEVO,Music,Migos|Avalanche|Quality|Control|Music/Motown|R...,"[122830, 262692]","[867, 4107]",False,False,2021-06-11 00:00:00+00:00,2021-06-15 00:00:00+00:00
1,--2O86Z0hsM,MY TESLA PAYS FOR ITSELF,2022-03-09 23:19:08+00:00,jf.okay,Entertainment,[None],"[16481, 17290]","[0, 0]",False,False,2022-03-11 00:00:00+00:00,2022-03-15 00:00:00+00:00
2,--40TEbZ9Is,Supporting Actress in a Comedy: 73rd Emmys,2021-09-20 01:03:32+00:00,Television Academy,Entertainment,[None],"[6299, 8029]","[286, 369]",False,False,2021-09-21 00:00:00+00:00,2021-09-25 00:00:00+00:00
3,--5-brQiQFg,Washington Commanders vs. San Francisco 49ers ...,2022-12-25 00:30:17+00:00,NFL,Sports,[None],"[14603, 14603]","[0, 0]",False,False,2022-12-26 00:00:00+00:00,2022-12-26 00:00:00+00:00
4,--DKkzWVh-E,Why Retaining Walls Collapse,2021-12-07 13:00:00+00:00,Practical Engineering,Education,retaining wall|New Jersey highway|Direct Conne...,"[18445, 29991]","[147, 320]",False,False,2021-12-08 00:00:00+00:00,2021-12-11 00:00:00+00:00


In [24]:
# Expand every [min, max] pair column into <name>_start / <name>_end and drop the pair column.
# The frame displayed below also carries dislikes_start / dislikes_end, so `dislikes` is expanded
# too whenever the aggregation produced it. A pair column that is absent (e.g. when this cell is
# re-run) is skipped instead of raising KeyError.
for pair_col in ('likes', 'dislikes'):
    if pair_col not in df.columns:
        continue
    df[pair_col + '_start'] = df[pair_col].apply(lambda pair: min(pair))
    df[pair_col + '_end'] = df[pair_col].apply(lambda pair: max(pair))
    df.drop(pair_col, axis = 1, inplace = True)

In [26]:
df.head()

Unnamed: 0,video_id,title,publishedAt,channelTitle,categoryId,tags,comments_disabled,ratings_disabled,trending_date_start,trending_date_end,likes_start,likes_end,dislikes_start,dislikes_end
0,--14w5SOEUs,Migos - Avalanche (Official Video),2021-06-10 16:00:00+00:00,MigosVEVO,Music,Migos|Avalanche|Quality|Control|Music/Motown|R...,False,False,2021-06-11 00:00:00+00:00,2021-06-15 00:00:00+00:00,122830,262692,867,4107
1,--2O86Z0hsM,MY TESLA PAYS FOR ITSELF,2022-03-09 23:19:08+00:00,jf.okay,Entertainment,[None],False,False,2022-03-11 00:00:00+00:00,2022-03-15 00:00:00+00:00,16481,17290,0,0
2,--40TEbZ9Is,Supporting Actress in a Comedy: 73rd Emmys,2021-09-20 01:03:32+00:00,Television Academy,Entertainment,[None],False,False,2021-09-21 00:00:00+00:00,2021-09-25 00:00:00+00:00,6299,8029,286,369
3,--5-brQiQFg,Washington Commanders vs. San Francisco 49ers ...,2022-12-25 00:30:17+00:00,NFL,Sports,[None],False,False,2022-12-26 00:00:00+00:00,2022-12-26 00:00:00+00:00,14603,14603,0,0
4,--DKkzWVh-E,Why Retaining Walls Collapse,2021-12-07 13:00:00+00:00,Practical Engineering,Education,retaining wall|New Jersey highway|Direct Conne...,False,False,2021-12-08 00:00:00+00:00,2021-12-11 00:00:00+00:00,18445,29991,147,320


In [29]:
# The literal string '[None]' marks a video without tags: turn it into a real missing value
df['tags'] = df['tags'].where(df['tags'] != '[None]', np.nan)

# Tags are '|'-separated; a missing tag string counts as 0 tags
df['tagCount'] = df['tags'].str.split('|').str.len().fillna(0).astype('int64')

In [56]:
# Hours between publication and the first trending date.
# .dt.total_seconds() covers the whole timedelta; .dt.seconds is only the sub-day component
# (0..86399), so any video that needed more than a day to trend was reported as < 24 hours.
# NOTE(review): trending_date values shown above are midnight timestamps, so a video published
# later on its first trending day would get a negative value - confirm how to treat that.
df['hoursTakenToTrend'] = ((df['trending_date_start'] - df['publishedAt']).dt.total_seconds() / (60 * 60)).round(1)

In [62]:
# Whole days between the first and the last trending date of each video.
# This is a difference, not an inclusive count: a video whose first and last trending date
# coincide gets 0.
df['trendingDaysDuration'] = (df['trending_date_end'] - df['trending_date_start']).dt.days

## XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

In [None]:
# Distribution of the number of tags per video.
# tagCount is derived on the per-video frame `df`; the raw frame `data` has no such column,
# so data['tagCount'] raised KeyError.
plt.figure(figsize = (10, 5))
sns.histplot(df['tagCount'], kde = True)
plt.show()

In [None]:
# Let's check the correlation between the numerical columns
# NOTE(review): no visible cell assigns 'daysTakenToTrend' or 'tagCount' on `data`
# ('tagCount' and 'hoursTakenToTrend' are created on `df`), so this selection raises KeyError
# as written - confirm which frame and which columns are intended.

plt.figure(figsize = (10, 8))
# Annotated heatmap of the pairwise correlation matrix (DataFrame.corr) of the selected columns
sns.heatmap(data[['view_count', 'likes', 'dislikes', 'comment_count', 'daysTakenToTrend', 'tagCount']].corr(), linewidths=.5, annot=True, cmap='coolwarm')
plt.show()

- `view_count` and `likes` are highly correlated: a video with more views is likely to have more likes.
- `comment_count` and `likes` are more strongly correlated than `comment_count` and `view_count`.
- `daysTakenToTrend` is not correlated with any other feature. This is interesting: it means the number of days a video takes to trend cannot be inferred from its comment_count, dislikes, likes or view_count.

In [None]:
######################### likes per view

In [None]:
# Checking number of videos based on each Category

# The frame and the column are passed by keyword (as in the category/day plot further down):
# what a positional Series means differs between seaborn versions.
# value_counts() is already sorted in descending order, so its index is the bar order.
plt.figure(figsize = (10, 4))
sns.countplot(data = data, x = 'categoryId', order = data['categoryId'].value_counts().index)
plt.xticks(rotation = 90)
plt.show()

`Entertainment` has the most videos and `Nonprofits & Activism` has the fewest.

In [None]:
# Mean likes per category, bars ordered from the highest to the lowest mean
likes_order = (
    data.groupby('categoryId')['likes']
    .mean()
    .sort_values(ascending = False)
    .index
)

plt.figure(figsize = (10, 6))
sns.barplot(x = 'categoryId', y = 'likes', data = data, order = likes_order, ci = 0)

plt.xticks(rotation = 90)
plt.show()

- `Pets & Animals` videos have the highest average likes and `News & Politics` videos have the lowest average likes.

In [None]:
# Mean comment count per category, bars ordered from the highest to the lowest mean
comment_order = (
    data.groupby('categoryId')['comment_count']
    .mean()
    .sort_values(ascending = False)
    .index
)

plt.figure(figsize = (10, 6))
sns.barplot(x = 'categoryId', y = 'comment_count', data = data, order = comment_order, ci = 0)

plt.xticks(rotation = 90)
plt.show()

`Music` videos have the highest average comment count and `Nonprofits & Activism` videos have the lowest.

In [None]:
# Mean 'daysTakenToTrend' per category, bars ordered from the highest to the lowest mean.
# NOTE(review): no visible cell creates 'daysTakenToTrend' on `data` (the per-video frame `df`
# has 'hoursTakenToTrend' instead), so this cell raises KeyError as written - confirm the
# intended frame/column.
plt.figure(figsize = (10, 6))
sns.barplot(data = data, x = 'categoryId', y = 'daysTakenToTrend',
            order = data.groupby('categoryId')['daysTakenToTrend'].mean().sort_values(ascending = False).index, ci = 0)

plt.xticks(rotation = 90)
plt.show()

It is interesting to note that `News & Politics` videos take less time to trend and `Music`, `Comedy` and `Pets & Animals` videos take more time to trend.

In [None]:
# Number of rows with comments enabled vs. disabled.
# The frame and the column are passed by keyword: what a positional Series means differs
# between seaborn versions.
sns.countplot(data = data, x = 'comments_disabled')
plt.show()

In [None]:
# Categories of the rows whose ratings are disabled, most frequent category first.
# The filter is computed once and the frame/column are passed by keyword (what a positional
# Series means differs between seaborn versions).
ratings_off = data[data['ratings_disabled']]

plt.figure(figsize = (10, 5))
sns.countplot(data = ratings_off, x = 'categoryId',
              order = ratings_off['categoryId'].value_counts().index)
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Categories of the rows that carry no tags (tags == '[None]'), most frequent category first.
# The filter is computed once, the bar order comes from the plotted column itself rather than
# from counting 'ratings_disabled', and the frame/column are passed by keyword (what a
# positional Series means differs between seaborn versions).
untagged = data[data['tags'] == '[None]']

plt.figure(figsize = (10, 5))
sns.countplot(data = untagged, x = 'categoryId',
              order = untagged['categoryId'].value_counts().index)
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Let's extract weekday from the trendingdate
data['day'] = data['trending_date'].dt.day_name()

In [None]:
# Let's plot number of trending videos for each day of the week
# The frame and the column are passed by keyword: what a positional Series means differs
# between seaborn versions.
sns.countplot(data = data, x = 'day')
plt.show()

In [None]:
# Daily several videos trend but which video trends for the longest number of days will be the question.

In [None]:
# Modify
# Trending rows per category, one bar per weekday within each category
fig, ax = plt.subplots(figsize = (15, 4))
sns.countplot(x = 'categoryId', hue = 'day', data = data, ax = ax)

In [None]:
# categories that are taking minimum or moderate or maximum number of days to trend
# Bar chart of the per-category variance of 'daysTakenToTrend', sorted in ascending order.
# NOTE(review): no visible cell creates 'daysTakenToTrend' on `data`, so this raises KeyError
# as written - confirm the intended frame/column.
(data.groupby('categoryId')['daysTakenToTrend'].var().sort_values()).plot.bar()

In [None]:

# Likes per dislike, rounded to 2 decimals.
# Rows with 0 dislikes (present in the frames displayed above) would make the division yield
# inf (or NaN for 0/0); masking the zero divisor gives a plain missing value instead.
data['like/dislike ratio'] = (data['likes'] / data['dislikes'].replace(0, np.nan)).round(2)

In [None]:
# Modify
# Bivariate histogram: like/dislike ratio (x) against category (y)
fig, ax = plt.subplots(figsize = (8, 10))
sns.histplot(x = 'like/dislike ratio', y = 'categoryId', data = data, ax = ax)