In [1]:
!pip install opendatasets



In [2]:
import opendatasets as od
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import warnings
import numpy as np

# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings raised by the plotting/data libraries.
warnings.filterwarnings('ignore')

In [3]:
# Kaggle page of the YouTube trending-video dataset that is downloaded below.
dataset = 'https://www.kaggle.com/datasets/rsrishav/youtube-trending-video-dataset?select=IN_youtube_trending_data.csv'

# Parent folder of the current working directory; the dataset is downloaded into it.
# os.path.dirname replaces the manual split on '\\', which only worked on Windows
# (on any other platform the split produced an empty string).
final_dir = os.path.dirname(os.getcwd())

In [4]:
# Download the Kaggle dataset into `final_dir` (opendatasets skips the download
# when the files are already present, unless force=True is passed).
od.download(dataset, data_dir=final_dir)

# From here on `final_dir` holds the full path of the trending-videos CSV.
# os.path.join builds the path with the separator of the current platform.
# NOTE(review): the URL above selects the IN file but the US file is read here — confirm which is intended.
# NOTE(review): re-running this cell appends the sub-path again; restart the kernel before re-running.
final_dir = os.path.join(final_dir, 'youtube-trending-video-dataset', 'US_youtube_trending_data.csv')

Skipping, found downloaded files in "C:\Users\timbe\Final Project\youtube-trending-video-dataset" (use force=True to force download)


In [5]:
# Load the trending-videos CSV whose path was assembled in the previous cell
data = pd.read_csv(filepath_or_buffer=final_dir)

# Preview the first three rows
data.head(n=3)

Unnamed: 0,video_id,title,publishedAt,channelId,channelTitle,categoryId,trending_date,tags,view_count,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,description
0,3C66w5Z0ixs,I ASKED HER TO BE MY GIRLFRIEND...,2020-08-11T19:20:14Z,UCvtRTOMP2TqYqu51xNrqAzg,Brawadis,22,2020-08-12T00:00:00Z,brawadis|prank|basketball|skits|ghost|funny vi...,1514614,156908,5855,35313,https://i.ytimg.com/vi/3C66w5Z0ixs/default.jpg,False,False,SUBSCRIBE to BRAWADIS ▶ http://bit.ly/Subscrib...
1,M9Pmf9AB4Mo,Apex Legends | Stories from the Outlands – “Th...,2020-08-11T17:00:10Z,UC0ZV6M2THA81QT9hrVWJG3A,Apex Legends,20,2020-08-12T00:00:00Z,Apex Legends|Apex Legends characters|new Apex ...,2381688,146739,2794,16549,https://i.ytimg.com/vi/M9Pmf9AB4Mo/default.jpg,False,False,"While running her own modding shop, Ramya Pare..."
2,J78aPJ3VyNs,I left youtube for a month and THIS is what ha...,2020-08-11T16:34:06Z,UCYzPXprvl5Y-Sf0g4vX-m6g,jacksepticeye,24,2020-08-12T00:00:00Z,jacksepticeye|funny|funny meme|memes|jacksepti...,2038853,353787,2628,40221,https://i.ytimg.com/vi/J78aPJ3VyNs/default.jpg,False,False,I left youtube for a month and this is what ha...


In [6]:
# Dump every value of the 'video_id' column into VideoIDs.txt as one comma-separated line
with open('VideoIDs.txt', mode='w') as id_file:
    joined_ids = ",".join(data['video_id'])
    id_file.write(joined_ids)

In [7]:
# (rows, columns) of the freshly loaded frame
data.shape

(185990, 16)

In [12]:
# Publish timestamps of every row recorded for one sample video id
is_sample_video = data['video_id'] == "KOIgb6lQ1JY"
data.loc[is_sample_video, 'publishedAt']

838     2020-08-14T20:30:00Z
1073    2020-08-14T20:30:00Z
1315    2020-08-14T20:30:00Z
1555    2020-08-14T20:30:00Z
1791    2020-08-14T20:30:00Z
Name: publishedAt, dtype: object

In [None]:
# First two video ids repeated 100 times (a 200-element list).
# NOTE(review): the result is not assigned to anything — looks like leftover scratch work; confirm and remove.
data['video_id'][:2].to_list()*100

In [None]:
# Checking the shape of the DataFrame as (rows, columns), before any columns are dropped
data.shape

In [None]:
# Dropping ID columns as they are not necessary for the analysis and prediction.
# The result is re-assigned (no inplace=True) and errors='ignore' is used so that
# re-running this cell does not raise a KeyError once the columns are already gone.

data = data.drop(columns = ['video_id', 'channelId'], errors = 'ignore')

# Validating the above code

data.head(3)

In [None]:
# Checking null values

def null_values(df):
    """Summarise the missing values of every column of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of `df`, with the columns
        'Count' (number of missing values, integer) and
        'Percentage' (share of missing values in percent, rounded to 2 decimals),
        sorted by 'Count' in descending order.
    """
    count = df.isna().sum()
    n_rows = len(df)

    # An empty frame has nothing missing; avoid the 0/0 division that would give NaN percentages.
    percentage = (count * 100 / n_rows).round(2) if n_rows else count.astype(float)

    # Building from a dict keeps 'Count' as integers (stacking both Series as rows upcasts it to float).
    summary = pd.DataFrame({'Count': count, 'Percentage': percentage})

    # mergesort is stable, so columns with equal counts keep their original order.
    return summary.sort_values('Count', ascending = False, kind = 'mergesort')


# Missing-value summary (count and percentage per column) of the current frame
null_values(data)

We can see that around 10% of the values in the `description` column are missing, and only 1 value is missing in the `channelTitle` column.
- We shall keep the `description` column as it is during the analysis and treat its null values during model building.
- We shall delete the one row that has a null value in the `channelTitle` column.

In [None]:
# Deleting one row where there is null value in 'channelTitle' column.
# The filtered frame is re-assigned instead of mutating `data` in place.

data = data.dropna(subset = ['channelTitle'])

# Validating the above code: the 'Count' reported for channelTitle should now be 0

null_values(data).loc['channelTitle']

In [None]:
# Let's check the datatypes of each column in DataFrame
# (one dtype per column; text columns are reported as `object`)

data.dtypes

- `publishedAt` and `trending_date` are of object type. Let's convert them to datetime format.

In [None]:
# Parse both timestamp columns from strings into pandas datetime values

for date_col in ('publishedAt', 'trending_date'):
    data[date_col] = pd.to_datetime(data[date_col])

# Confirm the new dtypes of the two converted columns

data.dtypes[['publishedAt', 'trending_date']]

In [None]:
# 'daysTakenToTrend' = whole days elapsed between publishing a video and the date it was trending

time_to_trend = data['trending_date'] - data['publishedAt']
data['daysTakenToTrend'] = time_to_trend.dt.days

# Preview the frame with the new column

data.head(3)

In [None]:
# Distinct category ids present in the data
data['categoryId'].unique()

We can see that the `categoryId` column holds the ids of the respective categories. We can look up the description of each `categoryId` in the `US_category_id.json` file.

In [None]:
# Let's import US_category_id.json file and map the category id's respectively

# The category file is expected in the same folder as the CSV that `final_dir` points to.
# os.path is used instead of splitting on '\\' so the path is built with the platform's separator.
category_path = os.path.join(os.path.dirname(final_dir), 'US_category_id.json')


# Creating a dictionary object which stores the category id and its respective category
with open(category_path, 'r', encoding = 'utf-8') as file:
    json_data = json.load(file)

category_dict = {int(item['id']): item['snippet']['title'] for item in json_data['items']}

# Values without an entry in the dictionary are left untouched. This keeps the cell
# re-runnable: once the ids have been replaced by titles, a second run is a no-op
# instead of a KeyError.
data['categoryId'] = data['categoryId'].map(lambda x: category_dict.get(x, x))

# Validating the above code
data['categoryId'].head()

<b> Note: </b>
- Although we are using the IN data for the analysis, the `IN_category_id.json` file was missing some data.
- Upon research, we found out that the `id` and `title` are the same irrespective of the country.
- Hence, we have used `US_category_id.json` in the above case.

In [None]:
# Checking the DataFrame after the changes (first five rows)

data.head()

In [None]:
# Let's plot a Boxplot for 'daysTakenToTrend' column
# The series is passed explicitly as `x`: a bare positional argument is deprecated
# in older seaborn releases and is interpreted as `data` in newer ones.

sns.boxplot(x = data['daysTakenToTrend'])
plt.show()

There are some values less than 0. It is impossible for a video to trend before it was published.

In [None]:
# Histogram of the negative 'daysTakenToTrend' values only
negative_days = data.loc[data['daysTakenToTrend'] < 0, 'daysTakenToTrend']
negative_days.plot.hist()

However, we can see that the negative values do not go below -1. This might be because of some server lag or a timezone difference. Let's convert those negative days to 0.

In [None]:
# Converting negative days in 'daysTakenToTrend' column to 0
# clip() does this in one vectorised step instead of calling a Python lambda per row.

data['daysTakenToTrend'] = data['daysTakenToTrend'].clip(lower = 0)

In [None]:
# Plotting boxplot after the changes
# The series is passed explicitly as `x`: a bare positional argument is deprecated
# in older seaborn releases and is interpreted as `data` in newer ones.

sns.boxplot(x = data['daysTakenToTrend'])
plt.show()

In [None]:
def _count_tags(tags):
    """Return the number of '|'-separated tags in one value of the `tags` column.

    Missing (non-string) values and the literal '[None]' count as zero tags.
    '[None]' is presumably the dataset's placeholder for videos without tags
    (this notebook filters on `tags == '[None]'` for that purpose); counting it
    as one tag would overstate the tag count of untagged videos.
    """
    if not isinstance(tags, str) or tags == '[None]':
        return 0
    return len(tags.split('|'))


# Number of tags attached to each video
data['tagCount'] = data['tags'].apply(_count_tags)

In [None]:
# Distribution of the number of tags per video, with a KDE curve on top
fig, tag_ax = plt.subplots(figsize = (10, 5))
sns.histplot(data = data['tagCount'], kde = True, ax = tag_ax)
plt.show()

In [None]:
# Pairwise correlation of the numerical columns, drawn as an annotated heatmap

numeric_cols = ['view_count', 'likes', 'dislikes', 'comment_count', 'daysTakenToTrend', 'tagCount']
corr_matrix = data[numeric_cols].corr()

plt.figure(figsize = (10, 8))
sns.heatmap(corr_matrix, linewidths=.5, annot=True, cmap='coolwarm')
plt.show()

- `view_count` and `likes` are highly correlated: a video with more views is likely to have more likes.
- `comment_count` is more strongly correlated with `likes` than with `view_count`.
- `daysTakenToTrend` is not correlated with any other feature. This is interesting: the number of days a video takes to trend cannot be inferred from its comment_count, dislikes, likes or view_count.

In [None]:
######################### likes per view
# TODO(review): this cell contains only the heading above — presumably a planned
# "likes per view" analysis that was never written; implement it or remove the cell.

In [None]:
# Checking number of videos based on each Category
# value_counts() already returns the categories sorted from most to least frequent.
category_order = data['categoryId'].value_counts().index

plt.figure(figsize = (10, 4))
# The series is passed explicitly as `x`: a bare positional argument is deprecated
# in older seaborn releases and is interpreted as `data` in newer ones.
sns.countplot(x = data['categoryId'], order = category_order)
plt.xticks(rotation = 90)
plt.show()

`Entertainment` has the most videos, and `Nonprofits & Activism` has the fewest.

In [None]:
# Average likes per category, highest average first
likes_order = data.groupby('categoryId')['likes'].mean().sort_values(ascending = False).index

plt.figure(figsize = (10, 6))
# ci = None draws no error bars and skips the bootstrap resampling altogether;
# ci = 0 still ran the bootstrap only to draw a zero-width interval.
sns.barplot(data = data, x = 'categoryId', y = 'likes', order = likes_order, ci = None)

plt.xticks(rotation = 90)
plt.show()

- `Pets & Animals` videos have the highest average likes and `News & Politics` videos have the lowest average likes.

In [None]:
# Average comment count per category, highest average first
comments_order = data.groupby('categoryId')['comment_count'].mean().sort_values(ascending = False).index

plt.figure(figsize = (10, 6))
# ci = None draws no error bars and skips the bootstrap resampling altogether;
# ci = 0 still ran the bootstrap only to draw a zero-width interval.
sns.barplot(data = data, x = 'categoryId', y = 'comment_count', order = comments_order, ci = None)

plt.xticks(rotation = 90)
plt.show()

`Music` videos have the highest average comment count and `Nonprofits & Activism` videos have the lowest.

In [None]:
# Average number of days taken to trend per category, highest average first
days_order = data.groupby('categoryId')['daysTakenToTrend'].mean().sort_values(ascending = False).index

plt.figure(figsize = (10, 6))
# ci = None draws no error bars and skips the bootstrap resampling altogether;
# ci = 0 still ran the bootstrap only to draw a zero-width interval.
sns.barplot(data = data, x = 'categoryId', y = 'daysTakenToTrend', order = days_order, ci = None)

plt.xticks(rotation = 90)
plt.show()

It is interesting to note that `News & Politics` videos take less time to trend and `Music`, `Comedy` and `Pets & Animals` videos take more time to trend.

In [None]:
# Number of rows with comments enabled vs. disabled.
# The series is passed explicitly as `x`: a bare positional argument is deprecated
# in older seaborn releases and is interpreted as `data` in newer ones.
sns.countplot(x = data['comments_disabled'])
plt.show()

In [None]:
# Categories of the rows whose ratings are disabled, most frequent category first.
# The filtered frame is computed once and reused for both the plot and its ordering.
ratings_off = data[data['ratings_disabled'] == True]
ratings_off_order = ratings_off.groupby('categoryId')['ratings_disabled'].count().sort_values(ascending = False).index

plt.figure(figsize = (10, 5))
# The series is passed explicitly as `x`: a bare positional argument is deprecated
# in older seaborn releases and is interpreted as `data` in newer ones.
sns.countplot(x = ratings_off['categoryId'], order = ratings_off_order)
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Categories of the rows whose `tags` value is the literal '[None]', most frequent category first.
# The filtered frame is computed once and reused for both the plot and its ordering.
no_tags = data[data['tags'] == '[None]']
no_tags_order = no_tags.groupby('categoryId')['ratings_disabled'].count().sort_values(ascending = False).index

plt.figure(figsize = (10, 5))
# The series is passed explicitly as `x`: a bare positional argument is deprecated
# in older seaborn releases and is interpreted as `data` in newer ones.
sns.countplot(x = no_tags['categoryId'], order = no_tags_order)
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Let's extract weekday from the trendingdate
# (day_name() yields the weekday name of each trending date)
data['day'] = data['trending_date'].dt.day_name()

In [None]:
# Let's plot number of trending videos for each day of the week
# The series is passed explicitly as `x`: a bare positional argument is deprecated
# in older seaborn releases and is interpreted as `data` in newer ones.
sns.countplot(x = data['day'])
plt.show()

In [None]:
# Trending rows per category, split by the weekday on which they were trending
_, category_day_ax = plt.subplots(figsize = (15,4))
sns.countplot(x = 'categoryId', hue = 'day', data = data, ax = category_day_ax)

In [None]:
# Variance of 'daysTakenToTrend' within each category, plotted from smallest to largest
days_variance = data.groupby('categoryId')['daysTakenToTrend'].var()
days_variance.sort_values().plot.bar()

In [None]:
# Likes divided by dislikes, rounded to 2 decimals.
# Rows with 0 dislikes have no defined ratio: replacing the 0 with NaN makes the
# result NaN instead of inf, which would otherwise distort the histogram drawn from this column.
data['like/dislike ratio'] = (data['likes'] / data['dislikes'].replace(0, np.nan)).round(2)

In [None]:
# 2-D histogram: like/dislike ratio (x axis) against category (y axis)
_, ratio_ax = plt.subplots(figsize = (8, 10))
sns.histplot(x = 'like/dislike ratio', y = 'categoryId', data = data, ax = ratio_ax)

In [None]:
# Spread of 'daysTakenToTrend' within each category, one horizontal box per category
sns.boxplot(y = 'categoryId', x = 'daysTakenToTrend', data = data)