In [1]:
# Import all the necessary packages

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')

In [3]:
# NOTE(review): scratch cell that only prints the literal 5; it defines nothing, so it looks safe to delete.
print(5)

5


In [4]:
# Load the US comments dump. Rows with the wrong number of fields are skipped
# (and reported) instead of aborting the load. `on_bad_lines='warn'` is the
# replacement for `error_bad_lines=False`, which was deprecated in pandas 1.3
# and removed in pandas 2.0.
all_comment = pd.read_csv('UScomments.csv', on_bad_lines='warn')

b'Skipping line 41589: expected 4 fields, saw 11\nSkipping line 51628: expected 4 fields, saw 7\nSkipping line 114465: expected 4 fields, saw 5\n'
b'Skipping line 142496: expected 4 fields, saw 8\nSkipping line 189732: expected 4 fields, saw 6\nSkipping line 245218: expected 4 fields, saw 7\n'
b'Skipping line 388430: expected 4 fields, saw 5\n'


In [5]:
# Display the loaded US YouTube comments (one row per comment, keyed by video_id);
# pandas truncates the middle rows when rendering a frame this large.
all_comment

Unnamed: 0,video_id,comment_text,likes,replies
0,XpVt6Z1Gjjo,Logan Paul it's yo big day ‼️‼️‼️,4,0
1,XpVt6Z1Gjjo,I've been following you from the start of your...,3,0
2,XpVt6Z1Gjjo,Say hi to Kong and maverick for me,3,0
3,XpVt6Z1Gjjo,MY FAN . attendance,3,0
4,XpVt6Z1Gjjo,trending 😉,3,0
...,...,...,...,...
691395,EoejGgUNmVU,Лучшая,1,0
691396,EoejGgUNmVU,qu'est ce que j'aimerais que tu viennes à Roan...,0,0
691397,EoejGgUNmVU,Ven a mexico! 😍 te amo LP,0,0
691398,EoejGgUNmVU,Islığı yeter...,0,0


In [6]:
# Inspect column dtypes and non-null counts to find out which columns have missing values:
all_comment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 691400 entries, 0 to 691399
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   video_id      691400 non-null  object
 1   comment_text  691375 non-null  object
 2   likes         691400 non-null  object
 3   replies       691400 non-null  object
dtypes: object(4)
memory usage: 21.1+ MB


In [7]:
# Drop every row that contains a missing value (modifies all_comment in place):
all_comment.dropna(inplace = True)

In [8]:
all_comment.isnull().sum()
# Per-column count of null values; all zeros means no missing values remain.

video_id        0
comment_text    0
likes           0
replies         0
dtype: int64

### 1.Analyse the positive and negative comments with TextBlob and WordCloud:

In [9]:
# Using TextBlob to create positive and negative comments in the data:

In [10]:
from textblob import TextBlob

In [11]:
# Example: the polarity score TextBlob assigns to a clearly positive sentence.
TextBlob('so good happy for you').sentiment.polarity

0.75

In [None]:
# Build a list with the TextBlob polarity of each comment, in row order.
# A comment that TextBlob cannot score gets a neutral polarity of 0 so the list
# stays aligned with the rows of all_comment.
polarity = []
for comment in all_comment['comment_text']:
    try:
        polarity.append(TextBlob(comment).sentiment.polarity)
    except Exception:
        # `except Exception` (not a bare `except:`) keeps the best-effort fallback
        # but lets KeyboardInterrupt/SystemExit through, so this long-running
        # loop can still be interrupted from the notebook.
        polarity.append(0)
        

In [None]:
# Store the computed scores as a new 'polarity' column.
all_comment['polarity'] = polarity

In [None]:
# Preview the first rows to check the new 'polarity' column.
all_comment.head()

In [None]:
# Keep only the extreme-sentiment comments: polarity exactly 1 (positive)
# and polarity exactly -1 (negative).
cmt_positive = all_comment.loc[all_comment['polarity'].eq(1)]
cmt_negative = all_comment.loc[all_comment['polarity'].eq(-1)]

In [None]:
# First rows of the positive comments (polarity == 1.0):
cmt_positive.head()

In [None]:
# First rows of the negative comments (polarity == -1.0):
cmt_negative.head()

In [None]:
# Using WordCloud to show which words are used the most in positive and negative comments:

In [None]:
from wordcloud import WordCloud, STOPWORDS

In [None]:
# Merge all comments of each sentiment into one text for the word cloud.
# Join with a space: with an empty separator the last word of one comment is
# fused with the first word of the next, producing bogus tokens in the cloud.
total_cmt_p = ' '.join(cmt_positive['comment_text'])
total_cmt_n = ' '.join(cmt_negative['comment_text'])

In [None]:
# Word cloud of the positive comments (stopwords removed), saved to the file 'positive'.
wordcloud = WordCloud(stopwords=set(STOPWORDS)).generate(total_cmt_p)
fig, ax = plt.subplots(figsize=(12, 5))
ax.imshow(wordcloud)
ax.axis('off')
fig.savefig('positive')

In [None]:
# Word cloud of the negative comments (stopwords removed), saved to the file 'negative'.
wordcloud = WordCloud(stopwords=set(STOPWORDS)).generate(total_cmt_n)
fig, ax = plt.subplots(figsize=(12, 5))
ax.imshow(wordcloud)
ax.axis('off')
fig.savefig('negative')

### 2.Performing Emoji analysis

In [None]:
import emoji

In [None]:
# Create a list of every emoji character used in the comments
# (one entry per occurrence, so repeated emoji appear repeatedly).
emoji_list = [
    char
    for comment in all_comment['comment_text']
    for char in comment
    if char in emoji.UNICODE_EMOJI_ENGLISH
]

In [None]:
from collections import Counter
# Counter helps us count how many times each emoji occurs.

In [None]:
common_emo = Counter(emoji_list).most_common(10)
# List of (emoji, count) pairs for the (up to) 10 most frequent emoji, most common first.

In [None]:
# Show the (emoji, count) pairs.
common_emo

In [None]:
# x-axis values: the emoji themselves. Unpacking the (emoji, count) pairs
# directly avoids the IndexError that indexing with range(10) raises when
# fewer than 10 distinct emoji were found.
emo = [symbol for symbol, _count in common_emo]
emo

In [None]:
# y-axis values: how often each emoji occurs. Unpacking the (emoji, count)
# pairs directly avoids the IndexError that indexing with range(10) raises
# when fewer than 10 distinct emoji were found.
freq = [count for _symbol, count in common_emo]
freq

In [None]:
# Now we have x-axis and y-axis so this is time for building a bar chart.
import plotly.graph_objs as go
from plotly.offline import iplot

In [None]:
# Interactive bar chart: one bar per emoji (x) with its occurrence count (y).
trace = go.Bar(x = emo, y = freq)
iplot([trace])

#### Comment: As we can see, most of the top emoji express happiness, approval, or encouragement. This suggests that viewers use more emoji when they love a video: the more emoji they use, the more they seem to love it. If people don't use emoji in their comments, it is worth considering whether they actually like your video.

### (*) Collecting the YouTube video data

In [None]:
# Import the additional video data first.

In [None]:
# To do this we need the os library to list all the file names we want:
import os

In [None]:
# Directory (relative to the working directory) that holds the additional video data files.
path = r'./additional_data'

In [None]:
file_name = os.listdir(path)
file_name
# All the file names found in the directory (this dataset only contains YouTube video data
# for some developed countries, not the whole world).
# NOTE: os.listdir returns the names in arbitrary order.

In [None]:
# Number of entries found in the data directory.
len(file_name)

In [None]:
# Keep only the CSV files, in a deterministic (sorted) order.
# Selecting by extension does not depend on the arbitrary order returned by
# os.listdir, nor on the directory containing exactly 20 entries, which the
# positional slice range(0, 20, 2) silently assumed.
file_csv = sorted(name for name in file_name if name.lower().endswith('.csv'))
file_csv

In [None]:
# Read every selected CSV and stack them into a single frame.
# - encoding='iso-8859-1' is needed to decode these files without errors.
# - on_bad_lines='warn' skips (and reports) malformed rows; it replaces
#   error_bad_lines=False, deprecated in pandas 1.3 and removed in 2.0.
# - The frames are collected first and concatenated once, instead of
#   re-copying the growing frame on every loop iteration.
# - ignore_index=True gives the result a unique 0..n-1 index rather than
#   repeating each file's own row numbers.
frames = [
    pd.read_csv(os.path.join(path, file), on_bad_lines='warn', encoding='iso-8859-1')
    for file in file_csv
]
# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Preview the first rows of the combined video data.
df.head()

### 3. Finding out which video category has the maximum likes

In [None]:
# Import the category file (fields are separated by ':'):
cat = pd.read_csv('category_file.txt', delimiter = ':')
# This step will help us create a dictionary that maps each category ID to its category name.

In [None]:
# Preview the raw category table as parsed.
cat.head()

In [None]:
# Move the current index into a regular column (in place).
# NOTE(review): presumably the IDs were parsed as the index — confirm against the file.
# Not idempotent: re-running this cell adds yet another column.
cat.reset_index(inplace = True)

In [None]:
# Give the two columns explicit names (fails if the frame does not have exactly two columns).
cat.columns = ['Category_id', 'Category_name']

In [None]:
# Index the table by category ID so it can be turned into an ID -> name lookup.
cat = cat.set_index('Category_id')

In [None]:
# Preview the table now indexed by Category_id.
cat.head()

In [None]:
# to_dict() returns {column name: {index value: cell value}}.
cat_dict = cat.to_dict()

In [None]:
# Keep only the inner {Category_id: Category_name} mapping.
# Not idempotent: re-running this cell alone raises KeyError because the outer key is gone.
cat_dict = cat_dict['Category_name']

In [None]:
cat_dict
# Now we have the category ID -> category name dictionary we need.

In [None]:
# Using the dictionary we have just created, add a column containing the category name of
# each video in df. IDs that are not keys of the dictionary become NaN.
df['category_name'] = df['category_id'].map(cat_dict)

In [None]:
df.head()
# 'category_name' now appears as the last column of the frame.

In [None]:
sns.set_theme(style = 'whitegrid') # Use seaborn's white-grid theme for the following plots so the charts are easier to read.

In [None]:
# Box plot of likes per category, to see which categories get the most likes on YouTube:
plt.figure(figsize = (10,3))
sns.boxplot(data = df, x = 'category_name', y = 'likes')
plt.xticks(rotation = 'vertical')
# Save BEFORE showing: after plt.show() the figure is no longer the current one,
# so a later plt.savefig() would write a blank image.
plt.savefig('mostlikes.png')
plt.show()

In [None]:
# We can also find out which categories have the most views:
fig, ax = plt.subplots(figsize=(10, 3))
sns.boxplot(x='category_name', y='views', data=df, ax=ax)
plt.xticks(rotation='vertical')
fig.savefig('mostviews.png')
plt.show()

#### Comment: From the box plots above, we can see that the Music and Entertainment categories have the videos with the most views and likes on YouTube. That means most people come to YouTube to enjoy music videos and relax with entertainment ones. Trailers have fewer views and likes because they are not released frequently, and people may regard them as advertisements.
#### If you want a better chance of getting more views or likes on YouTube, try making music content, such as creating new songs, covering or remixing famous songs in your own style, or releasing high-quality MVs... Or you can try entertainment content, such as doing challenges, sharing interesting things that happen in your day, or reacting to other videos...

### 5.Finding out whether the audience is engaged or not

In [None]:
# Engagement rates: likes, dislikes and comments expressed as a percentage of views.
df['likes_rate'] = df['likes'].div(df['views']).mul(100)
df['dislikes_rate'] = df['dislikes'].div(df['views']).mul(100)
df['comment_count_rate'] = df['comment_count'].div(df['views']).mul(100)

In [None]:
# Preview the frame with the three new rate columns.
df.head()

In [None]:
# Distribution of the like rate (likes as % of views) for each category.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x='category_name', y='likes_rate', data=df, ax=ax)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Distribution of the comment rate (comments as % of views) for each category.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x='category_name', y='comment_count_rate', data=df, ax=ax)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Analysing relationship between views and likes (dislikes, comment count):

In [None]:
# The four count columns whose pairwise relationships are analysed.
df[['views', 'likes', 'dislikes', 'comment_count']]

In [None]:
# Heatmap of the pairwise correlation matrix; annot=True prints each coefficient in its cell.
sns.heatmap(df[['views', 'likes', 'dislikes', 'comment_count']].corr(), annot = True)

In [None]:
## --> Views have a good correlation with Likes and have a poor correlation with Dislikes

In [None]:
# Scatter plot with a fitted regression line: likes against views.
fig, ax = plt.subplots()
sns.regplot(x='views', y='likes', data=df, color='blue', ax=ax)
plt.show()

In [None]:
# Scatter plot with a fitted regression line: dislikes against views.
fig, ax = plt.subplots()
sns.regplot(x='views', y='dislikes', data=df, color='red', ax=ax)
plt.show()

In [None]:
# Scatter plot with a fitted regression line: comment count against views.
fig, ax = plt.subplots()
sns.regplot(x='views', y='comment_count', data=df, color='green', ax=ax)
plt.show()

### 6. Which YouTube channels have the largest number of trending videos?

In [None]:
# Create a data containing channel titles and the number of their videos
# Work on a copy so the channel analysis cannot modify df.
df_chan = df.copy()

In [None]:
# Count the rows (non-null video_id entries) per channel, largest first.
# df_chan is rebound here from a DataFrame to a Series indexed by channel_title.
# NOTE(review): a video appearing in several rows is counted once per row — confirm this is intended.
df_chan = df_chan.groupby('channel_title')['video_id'].count().sort_values(ascending = False)

In [None]:
# The 10 channels with the highest counts (the Series is already sorted in descending order).
top10_chan = df_chan.head(10)

In [None]:
# Show the top-10 Series (channel_title -> count).
top10_chan

In [None]:
# Convert the Series into a two-column frame (channel_title, total_video) for plotting.
top10_chan = top10_chan.to_frame().reset_index().rename(columns = {'video_id':'total_video'})

In [None]:
# Show the resulting top-10 frame.
top10_chan

In [None]:
# Bar plot of the number of videos for each of the top-10 channels:
fig, ax = plt.subplots()
sns.barplot(x='channel_title', y='total_video', data=top10_chan, ax=ax)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
## --> The Late Show channels have the largest number of trending videos in this dataset

In [None]:
#Using plotly:

In [None]:
import plotly.express as px

In [None]:
# The same top-10 bar chart as an interactive plotly figure.
px.bar(data_frame = top10_chan, x = 'channel_title', y = 'total_video')

### 7. Does Punctuations in title and tags have any relation with views, likes, dislikes comments?

In [None]:
import string

In [None]:
def count_punctuation(x):
    """Return the number of ASCII punctuation characters in ``x``.

    Parameters
    ----------
    x : str or iterable of str
        Text to scan, e.g. a video title. ``None`` and float values (pandas
        represents a missing string as the float NaN) are treated as
        containing no punctuation.

    Returns
    -------
    int
        How many items of ``x`` occur in ``string.punctuation``.
    """
    # Missing titles reach .apply() as None/NaN; neither is iterable, so
    # return 0 instead of letting the comprehension raise TypeError.
    if x is None or isinstance(x, float):
        return 0
    return sum(1 for ch in x if ch in string.punctuation)

In [None]:
# Take the first 10,000 rows as an independent copy. A column is added to
# `sample` afterwards; without .copy() that assignment targets a slice of df,
# which triggers SettingWithCopyWarning and leaves it unclear whether df is affected.
# NOTE(review): head() takes the first rows, not a random sample — confirm this is intended.
sample = df.head(10000).copy()

In [None]:
# Number of punctuation characters in each title, stored as a new 'punctuation' column.
sample['punctuation'] = sample['title'].apply(count_punctuation)

In [None]:
# Preview the sample with the new 'punctuation' column.
sample.head()

In [None]:
# Distribution of views for each punctuation count in the title.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x='punctuation', y='views', data=sample, ax=ax)
plt.show()

In [None]:
# Distribution of likes for each punctuation count in the title.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x='punctuation', y='likes', data=sample, ax=ax)
plt.show()

In [None]:
# Distribution of comment counts for each punctuation count in the title.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x='punctuation', y='comment_count', data=sample, ax=ax)
plt.show()

In [None]:
# --> Videos having 1-3 punctuations in their titles have more views, likes, and comments from the audience.