In [2]:
import os
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import collections
import json
from textblob import TextBlob, Word
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer 
import time
import re

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


# Data Loading

In [4]:
dfs = {}

path = '../US_Datasets/Data'
for file in os.listdir(path):
    #print(file)
    df = pd.read_csv(path + '/' + file)
    date = datetime.strptime(file.split('_')[0], '%y.%d.%m').date()
    dfs[date] = df
    
dfs = collections.OrderedDict(sorted(dfs.items()))
print(dfs.keys())

odict_keys([datetime.date(2019, 12, 11), datetime.date(2019, 12, 12), datetime.date(2019, 12, 13), datetime.date(2019, 12, 14), datetime.date(2019, 12, 15), datetime.date(2019, 12, 17), datetime.date(2019, 12, 18), datetime.date(2019, 12, 19), datetime.date(2019, 12, 20), datetime.date(2019, 12, 21), datetime.date(2019, 12, 22), datetime.date(2019, 12, 23), datetime.date(2019, 12, 24), datetime.date(2019, 12, 25), datetime.date(2019, 12, 27), datetime.date(2019, 12, 29), datetime.date(2020, 1, 2), datetime.date(2020, 1, 3), datetime.date(2020, 1, 4), datetime.date(2020, 1, 5), datetime.date(2020, 1, 6), datetime.date(2020, 1, 7), datetime.date(2020, 1, 8), datetime.date(2020, 1, 9), datetime.date(2020, 1, 10), datetime.date(2020, 1, 11), datetime.date(2020, 1, 12), datetime.date(2020, 1, 13), datetime.date(2020, 1, 14), datetime.date(2020, 1, 15), datetime.date(2020, 1, 16), datetime.date(2020, 1, 17), datetime.date(2020, 1, 18), datetime.date(2020, 1, 19), datetime.date(2020, 1, 20), 

In [5]:
class YTVideo:
    def __init__(self, df_row, days_in_trending=1):
        self.video_id = df_row.video_id
        self.title = df_row.title
        self.published_at = df_row.publishedAt
        self.channel_id = df_row.channelId
        self.channel_title = df_row.channelTitle
        self.category_id = df_row.categoryId
        self.trending_date = df_row.trending_date
        self.tags = df_row.tags
        self.view_count = df_row.view_count
        self.likes = df_row.likes
        self.dislikes = df_row.dislikes
        self.comment_count = df_row.comment_count
        self.thumbnail_link = df_row.thumbnail_link
        self.comments_disabled = df_row.comments_disabled
        self.ratings_disabled = df_row.ratings_disabled
        self.description = df_row.description
        self.days_in_trending = days_in_trending
        
    def __str__(self):
        retval = 'Video ID: {}\nTitle: {}\nPublished at: {}\nChannel ID: {}\nChannel title: {}\nCategory ID: {}\nTrending date: {}\nTags: {}\nView count: {}\nLikes: {}\nDislikes: {}\nComment count: {}\nThumbnail link: {}\nComments disabled: {}\nRatings disabled: {}\nDescription: {}\nDays in trending: {}\n'.format(self.video_id, self.title, self.published_at, self.channel_id, self.channel_title, self.category_id, self.trending_date, self.tags, self.view_count, self.likes, self.dislikes, self.comment_count, self.thumbnail_link, self.comments_disabled, self.ratings_disabled, self.description, self.days_in_trending)
        return retval
    
    def to_dict(self):
        return {
            'video_id': self.video_id,
            'title': self.title,
            'published_at': self.published_at,
            'channel_id': self.channel_id,
            'channel_title': self.channel_title,
            'category_id': self.category_id,
            'trending_date': self.trending_date,
            'tags': self.tags,
            'view_count': self.view_count,
            'likes': self.likes,
            'dislikes': self.dislikes,
            'comment_count': self.comment_count,
            'thumbnail_link': self.thumbnail_link,
            'comments_disabled': self.comments_disabled,
            'ratings_disabled': self.ratings_disabled,
            'description': self.description,
            'days_in_trending': self.days_in_trending
        }

In [6]:
first_day = list(dfs.values())[0].iloc[0].trending_date
last_day = list(dfs.values())[-1].iloc[0].trending_date
print('First day: {}'.format(first_day))
print('Last day: {}'.format(last_day))

videos = dict()

# u obzir dolaze videi koji su usli i izasli iz trendinga za vreme skupljanja podataka
for df in dfs.values():
    for index, row in df.iterrows():
        if row.video_id in videos:
            videos[row.video_id].days_in_trending += 1
        else:
            # provera da li video iz naseg prvog dana
            if row.trending_date != first_day and row.trending_date != last_day:
                videos[row.video_id] = YTVideo(row)

First day: 19.11.12
Last day: 20.13.02


In [7]:
print('Broj videa: {}'.format(len(videos)))

Broj videa: 2272


In [8]:
# kreiranje dataframe-a
videos_df = pd.DataFrame([video.to_dict() for video in videos.values()])

In [9]:
videos_df

Unnamed: 0,category_id,channel_id,channel_title,comment_count,comments_disabled,days_in_trending,description,dislikes,likes,published_at,ratings_disabled,tags,thumbnail_link,title,trending_date,video_id,view_count
0,24,UCJ0uqCI0Vqr2Rrt1HseGirg,The Late Late Show with James Corden,20068,False,1,Late Late Show guest host Harry Styles challen...,8256,571835,2019-12-11T07:08:34.000Z,False,The Late Late Show|Late Late Show|James Corden...,https://i.ytimg.com/vi/vPx6M7eTYbc/default.jpg,Spill Your Guts: Harry Styles & Kendall Jenner,19.12.12,vPx6M7eTYbc,11636632
1,26,UCDSJCBYqL7VQrlXfhr1RtwA,Les Do Makeup,13130,False,7,Hi babygirls! Thank you so much for watching ...,773,52780,2019-12-12T05:25:42.000Z,False,[none],https://i.ytimg.com/vi/sg8WaeeFyNY/default.jpg,WE GOT UNEXPECTED NEWS..,19.12.12,sg8WaeeFyNY,423215
2,17,UCQIUhhcmXsu6cN6n3y9-Pww,Jesser,931,False,7,"Last to miss the basketball shot wins 10,000!●...",586,20178,2019-12-11T23:00:53.000Z,False,last to leave|nba basketball|nba basketball hi...,https://i.ytimg.com/vi/q1PR05q8l2g/default.jpg,"Last To Miss Layup Wins $10,000",19.12.12,q1PR05q8l2g,463685
3,17,UCWJ2lWNubArHWmf3FIHbfcQ,NBA,668,False,2,LAKERS at MAGIC | FULL GAME HIGHLIGHTS | Decem...,288,4605,2019-12-12T02:35:33.000Z,False,sp:ty=high|sp:dt=2019-12-12T00:00:00Z|sp:st=ba...,https://i.ytimg.com/vi/t6Z6RIXq0L0/default.jpg,LAKERS at MAGIC | FULL GAME HIGHLIGHTS | Decem...,19.12.12,t6Z6RIXq0L0,659579
4,24,UCtj45MepAoKxZoyR_Mnt86Q,Royal Family,1907,False,6,LAST VIDEO ; https://youtu.be/okujHUu_hmQ*MAKE...,164,18400,2019-12-12T02:38:37.000Z,False,queen Naija|Medicine|Queen|Spicy|Royalty Squad...,https://i.ytimg.com/vi/TGDpRB4ovvA/default.jpg,TIPPING DRIVE-THRU WORKERS $100 FOR THE HOLIDA...,19.12.12,TGDpRB4ovvA,175558
5,24,UCilwZiBBfI9X6yiZRzWty8Q,FaZe Rug,4510,False,4,THEY WERE SO HAPPY!I went up to random strange...,1085,66567,2019-12-11T22:00:56.000Z,False,faze rug|rug|rugfaze|fazerug|christmas|christm...,https://i.ytimg.com/vi/aS_dFYGKoKg/default.jpg,Santa Buys Random Strangers ANYTHING They Want...,19.12.12,aS_dFYGKoKg,927083
6,24,UCG8rbF3g2AMX70yOd8vqIZg,Logan Paul,22355,False,7,Install Raid for Free ✅ IOS: http://bit.ly/Log...,33927,145538,2019-12-11T18:59:22.000Z,False,logan paul vlog|logan paul|logan|paul|olympics...,https://i.ytimg.com/vi/y7TlnQq6XzI/default.jpg,congrats ksi,19.12.12,y7TlnQq6XzI,2231082
7,23,UCpi8TJfiA4lKGkaXs__YdBA,The Try Guys,4686,False,3,Will #TheTryGuys survive a #MrBeast challenge?...,1452,94998,2019-12-11T16:00:08.000Z,False,try guys|mrbeast|mr beast|last to leave|last t...,https://i.ytimg.com/vi/h2NXZTrZEBk/default.jpg,"Last To Leave Handcuffs Wins $10,000 ft. MrBeast",19.12.12,h2NXZTrZEBk,1745133
8,23,UC4G10tk3AHFuyMIuD3rHOBA,RDCworld1,3439,False,7,Please everybody for the safety of everyone pu...,452,87589,2019-12-11T23:17:31.000Z,False,[none],https://i.ytimg.com/vi/tCf0HPvviaM/default.jpg,When you don’t put your phone on Airplane Mode,19.12.12,tCf0HPvviaM,612775
9,24,UCaj53l_4tT1m5DfsxLtcxpQ,VH1 Love & Hip Hop,2726,False,5,Fizz and Boog talk about their relationship wi...,293,6776,2019-12-11T14:00:12.000Z,False,Fizz and Boog|Omarion|Millennium tour|Apryl an...,https://i.ytimg.com/vi/fkQfM8fetN8/default.jpg,Fizz & Boog Speak on Relationships w/ Omarion ...,19.12.12,fkQfM8fetN8,539720


In [10]:
# add category_name label to dataframe
category_df = pd.read_csv('../US_Datasets/Categories.csv')

def map_category_id_to_name(category_id):
    return category_df[category_df['category_id'] == category_id]['category_name'].values[0]

videos_df['category_name'] = videos_df['category_id'].apply(lambda var: map_category_id_to_name(var))

In [11]:
videos_df.head()

Unnamed: 0,category_id,channel_id,channel_title,comment_count,comments_disabled,days_in_trending,description,dislikes,likes,published_at,ratings_disabled,tags,thumbnail_link,title,trending_date,video_id,view_count,category_name
0,24,UCJ0uqCI0Vqr2Rrt1HseGirg,The Late Late Show with James Corden,20068,False,1,Late Late Show guest host Harry Styles challen...,8256,571835,2019-12-11T07:08:34.000Z,False,The Late Late Show|Late Late Show|James Corden...,https://i.ytimg.com/vi/vPx6M7eTYbc/default.jpg,Spill Your Guts: Harry Styles & Kendall Jenner,19.12.12,vPx6M7eTYbc,11636632,Entertainment
1,26,UCDSJCBYqL7VQrlXfhr1RtwA,Les Do Makeup,13130,False,7,Hi babygirls! Thank you so much for watching ...,773,52780,2019-12-12T05:25:42.000Z,False,[none],https://i.ytimg.com/vi/sg8WaeeFyNY/default.jpg,WE GOT UNEXPECTED NEWS..,19.12.12,sg8WaeeFyNY,423215,How-to & Style
2,17,UCQIUhhcmXsu6cN6n3y9-Pww,Jesser,931,False,7,"Last to miss the basketball shot wins 10,000!●...",586,20178,2019-12-11T23:00:53.000Z,False,last to leave|nba basketball|nba basketball hi...,https://i.ytimg.com/vi/q1PR05q8l2g/default.jpg,"Last To Miss Layup Wins $10,000",19.12.12,q1PR05q8l2g,463685,Sport
3,17,UCWJ2lWNubArHWmf3FIHbfcQ,NBA,668,False,2,LAKERS at MAGIC | FULL GAME HIGHLIGHTS | Decem...,288,4605,2019-12-12T02:35:33.000Z,False,sp:ty=high|sp:dt=2019-12-12T00:00:00Z|sp:st=ba...,https://i.ytimg.com/vi/t6Z6RIXq0L0/default.jpg,LAKERS at MAGIC | FULL GAME HIGHLIGHTS | Decem...,19.12.12,t6Z6RIXq0L0,659579,Sport
4,24,UCtj45MepAoKxZoyR_Mnt86Q,Royal Family,1907,False,6,LAST VIDEO ; https://youtu.be/okujHUu_hmQ*MAKE...,164,18400,2019-12-12T02:38:37.000Z,False,queen Naija|Medicine|Queen|Spicy|Royalty Squad...,https://i.ytimg.com/vi/TGDpRB4ovvA/default.jpg,TIPPING DRIVE-THRU WORKERS $100 FOR THE HOLIDA...,19.12.12,TGDpRB4ovvA,175558,Entertainment


In [12]:
# convert trending_date and published_at features to datetime
videos_df['trending_date'] = pd.to_datetime(videos_df['trending_date'], format='%y.%d.%m')
videos_df['published_at'] = pd.to_datetime(videos_df['published_at'], format='%Y-%m-%dT%H:%M:%S.%fZ')
videos_df.head()

Unnamed: 0,category_id,channel_id,channel_title,comment_count,comments_disabled,days_in_trending,description,dislikes,likes,published_at,ratings_disabled,tags,thumbnail_link,title,trending_date,video_id,view_count,category_name
0,24,UCJ0uqCI0Vqr2Rrt1HseGirg,The Late Late Show with James Corden,20068,False,1,Late Late Show guest host Harry Styles challen...,8256,571835,2019-12-11 07:08:34,False,The Late Late Show|Late Late Show|James Corden...,https://i.ytimg.com/vi/vPx6M7eTYbc/default.jpg,Spill Your Guts: Harry Styles & Kendall Jenner,2019-12-12,vPx6M7eTYbc,11636632,Entertainment
1,26,UCDSJCBYqL7VQrlXfhr1RtwA,Les Do Makeup,13130,False,7,Hi babygirls! Thank you so much for watching ...,773,52780,2019-12-12 05:25:42,False,[none],https://i.ytimg.com/vi/sg8WaeeFyNY/default.jpg,WE GOT UNEXPECTED NEWS..,2019-12-12,sg8WaeeFyNY,423215,How-to & Style
2,17,UCQIUhhcmXsu6cN6n3y9-Pww,Jesser,931,False,7,"Last to miss the basketball shot wins 10,000!●...",586,20178,2019-12-11 23:00:53,False,last to leave|nba basketball|nba basketball hi...,https://i.ytimg.com/vi/q1PR05q8l2g/default.jpg,"Last To Miss Layup Wins $10,000",2019-12-12,q1PR05q8l2g,463685,Sport
3,17,UCWJ2lWNubArHWmf3FIHbfcQ,NBA,668,False,2,LAKERS at MAGIC | FULL GAME HIGHLIGHTS | Decem...,288,4605,2019-12-12 02:35:33,False,sp:ty=high|sp:dt=2019-12-12T00:00:00Z|sp:st=ba...,https://i.ytimg.com/vi/t6Z6RIXq0L0/default.jpg,LAKERS at MAGIC | FULL GAME HIGHLIGHTS | Decem...,2019-12-12,t6Z6RIXq0L0,659579,Sport
4,24,UCtj45MepAoKxZoyR_Mnt86Q,Royal Family,1907,False,6,LAST VIDEO ; https://youtu.be/okujHUu_hmQ*MAKE...,164,18400,2019-12-12 02:38:37,False,queen Naija|Medicine|Queen|Spicy|Royalty Squad...,https://i.ytimg.com/vi/TGDpRB4ovvA/default.jpg,TIPPING DRIVE-THRU WORKERS $100 FOR THE HOLIDA...,2019-12-12,TGDpRB4ovvA,175558,Entertainment


### Adding new column for number of days from video posting to trending

In [13]:
publish_to_trend = videos_df['trending_date'].dt.date - videos_df['published_at'].dt.date
videos_df['publish_to_trend'] = publish_to_trend.dt.days
videos_df.head()

Unnamed: 0,category_id,channel_id,channel_title,comment_count,comments_disabled,days_in_trending,description,dislikes,likes,published_at,ratings_disabled,tags,thumbnail_link,title,trending_date,video_id,view_count,category_name,publish_to_trend
0,24,UCJ0uqCI0Vqr2Rrt1HseGirg,The Late Late Show with James Corden,20068,False,1,Late Late Show guest host Harry Styles challen...,8256,571835,2019-12-11 07:08:34,False,The Late Late Show|Late Late Show|James Corden...,https://i.ytimg.com/vi/vPx6M7eTYbc/default.jpg,Spill Your Guts: Harry Styles & Kendall Jenner,2019-12-12,vPx6M7eTYbc,11636632,Entertainment,1
1,26,UCDSJCBYqL7VQrlXfhr1RtwA,Les Do Makeup,13130,False,7,Hi babygirls! Thank you so much for watching ...,773,52780,2019-12-12 05:25:42,False,[none],https://i.ytimg.com/vi/sg8WaeeFyNY/default.jpg,WE GOT UNEXPECTED NEWS..,2019-12-12,sg8WaeeFyNY,423215,How-to & Style,0
2,17,UCQIUhhcmXsu6cN6n3y9-Pww,Jesser,931,False,7,"Last to miss the basketball shot wins 10,000!●...",586,20178,2019-12-11 23:00:53,False,last to leave|nba basketball|nba basketball hi...,https://i.ytimg.com/vi/q1PR05q8l2g/default.jpg,"Last To Miss Layup Wins $10,000",2019-12-12,q1PR05q8l2g,463685,Sport,1
3,17,UCWJ2lWNubArHWmf3FIHbfcQ,NBA,668,False,2,LAKERS at MAGIC | FULL GAME HIGHLIGHTS | Decem...,288,4605,2019-12-12 02:35:33,False,sp:ty=high|sp:dt=2019-12-12T00:00:00Z|sp:st=ba...,https://i.ytimg.com/vi/t6Z6RIXq0L0/default.jpg,LAKERS at MAGIC | FULL GAME HIGHLIGHTS | Decem...,2019-12-12,t6Z6RIXq0L0,659579,Sport,0
4,24,UCtj45MepAoKxZoyR_Mnt86Q,Royal Family,1907,False,6,LAST VIDEO ; https://youtu.be/okujHUu_hmQ*MAKE...,164,18400,2019-12-12 02:38:37,False,queen Naija|Medicine|Queen|Spicy|Royalty Squad...,https://i.ytimg.com/vi/TGDpRB4ovvA/default.jpg,TIPPING DRIVE-THRU WORKERS $100 FOR THE HOLIDA...,2019-12-12,TGDpRB4ovvA,175558,Entertainment,0


In [14]:
# adding new column for publishing hour
videos_df['publishing_hour'] = pd.to_datetime(videos_df['published_at']).dt.hour
videos_df.head()

Unnamed: 0,category_id,channel_id,channel_title,comment_count,comments_disabled,days_in_trending,description,dislikes,likes,published_at,ratings_disabled,tags,thumbnail_link,title,trending_date,video_id,view_count,category_name,publish_to_trend,publishing_hour
0,24,UCJ0uqCI0Vqr2Rrt1HseGirg,The Late Late Show with James Corden,20068,False,1,Late Late Show guest host Harry Styles challen...,8256,571835,2019-12-11 07:08:34,False,The Late Late Show|Late Late Show|James Corden...,https://i.ytimg.com/vi/vPx6M7eTYbc/default.jpg,Spill Your Guts: Harry Styles & Kendall Jenner,2019-12-12,vPx6M7eTYbc,11636632,Entertainment,1,7
1,26,UCDSJCBYqL7VQrlXfhr1RtwA,Les Do Makeup,13130,False,7,Hi babygirls! Thank you so much for watching ...,773,52780,2019-12-12 05:25:42,False,[none],https://i.ytimg.com/vi/sg8WaeeFyNY/default.jpg,WE GOT UNEXPECTED NEWS..,2019-12-12,sg8WaeeFyNY,423215,How-to & Style,0,5
2,17,UCQIUhhcmXsu6cN6n3y9-Pww,Jesser,931,False,7,"Last to miss the basketball shot wins 10,000!●...",586,20178,2019-12-11 23:00:53,False,last to leave|nba basketball|nba basketball hi...,https://i.ytimg.com/vi/q1PR05q8l2g/default.jpg,"Last To Miss Layup Wins $10,000",2019-12-12,q1PR05q8l2g,463685,Sport,1,23
3,17,UCWJ2lWNubArHWmf3FIHbfcQ,NBA,668,False,2,LAKERS at MAGIC | FULL GAME HIGHLIGHTS | Decem...,288,4605,2019-12-12 02:35:33,False,sp:ty=high|sp:dt=2019-12-12T00:00:00Z|sp:st=ba...,https://i.ytimg.com/vi/t6Z6RIXq0L0/default.jpg,LAKERS at MAGIC | FULL GAME HIGHLIGHTS | Decem...,2019-12-12,t6Z6RIXq0L0,659579,Sport,0,2
4,24,UCtj45MepAoKxZoyR_Mnt86Q,Royal Family,1907,False,6,LAST VIDEO ; https://youtu.be/okujHUu_hmQ*MAKE...,164,18400,2019-12-12 02:38:37,False,queen Naija|Medicine|Queen|Spicy|Royalty Squad...,https://i.ytimg.com/vi/TGDpRB4ovvA/default.jpg,TIPPING DRIVE-THRU WORKERS $100 FOR THE HOLIDA...,2019-12-12,TGDpRB4ovvA,175558,Entertainment,0,2


In [15]:
#get name of day from published_at
def map_date_to_day_name(date):
    return date.strftime('%A')

videos_df['publishing_day'] = videos_df['published_at'].apply(lambda var: map_date_to_day_name(var))

# Comments sentiment analysis

## Collecting comments for each video

In [16]:
#kljuc je id video,  vrednost predstavlja niz komentara za taj video
print(datetime.now())
videos_comments=dict() #recnik gde ce kljuc biti video id a vrednost niz komentara

#inicijalizacija recnika sa id videoa i komentarima
for v in videos:
    videos_comments[v]=[]

dfs_list=list(dfs.values())

empty_comments=dict()
comments_id=[]
# u obzir dolaze videi koji su usli i izasli iz trendinga za vreme skupljanja podataka

j=0
for df in dfs_list:    #iteriramo kroz foldere u kojima se nalaze fajlovi s komentarima za video                        
    comments_date_str=df.iloc[0].trending_date  
    comments_dir_path='../US_Datasets/'+comments_date_str+'_US' #ovde formiramo putanju do foledera gde se nalazi fajlovi sa komentarima 
    
    j=j+1
    
    for file in os.listdir(comments_dir_path):  #iteriramo kroz sve fajlove unutar foldera
        comments=[]        #ovde cemo smestiti sve komentare nekog videa
        hasContent=True    #sluzi za proveru da li fajl ima sadrzaj.(fajlovi sa 1kb)
        
        
        #naziv fajla s komentarima je Comments_"video_Id".json
        videoId=file[9:]
        videoId=videoId[:-5]

        if videoId in videos:    #provera da li imamo video i data frame 
            
            with open(comments_dir_path+'/'+file, "r") as read_file:
                data = json.load(read_file)
                comments=data
                if len(comments) == 0:
                    hasContent=False
                    empty_comments[videoId]=1

            if hasContent:

                if videoId in empty_comments:
                    empty_comments.pop(videoId,None)


                for comment in comments: 
                    tempDate=''
                    tempDate=comment['snippet']['topLevelComment']['snippet']['publishedAt'].split('T')[0]
                    tempDate=tempDate.split('-')
                    commentDate=datetime(int(tempDate[0]), int(tempDate[1]), int(tempDate[2])).date()
                    videoTrendingDate=videos[comment['snippet']['videoId']].trending_date
                    videoTrendingDate=datetime.strptime(videoTrendingDate, '%y.%d.%m').date()

                    commentId=comment['id']

                    if commentDate<=videoTrendingDate and commentId not in comments_id:
                        videos_comments[videoId].append(comment['snippet']['topLevelComment']['snippet']['textOriginal'])
                        comments_id.append(commentId)
#                     if len(video_comments) > 0:
#                         videos_comments[videoId]=video_comments
print('================================================================================')
print(datetime.now())


2020-03-12 16:40:33.305241
2020-03-12 17:20:36.034960


In [17]:
#PROVERA KOLIKO OD KOLIKO VIDEA NEMAMO KOMENTARA
print('Imamo videa:')
print(len(videos))
print('Videos_comments:')
print(len(videos_comments))
print('Nemaju komentare sledeci id:')
i=0
for v in videos_comments:  
    if len(videos_comments[v])== 0:
        i=i+1
        print(v)
        
print('Ukupno nema komentara')
print(i)

Imamo videa:
2272
Videos_comments:
2272
Nemaju komentare sledeci id:
aS_dFYGKoKg
-pM-GaLh5v0
hPZDZsHShwg
7iPyz6Yqwl4
T1qjjr2V-7g
4i1SnSrOLek
iA9F3pfoVXY
b3vJRFYGCOw
E5orqSr_rok
BDjNG2LQRfM
jlOncGequ8o
bsoGh1czMGw
I9t6YW3o-xc
uPS1qFK6PAM
5SnKP_jjWKg
GtXO5CL-tc0
amdbtHvSeeA
n-CzwR9bDA4
ScehDdDzPyI
VPHv8plPnNk
qBNVadmFj-c
_6jcZH6uq1Q
6KdLcd-sUQA
wRJL46wWfUY
BgTsKiZODZU
BCf5gsl3C08
mB31x23bB5g
TzMyhoKQ294
OHQJeCeQnFM
6CqGuFVPH4I
pke0Zx4xocM
XquCCHJuZDE
rdoXYZBWzXk
CABVnXoe9gw
SRuQzlVjOSM
BCJ3N4l-vyw
lBZikjecU28
soRjcajliHE
sdMzNTlUVbI
6r8eahVpQgc
NkMTKGM-efw
kjtoZhVJBSE
6ka_rPvE2J0
fc2YDa7FOjw
rXKRd1cYPm4
WCnxaYOujIU
xFC7llFS5CQ
L3XnKr0lvDw
cx-5Dm9PGKw
M0eFWoU4CEA
4OnwUz-I2GI
nHttheXCeTw
pgguJ-XYkXY
6rQy5wgfHa0
jehluGQyFLM
_IlZu9X9W_k
CEAK9Ewo-8E
8849UFQu2RQ
N20bMsFqv0o
Oqd84F6R33g
9RJaf28koN8
CoDPTJ-3qCM
mMhCdIat_RY
4jmMWohs1XM
f9YX36nHKYo
0vclPca_UkU
D6bGVWV-Yqg
lIlMtVGI5Pg
v3KrapfMXqU
Qa6ShFwhNWU
sLuAYsPUKic
prPvU0MJk38
VKqC5w207O4
y-eIhBThlrY
RQwX_UkJbDY
14Qsc7WfqJI
RIuk23XHYj0
BLbPPhC

In [19]:
#KOLIKO VIDEA UOPSTE NEMA KOMENTARE fajlovi s komentarima su prazni
print('Nemaju komentara:')
print(len(empty_comments))
print('================================================')
for v in videos:
    if v not in empty_comments and v not in videos_comments or len(videos_comments[v])==0:
        print(v)

Nemaju komentara:
687
aS_dFYGKoKg
-pM-GaLh5v0
hPZDZsHShwg
7iPyz6Yqwl4
T1qjjr2V-7g
4i1SnSrOLek
iA9F3pfoVXY
b3vJRFYGCOw
E5orqSr_rok
BDjNG2LQRfM
jlOncGequ8o
bsoGh1czMGw
I9t6YW3o-xc
uPS1qFK6PAM
5SnKP_jjWKg
GtXO5CL-tc0
amdbtHvSeeA
n-CzwR9bDA4
ScehDdDzPyI
VPHv8plPnNk
qBNVadmFj-c
_6jcZH6uq1Q
6KdLcd-sUQA
wRJL46wWfUY
BgTsKiZODZU
BCf5gsl3C08
mB31x23bB5g
TzMyhoKQ294
OHQJeCeQnFM
6CqGuFVPH4I
pke0Zx4xocM
XquCCHJuZDE
rdoXYZBWzXk
CABVnXoe9gw
SRuQzlVjOSM
BCJ3N4l-vyw
lBZikjecU28
soRjcajliHE
sdMzNTlUVbI
6r8eahVpQgc
NkMTKGM-efw
kjtoZhVJBSE
6ka_rPvE2J0
fc2YDa7FOjw
rXKRd1cYPm4
WCnxaYOujIU
xFC7llFS5CQ
L3XnKr0lvDw
cx-5Dm9PGKw
M0eFWoU4CEA
4OnwUz-I2GI
nHttheXCeTw
pgguJ-XYkXY
6rQy5wgfHa0
jehluGQyFLM
_IlZu9X9W_k
CEAK9Ewo-8E
8849UFQu2RQ
N20bMsFqv0o
Oqd84F6R33g
9RJaf28koN8
CoDPTJ-3qCM
mMhCdIat_RY
4jmMWohs1XM
f9YX36nHKYo
0vclPca_UkU
D6bGVWV-Yqg
lIlMtVGI5Pg
v3KrapfMXqU
Qa6ShFwhNWU
sLuAYsPUKic
prPvU0MJk38
VKqC5w207O4
y-eIhBThlrY
RQwX_UkJbDY
14Qsc7WfqJI
RIuk23XHYj0
BLbPPhCVdUo
al5aNl0z2lg
SUrK9cm8v_8
3ZhYjN0jBhc
8yXklH

## Sentiment Analyzis

### TextBlob sentiment analysis  

In [20]:
# #nltk.download('punkt')
# # nltk.download('wordnet')

# stop_words = set(stopwords.words('english'))
# lemmatizer = WordNetLemmatizer() 

# comments_sentiments=dict()
# for videoId in videos_comments:
#     positive=0
#     negative=0
#     neutral=0
    
#     for comment in videos_comments[videoId]:
# #         print(comment)
#         comment = comment.lower() #to lowerCase sve reci u komentaru
#         comment = re.sub('[^A-Za-z]+', ' ', comment)  #izbacuje brojeve, znakove interpukcije i sve sto nisu slova i brojevi
#         word_tokens = word_tokenize(comment)    #tokenizacija reci
#         stopped = [w for w in word_tokens if not w in stop_words]  #izbacivanje stop reci npr.: a, the, there
       
#         comment = ' '.join(lemmatizer.lemmatize(stopp) for stopp in stopped)    # lematizacija reci
        
#         blob=TextBlob(comment)
#         result=blob.sentiment.polarity
        
#         if result > 0:
#             positive=positive+1
#         elif result < 0:
#             negative=negative+1
#         else:
#             neutral=neutral+1
# #         print(comment)
# #         print(result)
# #         print('========================================')
# #     print(len(videos_comments[videoId]))
# #     print(videoId)
#     countComments=len(videos_comments[videoId])
# #     posRel=positive/countComments
# #     negRel=negative/countComments
# #     neuRel=neutral/countComments
#     if countComments > 0:
#         posRel=positive/countComments
#         negRel=negative/countComments
#         neuRel=neutral/countComments
#     else:
#         posRel=None
#         negRel=None
#         neuRel=None
# #     print(str(posRel))
# #     print(str(negRel))
# #     print(str(neuRel))
#     comments_sentiments[videoId]=[posRel, negRel, neuRel, countComments]
# #     print(comments_sentiments)
# #     print('=======================================')
# #     break


### Vader sentiment analysis

In [31]:
def sentiment_analyzer_scores(sentence):
    return analyser.polarity_scores(sentence)
    #print("{:-<40} {}".format(sentence, str(score)))

analyser = SentimentIntensityAnalyzer()
comments_sentiments=dict()

for videoId in videos_comments:
    positive=0
    negative=0
    neutral=0
    
    for comment in videos_comments[videoId]:
        
        score=sentiment_analyzer_scores(comment)
        
        
        if score['compound'] > 0.05:
            positive=positive+1
        elif score['compound'] < -0.05:
            negative=negative+1
        else:
            neutral=neutral+1

    countComments=len(videos_comments[videoId])

    if countComments > 0:
        posRel=positive/countComments
        negRel=negative/countComments
        neuRel=neutral/countComments
    else:
        posRel=None
        negRel=None
        neuRel=None

    comments_sentiments[videoId]=[posRel, negRel, neuRel, countComments]

In [32]:
i=0
for vv in comments_sentiments.values():
    if vv[0]==None:
        i=i+1
print(i)

250


In [33]:
def map_sentiment_in_dataframe(video_id):
    return comments_sentiments[video_id]

func = lambda var: pd.Series(map_sentiment_in_dataframe(var))

videos_df[['positive_sentiment', 'negative_sentiment', 'neutral_sentiment', 'analyzed_comments']] = videos_df['video_id'].apply(func)

In [34]:
# videos_df.head(10)
set(videos_df['category_name'])

{'Autos & Vehicles',
 'Comedy',
 'Education',
 'Entertainment',
 'Film & Animation',
 'Gaming',
 'How-to & Style',
 'Music',
 'News & Politics',
 'People & Blogs',
 'Pets & Animals',
 'Science & Technology',
 'Sport',
 'Travel & Events'}

# Exporting dataframe to CSV, so that other files can load it

In [35]:
videos_df.to_pickle('US_trending.pkl')  # where to save it, usually as a .pkl

In [36]:
videos_df

Unnamed: 0,category_id,channel_id,channel_title,comment_count,comments_disabled,days_in_trending,description,dislikes,likes,published_at,...,video_id,view_count,category_name,publish_to_trend,publishing_hour,publishing_day,positive_sentiment,negative_sentiment,neutral_sentiment,analyzed_comments
0,24,UCJ0uqCI0Vqr2Rrt1HseGirg,The Late Late Show with James Corden,20068,False,1,Late Late Show guest host Harry Styles challen...,8256,571835,2019-12-11 07:08:34,...,vPx6M7eTYbc,11636632,Entertainment,1,7,Wednesday,0.468750,0.131250,0.400000,160.0
1,26,UCDSJCBYqL7VQrlXfhr1RtwA,Les Do Makeup,13130,False,7,Hi babygirls! Thank you so much for watching ...,773,52780,2019-12-12 05:25:42,...,sg8WaeeFyNY,423215,How-to & Style,0,5,Thursday,0.750000,0.075000,0.175000,40.0
2,17,UCQIUhhcmXsu6cN6n3y9-Pww,Jesser,931,False,7,"Last to miss the basketball shot wins 10,000!●...",586,20178,2019-12-11 23:00:53,...,q1PR05q8l2g,463685,Sport,1,23,Wednesday,0.318182,0.181818,0.500000,88.0
3,17,UCWJ2lWNubArHWmf3FIHbfcQ,NBA,668,False,2,LAKERS at MAGIC | FULL GAME HIGHLIGHTS | Decem...,288,4605,2019-12-12 02:35:33,...,t6Z6RIXq0L0,659579,Sport,0,2,Thursday,0.462069,0.193103,0.344828,145.0
4,24,UCtj45MepAoKxZoyR_Mnt86Q,Royal Family,1907,False,6,LAST VIDEO ; https://youtu.be/okujHUu_hmQ*MAKE...,164,18400,2019-12-12 02:38:37,...,TGDpRB4ovvA,175558,Entertainment,0,2,Thursday,0.631250,0.131250,0.237500,160.0
5,24,UCilwZiBBfI9X6yiZRzWty8Q,FaZe Rug,4510,False,4,THEY WERE SO HAPPY!I went up to random strange...,1085,66567,2019-12-11 22:00:56,...,aS_dFYGKoKg,927083,Entertainment,1,22,Wednesday,,,,0.0
6,24,UCG8rbF3g2AMX70yOd8vqIZg,Logan Paul,22355,False,7,Install Raid for Free ✅ IOS: http://bit.ly/Log...,33927,145538,2019-12-11 18:59:22,...,y7TlnQq6XzI,2231082,Entertainment,1,18,Wednesday,0.366667,0.266667,0.366667,60.0
7,23,UCpi8TJfiA4lKGkaXs__YdBA,The Try Guys,4686,False,3,Will #TheTryGuys survive a #MrBeast challenge?...,1452,94998,2019-12-11 16:00:08,...,h2NXZTrZEBk,1745133,Comedy,1,16,Wednesday,0.000000,0.000000,1.000000,1.0
8,23,UC4G10tk3AHFuyMIuD3rHOBA,RDCworld1,3439,False,7,Please everybody for the safety of everyone pu...,452,87589,2019-12-11 23:17:31,...,tCf0HPvviaM,612775,Comedy,1,23,Wednesday,0.500000,0.100000,0.400000,20.0
9,24,UCaj53l_4tT1m5DfsxLtcxpQ,VH1 Love & Hip Hop,2726,False,5,Fizz and Boog talk about their relationship wi...,293,6776,2019-12-11 14:00:12,...,fkQfM8fetN8,539720,Entertainment,1,14,Wednesday,0.250000,0.562500,0.187500,32.0
