# DATA EXTRACTION CODE:
Logic for extracting various features is implemented entirely by us

# REFERENCES:
    All API calls are referred from this source: https://developers.google.com/youtube/v3/docs/videos
    Progress Bar code referenced from: 
        https://github.com/allenwang28/YouTube-Virality-Predictor/blob/master/scripts/get_last_video_count.ipynb
    Emoticon Removal:
        http://stackoverflow.com/a/13752628/6762004

In [11]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import requests
import pandas as pd
import numpy as np
from pathlib import Path
from textblob import TextBlob
import re

In [5]:
DEVELOPER_KEY = 'Your developers key'
YOUTUBE_API_SERVICE_NAME = 'youtube'
YOUTUBE_API_VERSION = 'v3'

# FOLLOWING CODE EXTRACTS VIDEOIDs AND STATISTICS FOR THE 'SEARCH TERM' and NUMBER OF PAGES GIVEN BY THE USER

In [None]:
#METHOD 1
#FUNCTION: Performs keyword search for all the channels matching the keyword
#Returns a list of channelIDs
def youtube_channel_search(options, num_of_pages):
    """Run a channel keyword search and collect the matching channel ids.

    Up to ``num_of_pages`` result pages are requested. The first request is
    sent without a page token; each later request uses the ``nextPageToken``
    of the previous response, and paging stops once no token is returned.

    Args:
        options: object exposing ``q`` (the search term) and ``max_results``
            (forwarded as ``maxResults``).
        num_of_pages: number of result pages to request; converted with
            ``int()``.

    Returns:
        List of channel id strings, in the order the API returned them.
    """
    youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                    developerKey=DEVELOPER_KEY)

    channelids = []
    next_page_token = None
    for page in range(int(num_of_pages)):
        request_args = {
            'q': options.q,
            'type': 'channel',
            'part': 'id,snippet',
            'relevanceLanguage': 'en',
            'maxResults': options.max_results,
        }
        if page > 0:
            if next_page_token is None:
                # No further pages: nothing more can be collected.
                break
            request_args['pageToken'] = next_page_token

        search_response = youtube.search().list(**request_args).execute()
        next_page_token = search_response.get('nextPageToken')
        channelids.extend(
            str(result['id']['channelId'])
            for result in search_response.get('items', [])
            if result['id']['kind'] == 'youtube#channel'
        )

    return channelids

In [None]:
#METHOD 2
#Function: Converts duration of videos to seconds
#PURPOSE: since the duration format provided by youtube is of type PT1H1M1S
def convert_to_seconds(duration):
    """Convert a YouTube (ISO 8601) duration into a number of seconds.

    Accepts the raw API form (``'PT1H1M1S'``, ``'P1DT2H'``, ``'P0D'``) as
    well as the form with the ``'PT'`` prefix already removed (``'1H1M1S'``),
    which is what existing callers pass in.

    Args:
        duration: duration string made of optional week (W), day (D),
            hour (H), minute (M) and second (S) components.

    Returns:
        Total duration in seconds as an int; an empty string yields 0.

    Raises:
        ValueError: if ``duration`` does not match the expected layout.
    """
    # Every component is optional, so 'P0D', '15S' and '' all match.
    # The optional 'P' / 'T' markers let prefixed and stripped inputs share
    # a single pattern.
    match = re.fullmatch(
        r'P?(?:(\d+)W)?(?:(\d+)D)?T?(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?',
        duration,
    )
    if match is None:
        raise ValueError('Unrecognised duration: %r' % (duration,))

    weeks, days, hours, minutes, seconds = (
        int(group) if group else 0 for group in match.groups()
    )
    total_hours = (weeks * 7 + days) * 24 + hours
    return total_hours * 3600 + minutes * 60 + seconds

In [None]:
#METHOD 3
#FUNCTION: Converts date time string to date
#PURPOSE: used by other methods for conversion to time
from datetime import datetime
from dateutil.parser import parse
def convert_to_date(dt):
    """Parse the date/time string ``dt`` and return it as ``MM/DD/YYYY`` text."""
    return parse(dt).strftime('%m/%d/%Y')

In [None]:
#METHOD 4
#FUNCTION: Converts date time string to weeks
#PURPOSE: for finding age of channel
from datetime import datetime
from dateutil.parser import parse
def convert_to_weeks(dt):
    """Return the number of whole weeks between ``dt`` and today.

    Args:
        dt: date/time string understood by ``dateutil.parser.parse``.

    Returns:
        Elapsed days divided by 7, truncated towards zero, as an int.
    """
    start_day = parse(dt).date()
    elapsed = datetime.now().date() - start_day
    return int(elapsed.days / 7)

In [None]:
#METHOD 5
#FUNCTION: to get playlistids
#PURPOSE: fetches the upload playlist of the channel and gets channel statistics
def youtube_channel_videos(channelids):
    """Look up channels and hand their upload playlists to the video fetcher.

    Args:
        channelids: value passed as the ``id`` filter of ``channels().list``
            (the caller in this file passes a comma-separated string of ids).

    Returns:
        Whatever ``youtube_playlist_videos`` returns for the collected
        upload playlists and their channel statistics.
    """
    youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                    developerKey=DEVELOPER_KEY)

    def stat_or_zero(stats, key):
        # A statistic that is absent (or None) is reported as 0.
        value = stats.get(key)
        return value if value is not None else 0

    response = youtube.channels().list(
        part="snippet,contentDetails,statistics",
        id=channelids,
    ).execute()

    playlist_ids = []
    channelstats = {}
    for channel in response.get('items', []):
        uploads_id = channel["contentDetails"]["relatedPlaylists"]["uploads"]
        stats = channel['statistics']
        # Per-playlist record: view count, subscriber count, video count,
        # channel age in weeks, uploads playlist id.
        channelstats[uploads_id] = [
            stat_or_zero(stats, 'viewCount'),
            stat_or_zero(stats, 'subscriberCount'),
            stat_or_zero(stats, 'videoCount'),
            convert_to_weeks(channel['snippet']['publishedAt']),
            uploads_id,
        ]
        if uploads_id is not None:
            playlist_ids.append(uploads_id)

    return youtube_playlist_videos(playlist_ids, channelstats)

In [None]:
#METHOD 6
#FUNCTION: gets videos in playlist
#PURPOSE: fetches list of videos in a given playlist
import progressbar
bar = progressbar.ProgressBar()
def youtube_playlist_videos(playlist_ids,channelstats):
    """Fetch video metadata for the videos listed in each playlist.

    For every playlist id, the first page of playlist items (at most 49
    entries) is requested and the video ids found there are passed to
    ``youtube_videos_metadata`` together with that playlist's channel stats.

    Args:
        playlist_ids: iterable of playlist ids to read.
        channelstats: mapping of playlist id -> channel statistics record,
            forwarded unchanged to ``youtube_videos_metadata``.

    Returns:
        DataFrame with one row per video; an empty frame carrying the
        expected columns when no videos were found.
    """
    COLUMN_NAMES=['VideoID','Title','Description','Thumbnail','PublishedDate','CategoryID','ChannelID','ChannelTitle',
                  'CHViewCount','CHSubscriberCount','CHAge','CHVideoCount','PlaylistID','Tags','Duration','Caption','ViewCount',
                  'LikeCount','DislikeCount','FavouriteCount','CommentCount']
    youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                    developerKey=DEVELOPER_KEY)
    frames = []
    for playlist_id in playlist_ids:
        playlistdetails = youtube.playlistItems().list(
                             part="snippet,contentDetails",
                             maxResults=49,
                             playlistId=playlist_id
                             ).execute()
        # Collect the video ids of this playlist page.
        video_ids = [
            item["contentDetails"]["videoId"]
            for item in playlistdetails.get('items', [])
            if item["contentDetails"].get("videoId") is not None
        ]
        if not video_ids:
            # Nothing to look up; avoid a metadata request with an empty id.
            continue
        frames.append(
            youtube_videos_metadata(','.join(map(str, video_ids)),
                                    channelstats[playlist_id])
        )

    if not frames:
        return pd.DataFrame(columns=COLUMN_NAMES)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(frames)

In [None]:
#METHOD 7
#FUNCTION: gets video stats
#PURPOSE: Used to get video stats from videoids and stores the data into dataframe
def _video_stat_as_int(stats, key):
    """Return ``stats[key]`` as an int, or 0 when it is missing or None."""
    value = stats.get(key)
    return int(value) if value is not None else 0


def youtube_videos_metadata(videoids,channelstats):
    """Fetch snippet, content details and statistics for the given videos.

    Args:
        videoids: value passed as the ``id`` filter of ``videos().list``
            (callers in this file pass a comma-separated string of ids).
        channelstats: channel record indexed as ``[0]`` view count,
            ``[1]`` subscriber count, ``[2]`` video count, ``[3]`` channel
            age in weeks, ``[4]`` uploads playlist id (the layout built by
            ``youtube_channel_videos``).

    Returns:
        DataFrame with one row per returned video, indexed 0..n-1. Video
        statistics are stored as ints (0 when the API omits a value).
    """
    youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                    developerKey=DEVELOPER_KEY)
    COLUMN_NAMES=['VideoID','Title','Description','Thumbnail','PublishedDate','CategoryID','ChannelID','ChannelTitle',
                  'CHViewCount','CHSubscriberCount','CHAge','CHVideoCount','PlaylistID','Tags','Duration','Caption','ViewCount',
                  'LikeCount','DislikeCount','FavouriteCount','CommentCount']

    video_response = youtube.videos().list(
                    part='snippet,contentDetails,statistics',
                    id=videoids
                    ).execute()

    rows = []
    for video_result in video_response.get('items', []):
        snippet = video_result['snippet']
        content = video_result['contentDetails']
        stats = video_result['statistics']
        tags = snippet.get('tags')
        rows.append([
            video_result['id'],
            snippet['title'],
            snippet['description'],
            snippet['thumbnails']['default']['url'],
            convert_to_date(str(snippet['publishedAt'])),
            snippet['categoryId'],
            snippet['channelId'],
            snippet['channelTitle'],
            channelstats[0],   # CHViewCount
            channelstats[1],   # CHSubscriberCount
            channelstats[3],   # CHAge: age in weeks sits at index 3
            channelstats[2],   # CHVideoCount: video count sits at index 2
            channelstats[4],   # PlaylistID
            ', '.join(map(str, tags)) if tags is not None else '',
            convert_to_seconds(str(content['duration']).replace('PT','')),
            content['caption'],
            # Counts are converted to ints so later arithmetic and
            # comparisons on these columns work on numbers.
            _video_stat_as_int(stats, 'viewCount'),
            _video_stat_as_int(stats, 'likeCount'),
            _video_stat_as_int(stats, 'dislikeCount'),
            _video_stat_as_int(stats, 'favoriteCount'),
            _video_stat_as_int(stats, 'commentCount'),
        ])
    return pd.DataFrame(rows, columns=COLUMN_NAMES)

In [None]:
#METHOD 8
#FUNCTION: Main method-Starting point
#PURPOSE: Used to get input search term from user and then gets video stats after which calls the method to get comment sentiment
#count
import argparse
search_term=input("Kindly enter search term :")
num_of_pages= input("Enter number of pages :")
vid_ids=''
COLUMN_NAMES=['VideoID','Title','Description','Thumbnail','PublishedDate','CategoryID','ChannelID','ChannelTitle',
                  'CHViewCount','CHSubscriberCount','CHAge','CHVideoCount','PlaylistID','Tags','Duration','Caption','ViewCount',
                  'LikeCount','DislikeCount','FavouriteCount','CommentCount']
dfmain1=pd.DataFrame(columns=COLUMN_NAMES)

if __name__ == '__main__':
    # argparse is only used to build an options object (q / max_results);
    # args=[] keeps it from reading the notebook's own command line.
    parser = argparse.ArgumentParser()
    parser.add_argument('--q', help='Search term', default=search_term)
    parser.add_argument('--max-results', help='Max results', default=40)
    args = parser.parse_args(args=[])
    channelids=youtube_channel_search(args, num_of_pages)

    # Channel ids are looked up in batches of 49 per request. Stepping
    # through the list by the batch size guarantees that every id is sent
    # exactly once and that no request is made with an empty id list.
    batch_size=49
    batch_frames=[]
    for batch_no, start in enumerate(range(0, len(channelids), batch_size)):
        print(batch_no)
        channel_ids=','.join(channelids[start:start + batch_size])
        batch_frames.append(youtube_channel_videos(channel_ids))

    if batch_frames:
        # DataFrame.append was removed in pandas 2.0; concatenate once.
        dfmain1=pd.concat(batch_frames)

    # The context manager saves and closes the workbook
    # (ExcelWriter.save() was removed in pandas 2.0).
    with pd.ExcelWriter('FinalVideoList.xlsx') as writer:
        dfmain1.to_excel(writer)

# CODE TO EXTRACT THE COMMENTS OF EACH VIDEO AND APPLY SENTIMENT ANALYSIS TO THEM.
THIS CODE MAINTAINS THE COUNT OF POSITIVE, NEGATIVE AND NEUTRAL COMMENTS FOR EACH VIDEO AND HENCE TAKES A LOT OF TIME TO PROCESS.

In [None]:
#METHOD 9
#FUNCTION: Cleans text of emoticons
def no_emoji(text):
    """Return ``text`` with every character from U+10000 to U+10FFFF removed.

    That range lies above the Basic Multilingual Plane and is where most
    emoji code points live; all other characters are left untouched.
    """
    return re.sub('[\U00010000-\U0010ffff]', '', text, flags=re.UNICODE)

In [None]:
#METHOD 10
#FUNCTION: gets sentiment of text
#PURPOSE: Used to get sentiment of comments of the video
def get_comment_sentiment(text_emoji):
    """Classify a comment as positive, neutral or negative.

    Args:
        text_emoji: raw comment text; emoji are stripped before analysis.

    Returns:
        ``"positive"``, ``"neutral"`` or ``"negative"`` according to the
        sign of the TextBlob polarity, or ``None`` when the cleaned text is
        blank, at most 3 characters long, or not detected as English.
    """
    cleaned = no_emoji(text_emoji)

    # Too little text to analyse.
    if cleaned is None or cleaned.strip() == '' or len(cleaned) <= 3:
        return None

    # NOTE(review): relies on the installed TextBlob providing
    # detect_language() - confirm for the version in use.
    if TextBlob(cleaned).detect_language() != 'en':
        return None

    polarity = TextBlob(cleaned).sentiment.polarity
    if polarity > 0:
        return "positive"
    if polarity == 0:
        return "neutral"
    return "negative"

In [None]:
#METHOD 11
#FUNCTION: gets video comments
#PURPOSE: Used to get video comment sentiment counts for the first 7 and 30 days after publication and merges them with the dataframe passed as argument
import progressbar
bar = progressbar.ProgressBar()
def youtube_videos_comments(df):
    """Count comment sentiments per video and merge the counts into ``df``.

    For each row of ``df`` the first page of top-level comment threads is
    requested. Every comment posted within 30 days of the video's published
    date is classified with ``get_comment_sentiment``; comments within 7
    days are counted in both the 7-day and the 30-day totals. Comments for
    which no sentiment is returned are ignored.

    Args:
        df: DataFrame with at least the columns ``VideoID`` and
            ``PublishedDate``.

    Returns:
        Inner merge of ``df`` with the per-video counts on ``VideoID``
        (columns ``Comm_count_p7``, ``Comm_count_ng7``, ``Comm_count_n7``,
        ``Comm_count_p30``, ``Comm_count_ng30``, ``Comm_count_n30``).
        Videos whose comments could not be fetched or parsed are left out.
    """
    youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                    developerKey=DEVELOPER_KEY)
    COLUMN_NAMES=['VideoID','Comm_count_p7','Comm_count_ng7','Comm_count_n7','Comm_count_p30','Comm_count_ng30','Comm_count_n30']
    rows = []
    for _, row in bar(df.iterrows()):
        counts_7 = {'positive': 0, 'negative': 0, 'neutral': 0}
        counts_30 = {'positive': 0, 'negative': 0, 'neutral': 0}
        try:
            video_day = datetime.strptime(
                convert_to_date(str(row["PublishedDate"])), '%m/%d/%Y').date()
            comments = youtube.commentThreads().list(
                         part="snippet",
                         videoId=row["VideoID"],
                         textFormat="plainText"
                         ).execute()
            for item in comments.get("items", []):
                top_comment = item["snippet"]["topLevelComment"]["snippet"]
                comment_day = datetime.strptime(
                    convert_to_date(top_comment["publishedAt"]), '%m/%d/%Y').date()
                days = (comment_day - video_day).days
                if days > 30:
                    continue
                sentiment = get_comment_sentiment(top_comment["textDisplay"])
                if sentiment is None:
                    continue
                counts_30[sentiment] += 1
                if days <= 7:
                    counts_7[sentiment] += 1
        except (HttpError, KeyError, ValueError) as err:
            # Best effort per video: a failed request or a malformed
            # response skips this video, but the reason is reported
            # instead of being silently discarded.
            print('Skipping comments for video %s: %s' % (row["VideoID"], err))
            continue

        # Use the id of the row itself: it is valid even when the video has
        # no comments at all.
        rows.append([row["VideoID"],
                     counts_7['positive'], counts_7['negative'], counts_7['neutral'],
                     counts_30['positive'], counts_30['negative'], counts_30['neutral']])

    dfvideocomments = pd.DataFrame(rows, columns=COLUMN_NAMES)
    dfresult = pd.merge(df, dfvideocomments, how='inner', on='VideoID')
    return dfresult

In [None]:
# Load the video list written earlier, attach the comment sentiment counts
# and store the result in a new workbook.
dfvideolst = pd.read_excel('FinalVideoList.xlsx')
dfvidcomms=youtube_videos_comments(dfvideolst)
# The context manager saves and closes the workbook
# (ExcelWriter.save() was removed in pandas 2.0).
with pd.ExcelWriter('FinalVideoListCommentInfo.xlsx') as writer1:
    dfvidcomms.to_excel(writer1)

# CODE TO EXTRACT DETAILS OF PREVIOUS VIDEO PER VIDEO

In [7]:
# Load the per-video data (including comment sentiment counts) for the
# previous-video feature extraction.
# NOTE(review): this reads 'FinalVideoListCommentInfo_1.xlsx', whereas the
# cell above writes 'FinalVideoListCommentInfo.xlsx' - confirm the intended file.
dfvidcomm = pd.read_excel('FinalVideoListCommentInfo_1.xlsx')

In [8]:
# Keep only the rows whose channel occurs more than once in the data, since
# a channel with a single video has no previous video to compare against.
rows_per_channel = dfvidcomm.groupby('ChannelID')['ChannelID'].transform(len)
dfChannel_new = dfvidcomm[rows_per_channel > 1]

In [9]:
# Distinct channel ids of the remaining rows (order is not significant).
channel_id_values = dfChannel_new['ChannelID'].tolist()
uniqueChannelID = list(set(channel_id_values))

In [12]:
# Drop videos without views so the ratio below never divides by zero.
# .copy() gives an independent frame, so adding a column does not write
# into a slice of the original data.
dfChannel_new = dfChannel_new[dfChannel_new['ViewCount'] > 0].copy()
# Likes per view as a percentage. The builtin float replaces np.float,
# which was only an alias for it and was removed in NumPy 1.24.
dfChannel_new["LikePerView"]=dfChannel_new['LikeCount'] / dfChannel_new['ViewCount'].astype(float)*100

In [None]:
#FUNCTION: For each channel, sorts its videos by published date (newest first) and attaches to every video
#the stats of the video published just before it; the oldest video of a channel gets 0 for those stats
import progressbar
import numpy as np
bar = progressbar.ProgressBar()

# Column of the current video -> column that receives the previous video's value.
PREVIOUS_VIDEO_COLUMNS = {
    'ViewCount': 'PView',
    'LikeCount': 'PLike',
    'CommentCount': 'PComments',
    'DislikeCount': 'PDislike',
    'FavouriteCount': 'PFavorite',
    'LikePerView': 'PLikePerView',
    'PublishedDate': 'PPublishedDate',
}

channel_frames = []
for channel in bar(uniqueChannelID):
    # Work on a copy so the per-channel edits never touch dfChannel_new.
    dftemp = dfChannel_new[dfChannel_new['ChannelID'] == channel].copy()
    if dftemp.empty:
        continue
    dftemp['PublishedDate'] = pd.to_datetime(dftemp.PublishedDate)
    dftemp = dftemp.sort_values(by='PublishedDate', ascending=False).reset_index(drop=True)
    # With newest-first ordering the previously published video is the next
    # row, so shifting up by one aligns each video with its predecessor.
    dfPtemp = dftemp[list(PREVIOUS_VIDEO_COLUMNS)].shift(-1).rename(columns=PREVIOUS_VIDEO_COLUMNS)
    channel_frames.append(pd.concat([dftemp, dfPtemp], axis=1))

df = pd.concat(channel_frames) if channel_frames else pd.DataFrame()
# Missing values (including the oldest video's previous-video stats) become 0.
df.fillna(0,inplace=True)
# The context manager saves and closes the workbook
# (ExcelWriter.save() was removed in pandas 2.0).
with pd.ExcelWriter('FinalVideoListCommentPrevInfo.xlsx') as writer1:
    df.to_excel(writer1)

# THE MAJOR CHALLENGE WHILE IMPLEMENTING THIS CODE WAS TO FETCH THE MAXIMUM AMOUNT OF VIDEO DETAILS IN ONE GO, SINCE VIDEOIDs ARE UPDATED BY YOUTUBE ON A DAILY BASIS. HENCE WE STARTED BY FETCHING
# CHANNELIDs ---> UPLOAD PLAYLISTIDs ---> VIDEOIDs ---> VIDEODETAILS ---> COMMENTS (SENTIMENT ANALYSIS) ---> PREVIOUS VIDEO DETAILS.

BY EXECUTING THE ABOVE THREE MODULES OF CODE WE GET ALL THE REQUIRED FEATURES AND CAN PROCEED TO EXPLORATORY DATA ANALYSIS.