# YouTube Comment Analysis Project Part 1: Web Scraping

## Let's first import the relevant libraries

In [1]:
from selenium import webdriver
import time
import pandas as pd

### Next we grab the URL of the YouTube channel we want to scrape info from, create the driver object to navigate the site, and have the driver open that URL

In [2]:
#!!!!!!!!Put the name of the channel you want to scrape between "c/" and "/videos" in the url below
#!!!!!!!!A chromedriver executable must be installed at the path given here
chromedriver_path = 'C:/bin/chromedriver.exe'
url = 'https://www.youtube.com/c//videos'

#Start a Chrome session through the chromedriver and open the channel's videos page
driver = webdriver.Chrome(chromedriver_path)
driver.get(url)

### We then create a function that will have the driver scroll to the bottom of whatever page is currently open

In [3]:
def scroll_bottom():
    """Scroll the page open in the global ``driver`` until it stops growing.

    Repeatedly jumps to the bottom of the document, waits three seconds,
    and compares the document's scrollHeight with the previous value.
    Returns nothing once two consecutive readings are equal.
    """
    #One script both scrolls to the bottom and reports the resulting scrollHeight
    scroll_script = "window.scrollTo(0, document.documentElement.scrollHeight);return document.documentElement.scrollHeight;"

    current_height = driver.execute_script(scroll_script)
    while True:
        previous_height = current_height
        time.sleep(3)  #Pause before scrolling again and re-reading the height
        current_height = driver.execute_script(scroll_script)

        #An unchanged height means the page can't scroll any further
        if current_height == previous_height:
            break

### This block of code will auto-scroll to the bottom of the videos page of a YouTube channel and then scrape each video's most basic data, which includes its title, link, and number of views

In [4]:
#Lists that collect the basic info of every video on the channel.
#They are filled in lockstep, so index i of each list refers to the same video.
title = []
link = []
views = []

#Scroll the videos page all the way down before collecting the video elements
scroll_bottom()

#Each element in videos is one video entry holding the title, link, and views
videos = driver.find_elements_by_class_name('style-scope ytd-grid-video-renderer')

#!!!!!!!!We're only scraping max_videos of the most recent videos
#!!!!!!!!You can replace 155 with any number of videos you want to scrape
max_videos = 155
del videos[max_videos:]

#Loop through each video element, retrieve its basic info, and append it to title, link, and views
for video in videos:
    #Look the title element up once: it carries both the visible title text and the href
    title_element = video.find_element_by_xpath('.//*[@id="video-title"]')
    v_title = title_element.text
    v_link = title_element.get_attribute('href')
    v_views = video.find_element_by_xpath('.//*[@id="metadata-line"]/span[1]').text

    #Append only after every lookup succeeded so the three lists stay aligned
    title.append(v_title)
    link.append(v_link)
    views.append(v_views)


### This block of code will loop through each video page and each comment to retrieve the post date, likes, dislikes, comment author, comment text, and comment likes.
### It will then create dictionary items that hold all of the info and append them to a final list

In [5]:
#Create a list to hold each individual comment along with all relevant video data.
#Each entry is a dictionary holding title, link, views, date, likes, dislikes, author, comment text, and comment likes.
comment_list = []

#Walk the three parallel lists together so each video's title, link, and views stay paired
for video_title, video_link, video_views in zip(title, link, views):

    #Create a new driver session each loop to avoid failures to load url that often occur during continuous sessions
    driver.quit()
    driver = webdriver.Chrome('C:/bin/chromedriver.exe')
    driver.get(video_link)

    time.sleep(3) #Give link time to load

    ###In case you need to play/pause video at start
    #button = driver.find_element_by_xpath('.//*[@id="movie_player"]')
    #button.click()

    scroll_bottom() #Scroll to bottom to make sure every comment is loaded in

    #video_info is the element that holds the video's post date and like/dislike counts
    video_info = driver.find_element_by_class_name('style-scope ytd-video-primary-info-renderer')

    #Retrieve the video's post date
    video_date = video_info.find_element_by_xpath('.//*[@id="date"]/yt-formatted-string').text

    #Query the "text" elements once; the first two are read as the like and dislike counts
    count_elements = video_info.find_elements_by_xpath('.//*[@id="text"]')
    video_likes = count_elements[0].text
    video_dislikes = count_elements[1].text

    #comments holds every comment element found on the page
    comments = driver.find_elements_by_class_name('style-scope ytd-comment-renderer')

    #Loop through each comment element and extract the author, comment text, and comment likes
    for comment in comments:
        v_author = comment.find_element_by_xpath('.//*[@id="author-text"]/span').text
        v_comment = comment.find_element_by_xpath('.//*[@id="content-text"]').text
        v_likes = comment.find_element_by_xpath('.//*[@id="vote-count-middle"]').text

        #One row per comment: the video-level fields are repeated alongside the comment-level fields
        comment_item = {
            'title': video_title,
            'link': video_link,
            'views': video_views,
            'date': video_date,
            'likes': video_likes,
            'dislikes': video_dislikes,
            'author': v_author,
            'comment_text': v_comment,
            'comment_likes': v_likes,
        }
        comment_list.append(comment_item)

#NOTE(review): the browser session opened for the last video is still open here; call driver.quit() once you are done with it

### Then we convert the final comment list into a pandas Data Frame

In [1]:
#Name the columns explicitly (same order as the keys of each comment dictionary)
#so the frame, and the files exported from it, keep their headers even when no comments were scraped
comment_columns = [
    'title', 'link', 'views', 'date', 'likes', 'dislikes',
    'author', 'comment_text', 'comment_likes',
]
comment_df = pd.DataFrame(comment_list, columns=comment_columns)

#Bare expression so the notebook displays the frame
comment_df

NameError: name 'pd' is not defined

### Finally, let's export this info for later use

In [7]:
#Both exports drop the row index and keep the column headers
export_options = {"index": False, "header": True}

#Write the same frame once as CSV and once as an Excel workbook
comment_df.to_csv("comment_info.csv", **export_options)
comment_df.to_excel("comment_info.xlsx", **export_options)