# 1. Importing Libraries

In [26]:
from selenium import webdriver

import pandas as pd

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# 2. Setting YouTube URL Link

#### To find the needed URL, we have to first access YouTube and search. In this case, we searched 'Travel'. This will redirect us to a page full of results. On the top, there is a filter button, which is used to select only 'Videos'.

#### The webdriver.Chrome() command launches the selected web browser and waits for further instructions. In this case, our URL link.

In [27]:
# Start a Chrome session; every later page load and element lookup in this notebook goes through this handle.
driver = webdriver.Chrome()

#### The URL is passed to the driver using driver.get('insert_url_link_here'), which navigates the blank browser window to that URL. The YouTube link follows a fixed format: search_query='topic of interest' followed by sp='EgIQAQ%253D%253D'. sp refers to the filter type; in this case EgIQAQ%253D%253D means videos only.

In [28]:
# Load the search results for 'travel'; the sp value is the filter token this notebook uses for "videos only".
driver.get('https://www.youtube.com/results?search_query=travel&sp=EgIQAQ%253D%253D')

#### In order to fetch all the video links present and create a list to store these links, we first have to inspect the elements in the browser window. This can be done with the F12 key. Search for the anchor tag with id="video-title", then right-click on it -> Copy -> XPath. The XPath should look something like: //*[@id="video-title"]

In [None]:
# Collect every element whose id is "video-title" on the loaded results page.
# find_elements_by_xpath was removed in Selenium 4; the By-based locator API
# works in both Selenium 3 and Selenium 4.
video_data = driver.find_elements(By.XPATH, '//*[@id="video-title"]')

#### With the previous step, we have extracted all the HTML elements holding video information from the original page. Now we need to go through these elements and obtain each URL using the get_attribute('href') function.

In [None]:
# Read the href of every matched result element, in page order.
individual_video_links = [
    result_element.get_attribute('href') for result_element in video_data
]

In [None]:
# Notebook display: show the collected links as the cell output.
individual_video_links

In [None]:
# Read the title attribute of every matched result element, in page order.
individual_video_titles = [
    result_element.get_attribute('title') for result_element in video_data
]

In [None]:
# Notebook display: show the collected titles as the cell output.
individual_video_titles

# 2b. To get links for multiple topics

In [None]:
import time

In [None]:
# Containers filled in lockstep: one link and the search topic it came from per result
individual_video_links = []
individual_video_topics = []

# Interested topic list
topic_list = ['game coc', 'travel singapore', 'food singapore']

for each_topic in topic_list:

    # YouTube joins the words of a search term with '+'. A term without spaces is
    # left unchanged by the replace, so single- and multi-word topics share one path.
    final_edited_topic = each_topic.replace(' ', '+')

    # Load the video-only search results page for this topic
    driver.get('https://www.youtube.com/results?search_query={}&sp=EgIQAQ%253D%253D'.format(final_edited_topic))

    # Only the video title elements are extracted from the results page.
    # find_elements_by_xpath was removed in Selenium 4; the By-based locator
    # works in both Selenium 3 and Selenium 4.
    topic_video_data = driver.find_elements(By.XPATH, '//*[@id="video-title"]')

    # Each results page holds many videos: record every link together with its topic
    for each_video in topic_video_data:
        individual_video_links.append(each_video.get_attribute('href'))
        individual_video_topics.append(each_topic)

In [None]:
# Pair every scraped link ('first_column') with the search topic it came from ('second_column').
pre_merged_list = pd.DataFrame({'first_column':individual_video_links, 'second_column':individual_video_topics})

In [None]:
# Drop every row that contains a missing value (e.g. a link that came back as None).
pre_merged_list = pre_merged_list.dropna(axis=0)

#### There are scenarios where a 'None' result is obtained. This could probably be from video advertisers found on the YouTube search page itself. Nevertheless, this might be a trivial scenario and a simple list comprehension should filter out the 'None' values.

In [None]:
# Keep only the truthy entries, discarding None (and any other falsy value) from the link list.
compiled_video_links = list(filter(None, individual_video_links))

#### A second problem with scraping YouTube video data is that the YouTube search results page uses an infinite-scroll loading mechanism, unlike Amazon webpages, which require you to click on the next page button. Therefore, the Selenium code for such a mechanism is as follows:

# 3. Creating DataFrame to store Data

In [None]:
# Empty results table; the scraping loop appends one row per video to it.
result_columns = ['link', 'title', 'description', 'category']
df = pd.DataFrame(columns=result_columns)

# 4. Scraping Video Details from Youtube

In [None]:
# WebDriverWait polls the 'until' condition until it succeeds, ignoring
# NoSuchElementException by default and propagating any other exception immediately.
wait = WebDriverWait(driver, 15)  # gives up after 15 seconds without a match

# Literal prefix of a watch-page link; whatever follows it is stored as the video id
watch_url_prefix = 'https://www.youtube.com/watch?v='

# NOTE(review): the category lookup is positional, so it assumes row i of
# pre_merged_list corresponds to compiled_video_links[i] (both had missing
# links removed) — confirm.
for i, each_link in enumerate(compiled_video_links):

    driver.get(each_link)

    # str.strip(chars) removes any of the given *characters* from both ends, so it
    # also eats id characters such as 'a', 'w' or 't'. Slice off the exact prefix instead.
    if each_link.startswith(watch_url_prefix):
        video_id = each_link[len(watch_url_prefix):]
    else:
        video_id = each_link

    # Wait for the title and description elements to be present before reading their text
    video_title = wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'h1.title yt-formatted-string'))).text
    video_description = wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'div#description yt-formatted-string'))).text

    # Append one row: the 'link' column receives the video id, 'category' the search topic
    df.loc[len(df)] = [video_id, video_title, video_description, pre_merged_list.iloc[i, 1]]

# 5. Preparing Data for clean up using NLTK

#### It is recommended to store each column separately so that the clean-up can be performed quickly and easily. We begin this step by creating multiple empty dataframes with the appropriate column names. At this stage, the dataframes contain no rows.

In [None]:
# One empty single-column frame per field, so that each field can be cleaned on its own.
df_link, df_title, df_description, df_category = (
    pd.DataFrame(columns=[field_name])
    for field_name in ("link", "title", "description", "category")
)

#### Since these dataframes are still empty, we now need to reference our original dataframe to obtain the values that fill each of them.

In [None]:
# Copy each column of the scraped table into its matching single-column frame.
for target_frame, field_name in (
    (df_link, 'link'),
    (df_title, 'title'),
    (df_description, 'description'),
    (df_category, 'category'),
):
    target_frame[field_name] = df[field_name]

#### Importing in the necessary NLTK libraries

In [None]:
import re #regular expression
import nltk #NLP library
nltk.download('stopwords') #downloading dictionary of stopwords which are common words in a language that add no intrinsic value to the sentence meaning
from nltk.corpus import stopwords #loading the stopwords into python
from nltk.stem.porter import PorterStemmer

# 6. Cleaning up Data

#### PorterStemmer is the class used for stemming. Stemming is a kind of normalizing method: many variations of a word carry the same meaning, other than when tense is involved. The reason we stem is to shorten the lookup and to normalize sentences.

In [None]:
# Single stemmer instance, reused by the title and description cleaning loops.
ps = PorterStemmer()

#### To clean up the title segment of the scraped data.

In [None]:
title_corpus = []

# Build the stop-word set once up front instead of once per word inside the loop
english_stopwords = set(stopwords.words('english'))

for raw_title in df_title['title']:  # one pass per scraped title, in row order

    title = re.sub('[^a-zA-Z]', ' ', raw_title)  # replace every non-letter with a space
    title = title.lower()  # lowercase everything
    title = title.split()  # split into individual words

    # Stem every word that is not a stop word
    title = [ps.stem(word) for word in title if word not in english_stopwords]
    title = ' '.join(title)  # rejoin the stems with single spaces
    title_corpus.append(title)

#### To clean up the description segment of the scraped data.

In [None]:
description_corpus = []

# Build the stop-word set once up front instead of once per word inside the loop
english_stopwords = set(stopwords.words('english'))

for raw_description in df_description['description']:  # one pass per scraped description, in row order

    review = re.sub('[^a-zA-Z]', ' ', raw_description)  # replace every non-letter with a space
    review = review.lower()  # changing all letters to lowercase
    review = review.split()  # splitting into individual words

    # Words that are not part of the stop-word dictionary are stemmed
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    review = ' '.join(review)  # rejoined with a space
    description_corpus.append(review)  # and finally appended to the list

#### Converting it back into a dataframe.

In [None]:
# Wrap each cleaned corpus list in a single-column frame so it can be concatenated with the other columns.
dftitle = pd.DataFrame({'title': title_corpus})
dfdescription = pd.DataFrame({'description': description_corpus})

# 7. Label Encoding Categories

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Replace each distinct category label with an integer code; apply() runs the encoder on each column of df_category.
dfcategory = df_category.apply(LabelEncoder().fit_transform)

# 8. Final Dataset for ML

In [None]:
# Place the four single-column frames side by side. The join_axes argument of
# pd.concat was removed in pandas 1.0; reindexing the concatenated result on
# df_link.index is the documented replacement and keeps the same row set.
df_final = pd.concat([df_link, dftitle, dfdescription, dfcategory], axis=1).reindex(df_link.index)

#### Creating a bag-of-words for our model to understand the keywords to classify videos accordingly.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
cv = CountVectorizer(max_features = 1500)  # cap on the number of vocabulary features kept

# The second positional argument of fit_transform is the target y, which the
# vectorizer ignores, so passing description_corpus there silently dropped it.
# Merge title and description per video so both contribute to the features.
combined_corpus = [' '.join(text_pair) for text_pair in zip(title_corpus, description_corpus)]

X = cv.fit_transform(combined_corpus).toarray()  # dense document-term count matrix
y = df_final.iloc[:, 3].values  # fourth column of df_final: the encoded categories

#### Now that we have our X values and y values, we can begin building the ML model. We begin by splitting X and y into training and test sets.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 777)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest of 1000 trees using entropy as the split criterion.
# NOTE(review): no random_state is passed, so results presumably vary between runs — confirm.
classifier = RandomForestClassifier(n_estimators = 1000, criterion = 'entropy')

In [None]:
# Train the forest on the training split.
classifier.fit(X_train, y_train)

In [None]:
# Predict the encoded category of every held-out row.
y_pred = classifier.predict(X_test)

In [None]:
# Notebook display: score of the trained classifier on the held-out split.
classifier.score(X_test, y_test)

In [None]:
from sklearn.metrics import *

# Print the classification report comparing the held-out labels with the predictions.
print(classification_report(y_test, y_pred))