In [1]:
import numpy as np
import pandas as pd
import time
from datetime import datetime, date
import glob
import sys
import os
import re

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

from func_timeout import func_timeout, FunctionTimedOut
import traceback

In [2]:
def create_df_scrape():
    """Return the empty DataFrame that receives one row per scraped video.

    Column meanings:
        key_search      -- search phrase the row was collected for
        rank            -- position of the video in the search results for key_search
        title           -- video title
        date_public     -- publication / premiere date
        description     -- video description
        tags            -- video tags
        views           -- number of views
        likes           -- number of likes
        dislikes        -- number of dislikes
        num_cmt         -- number of comments
        duration_s      -- video duration as reported by the page's player
        video_url       -- URL of the video
        channel_name    -- name of the channel
        subs            -- number of subscribers
        link_top_re     -- links of the top recommended videos
        tit_des_tags_re -- presumably title/description/tags gathered from the
                           recommended videos -- TODO confirm against the caller
    """
    column_names = [
        'key_search', 'rank', 'title', 'date_public',
        'description', 'tags', 'views', 'likes',
        'dislikes', 'num_cmt', 'duration_s', 'video_url',
        'channel_name', 'subs', 'link_top_re', 'tit_des_tags_re',
    ]
    return pd.DataFrame(columns=column_names)

def create_dic_find_key():
    """Return the lookup table used to locate each field in a video page.

    Most entries are lists consumed by BeautifulSoup ``find`` / ``find_all``:
    the first item is the tag name and the second the ``attrs`` filter.
    Some entries carry extra items:
        likes_dislikes -- third item is the attribute read from each match
        num_cmt        -- items three and four are a nested tag and its attrs
        channel_name   -- third item is a nested tag looked up inside the match
    ``duration_s`` is different: it is a JavaScript snippet handed to
    ``driver.execute_script``.
    """
    finders = {}
    # Title and publication date share the same tag/class pair.
    finders['title_date_public'] = [
        'yt-formatted-string',
        {'class': 'style-scope ytd-video-primary-info-renderer'},
    ]
    finders['description'] = [
        'yt-formatted-string',
        {'class': 'content style-scope ytd-video-secondary-info-renderer'},
    ]
    finders['tags'] = ['meta', {'name': 'keywords'}]
    finders['views'] = [
        'span',
        {'class': 'view-count style-scope ytd-video-view-count-renderer'},
    ]
    finders['likes_dislikes'] = [
        'yt-formatted-string',
        {'class': 'style-scope ytd-toggle-button-renderer style-text'},
        'aria-label',
    ]
    finders['num_cmt'] = [
        'h2',
        {'id': 'count', 'class': 'style-scope ytd-comments-header-renderer'},
        'span',
        {'class': 'style-scope yt-formatted-string'},
    ]
    # Not a BeautifulSoup selector: executed in the browser.
    finders['duration_s'] = "return document.getElementById('movie_player').getDuration()"
    finders['channel_name'] = [
        'div',
        {'class': 'style-scope ytd-video-owner-renderer', 'id': 'upload-info'},
        'yt-formatted-string',
    ]
    finders['subs'] = ['yt-formatted-string', {'id': 'owner-sub-count'}]
    finders['link_top_re'] = [
        'a',
        {'id': 'thumbnail', 'class': 'yt-simple-endpoint inline-block style-scope ytd-thumbnail'},
    ]
    return finders

def create_xpath():
    """Return a dictionary mapping each scraped field to its XPath.

    Callers iterate ``create_xpath().values()`` to wait for the page elements,
    so the insertion order below is kept as-is. ``likes`` and ``dislikes``
    intentionally map to the same expression.
    """
    field_xpaths = (
        ('title', '//*[@id="container"]/h1/yt-formatted-string'),
        ('date_public', '//*[@id="date"]/yt-formatted-string'),
        ('description', '//*[@id="description"]'),
        ('views', '//*[@id="count"]/ytd-video-view-count-renderer/span[1]'),
        ('likes', '//*[@id="text"]'),
        ('dislikes', '//*[@id="text"]'),
        ('num_cmt', '//*[@id="count"]/yt-formatted-string/span[1]'),
        ('channel_name', '//*[@id="text"]/a'),
        ('subs', '//*[@id="owner-sub-count"]'),
        ('link_top_re', '//*[@id="thumbnail"]'),
    )
    return dict(field_xpaths)

def create_others():
    """Return a dictionary with the remaining settings used by the scraper.

    Keys:
        search_link       -- prefix of the YouTube search URL; the encoded query is appended
        translate_symbols -- per-character replacements applied to the search phrase
                             before it is appended to ``search_link``
        PATH              -- location of the chromedriver executable
        youtube_url       -- site root that relative video links are appended to
        video10_xpath     -- XPath of the 10th video title on the search results page
        path_df_scrape / path_error_urls / path_log_tracking
                          -- base output folders, each ending with a backslash
                             because callers append file and folder names directly

    Backslashes in the Windows paths are written as ``\\\\`` escapes. The
    previous literals relied on unrecognised escape sequences such as ``\\P``
    or ``\\s``, which Python deprecates and warns about; the resulting string
    values are unchanged.

    NOTE(review): the paths are absolute and machine-specific; they must exist
    (or be adjusted) on the machine running the notebook.
    """
    return {
            'search_link':'https://www.youtube.com/results?search_query=',
            'translate_symbols':{'`':'%60','@':'%40','#':'%23','$':'%24',
                                 '%':'%25','^':'%5E','&':'%26','+':'%2B',
                                 '=':'%3D','|':'%7C','\\':'%5C','}':'%7D',
                                 ']':'%5D','{':'%7B','[':'%5B','\'':'%27',
                                 ':':'%3A',';':'%3B','?':'%3F','/':'%2F',
                                 ',':'%2C',' ':'+'},
            'PATH':'C:\\Program Files (x86)\\chromedriver.exe',
            'youtube_url':'https://www.youtube.com',
            'video10_xpath':'/html/body/ytd-app/div/ytd-page-manager/ytd-search/div[1]/ytd-two-column-search-results-renderer/div/ytd-section-list-renderer/div[2]/ytd-item-section-renderer/div[3]/ytd-video-renderer[10]/div[1]/div/div[1]/div/h3/a/yt-formatted-string',
            'path_df_scrape':'D:\\python_proj\\scrape_youtube\\scrape_top_videos\\df_scrape\\',
            'path_error_urls':'D:\\python_proj\\scrape_youtube\\scrape_top_videos\\error_urls\\',
            'path_log_tracking':'D:\\python_proj\\scrape_youtube\\scrape_top_videos\\log_tracking\\',
           }

def update_path(start_time, path):
    """Ensure the dated sub-folder for ``start_time`` exists and return its path.

    The folder layout is ``<path>/y_<YYYY>/m_<MM>/d_<DD>/``.

    Args:
        start_time: object with ``strftime`` (a ``datetime``) that selects the
            year, month and day folders.
        path: base folder under which the dated folders live.

    Returns:
        The full path of the day folder, ending with the platform path
        separator so callers can append a file name directly.

    The folders are created with ``os.makedirs(..., exist_ok=True)``, so the
    call is safe to repeat, does not race between an existence check and the
    creation, and also creates a missing base folder instead of failing.
    """
    day_folder = os.path.join(
        path,
        'y_' + start_time.strftime('%Y'),
        'm_' + start_time.strftime('%m'),
        'd_' + start_time.strftime('%d'),
    )
    os.makedirs(day_folder, exist_ok=True)
    # Callers concatenate file names onto the result, so keep the trailing separator.
    return day_folder + os.sep

def errors():
    """Return the table of console/log messages keyed by failure kind.

    The messages are emitted verbatim by the scraper, so their text
    (including spacing and spelling) is part of the program's output.
    """
    message_table = [
        ('fatalError', '!! Exceed the max_error !!\n\t=> Terminate!'),
        ('1stError', 'Error in creating DataFrame df_scrape or in assert'),
        ('main_driver_fail', '! Can not access to search youtube web !'),
        ('2ndError', 'Error in setting driver, or in BeautifulSoup driver lxml video_urls\n\t=> Try to redo'),
        ('congesting', 'May be congested network or driver\n\t=> Try to redo'),
        ('all_driver_fail', 'Can not access any link of the list video_urls'),
        ('driver_fail', '! Can not access to video, or soup !\n\t=> Switch to another url'),
        ('3rdError', 'Error in driver.get, or not being able to soup, or error in set driver, or in creating csv, txt\n\t=> Try to redo'),
        ('4thError', 'Error in  driver.get, or in concating to dataframe df_scrape, or not being ablle to soup for certain video\n\t=> Try to redo'),
        ('1stErrorRe', 'Fail for loop (re)'),
        ('2ndErrorRe', 'Error in setting driver, or in BeautifulSoup driver lxml video_urls (re)\n\t=> Try to redo'),
        ('driver_fail_re', '! Can not access to video, or soup, or record to dataframe (re) !\n\t=> Switch to another url'),
        ('data_loc', 'Fail to record data (re)'),
    ]
    return dict(message_table)

def wait_loading_web(wait, list_xpath):
    """Wait, one XPath at a time, until each element is present on the page.

    Args:
        wait: object exposing ``until(condition)`` (callers pass a
            ``WebDriverWait``).
        list_xpath: iterable of XPath strings, checked in iteration order.

    Returns:
        None. Any exception raised by ``wait.until`` propagates to the caller.
    """
    for xpath in list_xpath:
        locator = (By.XPATH, xpath)
        element_present = EC.presence_of_element_located(locator)
        wait.until(element_present)

# def sys_error():
#     return '\n'.join(['Traceback:'] + list(map(lambda x: '\t'+str(x).strip('\n'), list(sys.exc_info()))))+'\n'

def traceback_error():
    """Return the current exception's traceback framed by two '=' rules.

    The text comes from ``traceback.format_exc()`` with leading/trailing
    newlines stripped; each rule is 164 '=' characters and the result ends
    with a newline so it can be written straight to a log file.
    """
    rule = '=' * 164
    trace_text = traceback.format_exc().strip('\n')
    return '\n'.join((rule, trace_text, rule)) + '\n'

def force_kill_driver(driver, path_log_detail, start_time):
    """Forcefully shut the browser down, logging instead of raising on failure.

    Runs ``taskkill /f /im chrome.exe`` through the shell (this targets every
    process with that image name, not only the one owned by ``driver``) and
    then calls ``driver.quit()``.

    Args:
        driver: object exposing ``quit()``.
        path_log_detail: folder prefix (with trailing separator) of the log file.
        start_time: object with ``strftime``; it names the log file
            ``log_<HH>h_<MM>m_<SS>s.txt``.

    Returns:
        None. Exceptions from the kill/quit step are printed and appended to
        the log file rather than propagated.
    """
    try:
        os.system('taskkill /f /im chrome.exe')
        driver.quit()
    except Exception as failure:
        message = 'Can not kill or quit driver'
        print(message)
        print(failure)
        traceback.print_exc()
        log_name = 'log_' + start_time.strftime('%Hh_%Mm_%Ss') + '.txt'
        with open(path_log_detail + log_name, 'a+') as log_file:
            log_file.write(message + '\n')
            log_file.write(traceback_error())

def force_build_driver(path_log_detail, start_time):
    """Create a new Chrome WebDriver using the chromedriver path from ``create_others()``.

    Args:
        path_log_detail: folder prefix (with trailing separator) of the log file.
        start_time: object with ``strftime``; it names the log file
            ``log_<HH>h_<MM>m_<SS>s.txt``.

    Returns:
        The object returned by ``webdriver.Chrome``.

    Raises:
        Whatever ``webdriver.Chrome`` (or the configuration lookup) raised.
        The failure is printed and appended to the log file first, then
        re-raised. Previously the handler fell through to ``return driver``
        with ``driver`` never assigned, so callers saw an UnboundLocalError
        that hid the real cause.
    """
    try:
        return webdriver.Chrome(create_others()['PATH'])
    except Exception as err:
        posible_bugs = 'Can not build driver'
        print(posible_bugs)
        print(err)
        traceback.print_exc()
        with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
            f.write(posible_bugs+'\n')
            f.write(traceback_error())
        # Surface the original failure; there is no driver to hand back.
        raise

def scrape_top_videos(key_search='', top_videos = 10, limit_num_link_re = 10,
    repeat_driver_congestion = 3, max_error = 10, limited_time_load_web = 13):
    # Nhập từ khóa, function tự động điền lên youtube và thu thập thông tin top video
    # (đọc thông tin trong function create_df_scrape())
    try: # 1st error
        driver = ''
        num_error = 0
        fatal_error = 0
        start_time = datetime.now()
        path_df_detail = update_path(start_time, create_others()['path_df_scrape'])
        path_err_detail = update_path(start_time, create_others()['path_error_urls'])
        path_log_detail = update_path(start_time, create_others()['path_log_tracking'])
        assert (top_videos >= 1 and top_videos <= 10), 'Minimum 1 video, maximum 10 videos, or adjust video10_xpath and set Keys.PAGE_DOWN if want to get more videos urls'
        assert (limit_num_link_re >= 1 and limit_num_link_re <= 10), 'Minimum 1 recommended video, maximum 10 recommended videos, or adjust link_top_re and set Keys.PAGE_DOWN if want to get more recommended videos urls'
        assert (repeat_driver_congestion >= 1), 'Minimum 1 repreat, it would be counted as the first visit to the website'
        assert (max_error >= 1), 'Minimum 1 repreat, it would be assumed that no errors are allowed when running the program'
        assert (repeat_driver_congestion <= max_error), 'repeat_driver_congestion must be less than or equal to max_error'
        assert (limited_time_load_web >= 5), 'Minimum 5 seconds, it is the time allowed to wait for the web page to load'
        with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
            f.write('Start\n')
        df_scrape = create_df_scrape()
        driver = ''
        tit_des_tags_re = ''
        if key_search == '':
            key_search=input('Search: ')
        while key_search == '':
            key_search=input('Search: ')
        yt_word=''
        for word_char in key_search:
            if word_char in create_others()['translate_symbols'].keys():
                yt_word += create_others()['translate_symbols'][word_char]
            else:
                yt_word += word_char
        main_driver_success = 0
        main_driver_repeat = 0
        done = 0
        while not (main_driver_success == 1 or main_driver_repeat >= repeat_driver_congestion):
            try: # 2nd error
                with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                    f.write('while I\n')
                driver = force_build_driver(path_log_detail, start_time)
                func_timeout(limited_time_load_web,driver.get,args=(create_others()['search_link']+yt_word,))
                html = driver.find_element_by_tag_name('html')
                for i in range(1):
                    html.send_keys(Keys.PAGE_DOWN)
                wait = WebDriverWait(driver, limited_time_load_web)
                wait.until(EC.presence_of_element_located((By.XPATH, create_others()['video10_xpath'])))
                video_urls = list(map(lambda x: x.get('href'),
                                        list(BeautifulSoup(driver.page_source.encode('utf-8').strip(), 'lxml').\
                                            findAll('a', id='video-title'))))
                print(len(video_urls))
                if len(video_urls) >= top_videos:
                    print('\n\t=> Num. urls: Ok.')
                    video_urls = video_urls[:top_videos]
                else:
                    print('\t=> Num. urls: Not enough!')
                main_driver_success = 1
                num_error = 0
                id_video_url = 0
                while not (id_video_url >= len(video_urls) or done == 1 or num_error >= max_error):
                    with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                        f.write('while I.I\n')
                    driver_success = 0
                    driver_repeat = 0
                    while not (driver_success == 1 or driver_repeat >= repeat_driver_congestion or num_error >= max_error):
                        try: # 3rd error
                            func_timeout(limited_time_load_web,driver.get,args=(create_others()['youtube_url']+video_urls[id_video_url],))
                            html = driver.find_element_by_tag_name('html')
                            for i in range(1):
                                html.send_keys(Keys.PAGE_DOWN)
                            wait = WebDriverWait(driver, limited_time_load_web)
                            func_timeout(limited_time_load_web, wait_loading_web, args=(wait, create_xpath().values(),))
                            soup = BeautifulSoup(driver.page_source.encode('utf-8').strip(), 'lxml')
                            try:
                                duration_s = driver.execute_script(create_dic_find_key()['duration_s'])
                            except:
                                with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                    f.write('Fail to find duration\n')
                                    f.write(traceback_error())
                                duration_s = np.NaN
                                traceback.print_exc()
                            driver_success = 1
                            num_error = 0
                            soup_update = 1
                            id_video_url_new = 0
                            video_urls_new = video_urls[id_video_url+1:]
                            while not (id_video_url_new >= (len(video_urls_new)+1) or num_error >= max_error):
                                with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                    f.write('while I.I.I\n')
                                driver_2_success = 0
                                driver_2_repeat = 0
                                while not (driver_2_success == 1 or driver_2_repeat >= repeat_driver_congestion or num_error >= max_error):
                                    try: # 4th error
                                        if id_video_url_new < (len(video_urls_new)):
                                            func_timeout(limited_time_load_web,driver.get,args=(create_others()['youtube_url']+video_urls_new[id_video_url_new],))
                                            html = driver.find_element_by_tag_name('html')
                                            for i in range(1):
                                                html.send_keys(Keys.PAGE_DOWN)
                                        else:
                                            driver_2_success = 1
                                        if soup_update == 1:
                                            # -------------------------------------------------------------
                                            try:
                                                title, date_public = list(map(lambda x: x.text,
                                                                                list(soup.find_all(create_dic_find_key()['title_date_public'][0],
                                                                                                    attrs=create_dic_find_key()['title_date_public'][1]))))
                                            except:
                                                with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                                    f.write('Fail to find title and date_public\n')
                                                    f.write(traceback_error())
                                                title = np.NaN
                                                date_public = np.NaN
                                                traceback.print_exc()
                                            # -------------------------------------------------------------
                                            try:
                                                description = ' '.join(soup.find_all(create_dic_find_key()['description'][0],
                                                                                attrs=create_dic_find_key()['description'][1])\
                                                                        [0].find('span').text.replace('\n',' ').replace('\t',' ').split(' ')[:100])
                                            except:
                                                with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                                    f.write('Fail to find description\n')
                                                    f.write(traceback_error())
                                                description = np.NaN
                                                traceback.print_exc()
                                            # -------------------------------------------------------------
                                            try:
                                                tags = soup.find_all(create_dic_find_key()['tags'][0],attrs=create_dic_find_key()['tags'][1])[0]['content']
                                            except:
                                                with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                                    f.write('Fail to find tags\n')
                                                    f.write(traceback_error())
                                                tags = np.NaN
                                                traceback.print_exc()
                                            # -------------------------------------------------------------
                                            try:
                                                views = int(soup.find(create_dic_find_key()['views'][0], attrs=create_dic_find_key()['views'][1])\
                                                            .text.strip(' views').replace(',','').replace('No','0'))
                                            except:
                                                with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                                    f.write('Fail to find views\n')
                                                    f.write(traceback_error())
                                                views = np.NaN
                                                traceback.print_exc()
                                            # -------------------------------------------------------------
                                            try:
                                                likes, dislikes = list(map(lambda x: int(x[create_dic_find_key()['likes_dislikes'][2]].strip(' dislikes').replace(',','').replace('No','0')),
                                                                            list(soup.find_all(create_dic_find_key()['likes_dislikes'][0],
                                                                                                attrs=create_dic_find_key()['likes_dislikes'][1]))))
                                            except:
                                                with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                                    f.write('Fail to find likes and dislikes\n')
                                                    f.write(traceback_error())
                                                likes = np.NaN
                                                dislikes = np.NaN
                                                traceback.print_exc()
                                            # -------------------------------------------------------------
                                            try:
                                                num_cmt = int(soup.find(create_dic_find_key()['num_cmt'][0],attrs=create_dic_find_key()['num_cmt'][1]).\
                                                    find(create_dic_find_key()['num_cmt'][2],attrs=create_dic_find_key()['num_cmt'][3]).text.replace(',',''))
                                            except:
                                                with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                                    f.write('Fail to find number of comments\n')
                                                    f.write(traceback_error())
                                                channel_name = np.NaN
                                                traceback.print_exc()
                                            # -------------------------------------------------------------
                                            try:
                                                channel_name = soup.find(create_dic_find_key()['channel_name'][0],attrs=create_dic_find_key()['channel_name'][1]).\
                                                                find(create_dic_find_key()['channel_name'][2]).text
                                            except:
                                                with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                                    f.write('Fail to find channel_name\n')
                                                    f.write(traceback_error())
                                                channel_name = np.NaN
                                                traceback.print_exc()
                                            # -------------------------------------------------------------
                                            try:
                                                subs = soup.find(create_dic_find_key()['subs'][0],attrs=create_dic_find_key()['subs'][1]).text.strip(' subscribers')
                                                if subs[-1].isdigit():
                                                    subs = int(subs)
                                                else:
                                                    subs = int(float(subs[:-1])*{'K':1000, 'M':1000000, 'B':1000000000}[subs[-1]])
                                            except:
                                                with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                                    f.write('Fail to find number of subscriptions\n')
                                                    f.write(traceback_error())
                                                subs = np.NaN
                                                traceback.print_exc()
                                            # ------------------------------------------------------------
                                            try:
                                                link_top_re = []
                                                top = 0
                                                for i in soup.find_all(create_dic_find_key()['link_top_re'][0],attrs=create_dic_find_key()['link_top_re'][1]):
                                                    try:
                                                        if 'list' not in i['href']:
                                                            link_top_re += [i['href']]
                                                            top += 1
                                                            if top == limit_num_link_re:
                                                                break
                                                    except:
                                                        pass
                                                print('Len of link_top_re: '+str(len(link_top_re)))
                                                with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                                    f.write('Len of link_top_re: '+str(len(link_top_re))+'\n')
                                            except:
                                                with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                                    f.write('Fail to find link_top_re\n')
                                                    f.write(traceback_error())
                                                link_top_re = np.NaN
                                                traceback.print_exc()
                                            # ------------------------------------------------------------
                                            df_scrape = pd.concat([df_scrape,pd.DataFrame([[key_search, id_video_url+id_video_url_new+1, title, date_public, description, tags, views,
                                                likes, dislikes, num_cmt, duration_s, create_others()['youtube_url']+video_urls[id_video_url+id_video_url_new],
                                                channel_name, subs, link_top_re, tit_des_tags_re]],
                                                columns = df_scrape.columns)], axis=0, ignore_index=True)
                                            with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                                f.write('Recorded data\n')
                                            soup_update = 0
                                        # -------------------------------------------------------------
                                        if id_video_url_new < (len(video_urls_new)):
                                            wait = WebDriverWait(driver, limited_time_load_web)
                                            func_timeout(limited_time_load_web, wait_loading_web, args=(wait, create_xpath().values(),))
                                            try:
                                                duration_s = driver.execute_script(create_dic_find_key()['duration_s'])
                                            except:
                                                with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                                    f.write('Fail to find duration\n')
                                                    f.write(traceback_error())
                                                duration_s = np.NaN
                                                traceback.print_exc()
                                            soup = BeautifulSoup(driver.page_source.encode('utf-8').strip(), 'lxml')
                                            driver_2_success = 1
                                            num_error = 0
                                            soup_update = 1
                                        # -------------------------------------------------------------
                                    except FunctionTimedOut as err:
                                        print(errors()['congesting'])
                                        print(err)
                                        traceback.print_exc()
                                        with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                            f.write(errors()['congesting']+'\n')
                                            f.write(traceback_error())
                                        driver_2_success = 0
                                        driver_2_repeat += 1
                                        num_error += 1
                                        force_kill_driver(driver, path_log_detail, start_time)
                                        driver = force_build_driver(path_log_detail, start_time)
                                    except Exception as err: # 4th error
                                        print(errors()['4thError'])
                                        print(err)
                                        traceback.print_exc()
                                        with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                            f.write(errors()['4thError']+'\n')
                                            f.write(traceback_error())
                                        driver_2_success = 0
                                        driver_2_repeat += 1
                                        num_error += 1
                                        force_kill_driver(driver, path_log_detail, start_time)
                                        driver = force_build_driver(path_log_detail, start_time)
                                if num_error < max_error and driver_2_success == 0:
                                    print(errors()['driver_fail'])
                                    with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                        f.write(errors()['driver_fail']+'\n')
                                    with open(path_err_detail+'err_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                        f.write(video_urls_new[id_video_url_new]+'\n')
                                id_video_url_new += 1
                            # =============================================================================
                            try:
                                ind_ = 0
                                num_rows = df_scrape.shape[0]
                                while not (ind_ >= num_rows or num_error >= max_error):
                                    link_top_re_data = df_scrape.loc[ind_, 'link_top_re']
                                    tit_des_tags_re = ''
                                    id_link_re = 0
                                    if isinstance(link_top_re_data, list):
                                        while not (id_link_re >= len(link_top_re_data) or num_error >= max_error):
                                            with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                                f.write('while I.I.I.I\n')
                                            driver_success_re = 0
                                            driver_repeat_re = 0
                                            while not (driver_success_re == 1 or driver_repeat_re >= repeat_driver_congestion or num_error >= max_error):
                                                try:
                                                    func_timeout(limited_time_load_web,driver.get,args=(create_others()['youtube_url']+link_top_re_data[id_link_re],))
                                                    html = driver.find_element_by_tag_name('html')
                                                    for i in range(1):
                                                        html.send_keys(Keys.PAGE_DOWN)
                                                    list_wait_element_re = [create_xpath()['title'],create_xpath()['description']]
                                                    wait = WebDriverWait(driver, limited_time_load_web)
                                                    func_timeout(limited_time_load_web, wait_loading_web, args=(wait, list_wait_element_re,))
                                                    soup = BeautifulSoup(driver.page_source.encode('utf-8').strip(), 'lxml')
                                                    driver_success_re = 1
                                                    # -------------------------------------------------------------
                                                    try:
                                                        title, date_public = list(map(lambda x: x.text,
                                                                                        list(soup.find_all(create_dic_find_key()['title_date_public'][0],
                                                                                                            attrs=create_dic_find_key()['title_date_public'][1]))))
                                                    except:
                                                        with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                                            f.write('\tFail to find title and date_public (re)\n')
                                                            f.write(traceback_error())
                                                        title = np.NaN
                                                        date_public = np.NaN
                                                        traceback.print_exc()
                                                    # -------------------------------------------------------------
                                                    try:
                                                        description = ' '.join(soup.find_all(create_dic_find_key()['description'][0],
                                                                                        attrs=create_dic_find_key()['description'][1])\
                                                                                [0].find('span').text.replace('\n',' ').replace('\t',' ').split(' ')[:100])
                                                    except:
                                                        with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                                            f.write('\tFail to find description (re)\n')
                                                            f.write(traceback_error())
                                                        description = np.NaN
                                                        traceback.print_exc()
                                                    # -------------------------------------------------------------
                                                    try:
                                                        tags = soup.find_all(create_dic_find_key()['tags'][0],attrs=create_dic_find_key()['tags'][1])[0]['content']
                                                    except:
                                                        with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                                            f.write('\tFail to find tags (re)\n')
                                                            f.write(traceback_error())
                                                        tags = np.NaN
                                                        traceback.print_exc()
                                                    # -------------------------------------------------------------
                                                    tit_des_tags_re += '{}\n{}\n{}\n\n'.format(title, description, tags)
                                                    num_error = 0
                                                    id_link_re += 1
                                                except FunctionTimedOut as err:
                                                    print(errors()['congesting'])
                                                    print(err)
                                                    traceback.print_exc()
                                                    with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                                        f.write('\t'+errors()['congesting']+'(re)'+'\n')
                                                        f.write(traceback_error())
                                                    driver_success_re = 0
                                                    driver_repeat_re += 1
                                                    num_error += 1
                                                    force_kill_driver(driver, path_log_detail, start_time)
                                                    driver = force_build_driver(path_log_detail, start_time)
                                                except Exception as err:
                                                    print(errors()['2ndErrorRe'])
                                                    print(err)
                                                    traceback.print_exc()
                                                    with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                                        f.write('\t'+errors()['2ndErrorRe']+'\n')
                                                        f.write(traceback_error())
                                                    driver_success_re = 0
                                                    driver_repeat_re += 1
                                                    num_error += 1
                                                    force_kill_driver(driver, path_log_detail, start_time)
                                                    driver = force_build_driver(path_log_detail, start_time)
                                            if num_error < max_error and driver_success_re == 0:
                                                print(errors()['driver_fail_re'])
                                                with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                                    f.write('\t'+errors()['driver_fail_re']+'\n')
                                                with open(path_err_detail+'err_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                                    f.write('\t'+link_top_re_data[id_link_re]+'\n')
                                                id_link_re += 1
                                        try:
                                            df_scrape.loc[ind_,'tit_des_tags_re'] = tit_des_tags_re
                                            print('Loc successful (re)')
                                            with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                                f.write('\t'+'Loc successful (re)'+'\n')
                                        except:
                                            print(errors()['data_loc'])
                                            with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                                f.write('\t'+errors()['data_loc']+'\n')
                                    ind_ += 1
                            except Exception as err:
                                print(errors()['1stErrorRe'])
                                print(err)
                                traceback.print_exc()
                                with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                    f.write(errors()['1stErrorRe']+'\n')
                                    f.write(traceback_error())
                            # =============================================================================
                            if df_scrape.notnull().any().any():
                                df_scrape.set_index(df_scrape.columns[0]).to_csv(path_df_detail+'df_'+start_time.strftime('%Hh_%Mm_%Ss')+'.csv')
                                success_mess = 'Successfully save data to file csv.'
                                print(success_mess)
                                with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                    f.write(success_mess + '\n')
                            else:
                                fail_mess = 'Fail to scrape data, data is empty.'
                                print(fail_mess)
                                with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                    f.write(fail_mess + '\n')
                            done = 1
                        except FunctionTimedOut as err:
                            print(errors()['congesting'])
                            print(err)
                            traceback.print_exc()
                            with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                f.write(errors()['congesting']+'\n')
                                f.write(traceback_error())
                            driver_success = 0
                            driver_repeat += 1
                            num_error += 1
                            force_kill_driver(driver, path_log_detail, start_time)
                            driver = force_build_driver(path_log_detail, start_time)
                        except Exception as err: # 3rd error
                            print(errors()['3rdError'])
                            print(err)
                            traceback.print_exc()
                            with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                                f.write(errors()['3rdError']+'\n')
                                f.write(traceback_error())
                            driver_success = 0
                            driver_repeat += 1
                            num_error += 1
                            force_kill_driver(driver, path_log_detail, start_time)
                            driver = force_build_driver(path_log_detail, start_time)
                    if num_error < max_error and driver_success == 0:
                        print(errors()['driver_fail'])
                        with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                            f.write(errors()['driver_fail']+'\n')
                        with open(path_err_detail+'err_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                            f.write(video_urls[id_video_url]+'\n')
                    id_video_url += 1
                if num_error < max_error and driver_success == 0:
                    print(errors()['all_driver_fail'])
                    with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                        f.write(errors()['all_driver_fail']+'\n')
            except FunctionTimedOut as err:
                print(errors()['congesting'])
                print(err)
                traceback.print_exc()
                with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                    f.write(errors()['congesting']+'\n')
                    f.write(traceback_error())
                main_driver_success = 0
                main_driver_repeat += 1
                num_error += 1
                force_kill_driver(driver, path_log_detail, start_time)
            except Exception as err: #2nd error
                print(errors()['2ndError'])
                print(err)
                traceback.print_exc()
                with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                    f.write(errors()['2ndError']+'\n')
                    f.write(traceback_error())
                main_driver_success = 0
                main_driver_repeat += 1
                num_error += 1
                force_kill_driver(driver, path_log_detail, start_time)
        if main_driver_success == 0:
            print(errors()['main_driver_fail'])
            with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                f.write(errors()['main_driver_fail']+'\n')
    except Exception as err: # 1st error
        print(errors()['1stError'])
        print(err)
        traceback.print_exc()
        with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
            f.write(errors()['1stError']+'\n')
            f.write(traceback_error())
    force_kill_driver(driver, path_log_detail, start_time)
    try:
        if num_error >= max_error:
            fatal_error = 1
            print(errors()['fatalError'])
            with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
                f.write(errors()['fatalError']+'\n')
        with open(path_log_detail+'log_'+start_time.strftime('%Hh_%Mm_%Ss')+'.txt','a+') as f:
            f.write('Total execution time:\n\t'+str(datetime.now() - start_time)+'\nFinish.\n')
        print('Finish.')
    except Exception as err:
        print('!! Can not run the end part !!\n\tTerminate!')
        print(err)
        traceback.print_exc()
    return fatal_error

In [None]:
# Launch the scraper and keep its return value; the function sets that flag to 1
# once its internal error counter reaches `max_error`.
fatal_error = scrape_top_videos(
    key_search='',                 # NOTE(review): empty search key -- presumably a placeholder to fill in; confirm
    top_videos=10,                 # presumably the number of top search results to scrape -- TODO confirm
    limit_num_link_re=10,          # presumably a cap on recommended-video links followed per video -- TODO confirm
    repeat_driver_congestion=3,    # retry bound for reloading a page after a driver failure
    max_error=10,                  # error-counter threshold at which the run is reported as fatal
    limited_time_load_web=13,      # timeout handed to func_timeout and WebDriverWait for page loads
)