In [1]:
import pandas as pd
import numpy as np
import re
import time
import os
from time import sleep
import threading
from queue import Queue

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException,ElementNotInteractableException
import warnings
warnings.filterwarnings('ignore')

In [2]:
driver = webdriver.Chrome()
driver.get('https://shopeefood.vn/ho-chi-minh/food/deals')
driver.maximize_window()
wait = WebDriverWait(driver,10)

# I. Crawling url from ShopeeFood

In [None]:
def CrawlFoodData():
    all_web_item = driver.find_elements(By.CSS_SELECTOR,".item-restaurant .item-content")
    result = pd.DataFrame(columns=['name_res', 'img_link', 'food_link','favorite_tag', 'is_quality_mer','address','promotion'])
    for i in all_web_item:
        try:
            ##
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.info-restaurant .name-res')))
            name_res = i.find_element(By.CSS_SELECTOR, '.info-restaurant .name-res').text
            ##
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.img-restaurant img')))
            link_img = i.find_element(By.CSS_SELECTOR, '.img-restaurant img').get_attribute("src")
            ##
            food_link = i.get_attribute("href")
            ##
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.img-restaurant')))
            favorite_tag = i.find_element(By.CSS_SELECTOR, '.img-restaurant').text
            ##
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.info-restaurant .icon.icon-quality-merchant')))
            try:
                is_quality_mer = i.find_element(By.CSS_SELECTOR, '.info-restaurant .icon.icon-quality-merchant').get_attribute('title')
            except NoSuchElementException:
                is_quality_mer = np.nan
            ##
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.info-restaurant .address-res')))
            address_res = i.find_element(By.CSS_SELECTOR, '.info-restaurant .address-res').text

            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.info-restaurant .content-promotion')))
            promotion = i.find_element(By.CSS_SELECTOR, '.info-restaurant .content-promotion').text
        except:
            print("Lỗi ở item: ", all_web_item.index(i+1))

        record = pd.DataFrame([[name_res, link_img, food_link, favorite_tag, is_quality_mer, address_res, promotion]]
                              ,columns=['name_res', 'img_link', 'food_link','favorite_tag', 'is_quality_mer','address','promotion'])
        result = pd.concat([result, record])
    return result

In [None]:
# Hàm lấy số kết quả hiện tại
def MenuCounts():
    no_of_result = driver.find_element(By.CSS_SELECTOR,".result")
    no_of_result = int(no_of_result.text.replace(" Kết quả",""))
    return no_of_result

In [None]:
# Tiến hành crawl dữ liệu
crawling_result = pd.DataFrame(columns=['name_res', 'img_link', 'food_link','favorite_tag', 'is_quality_mer','address','promotion'])
n_result = MenuCounts()
while crawling_result.shape[0] < n_result:
    active_page = driver.find_element(By.CSS_SELECTOR,".pagination .active").text
    page_data = CrawlFoodData()
    crawling_result = pd.concat([crawling_result,page_data])
    sleep(2)
    print("Crawling done from page: " + active_page, "| Current Data Frame rows: ",crawling_result.shape[0])
    driver.find_element(By.CSS_SELECTOR,".icon.icon-paging-next").click()
    sleep(2)
# @title Lưu file
crawling_result.to_csv("test_crawl.csv",index=False)

Crawling done from page: 1 | Current Data Frame rows:  30
Crawling done from page: 2 | Current Data Frame rows:  30
Crawling done from page: 3 | Current Data Frame rows:  60


KeyboardInterrupt: 

# II. Crawl details from Foody

In [3]:
## Phân luồng cho list quán ăn từ data món ăn
# crawling_result_final = pd.read_csv("tests.csv")
# crawling_result_final = crawling_result_final. drop_duplicates().reset_index()
# print(crawling_result_final.shape)
crawling_result_final = pd.read_csv("test_crawl.csv")
crawling_result_final.drop_duplicates(subset = ['food_link'],inplace=True)
sp_res_link = crawling_result_final['food_link'].unique().tolist()
fd_res_link = [link.replace("shopeefood.vn","foody.vn") for link in sp_res_link]
df_res = pd.DataFrame()
df_res['fd_res_link'] = fd_res_link
df_res['name_res'] = crawling_result_final[crawling_result_final['food_link'] == sp_res_link]['name_res']
df_res = df_res[~df_res['name_res'].isnull()]
collected_data = pd.read_csv("RestaurantInfo.csv")
df_res = df_res[~df_res['fd_res_link'].isin(collected_data['url'])]
df_res.shape

(26, 2)

## Setup function Crawl

In [4]:
# Functnion crawl dữ liệu
def CrawlRestaurantData(driver1):
    score_names, score_values =[],[]

    ## Đặc tính chấm điểm rating points
    items = driver1.find_elements(By.CSS_SELECTOR, ".microsite-top-points")
    for item in items:

        try:
            name = item.find_element(By.CSS_SELECTOR, '.label').text
        except NoSuchElementException:
            name = np.nan
        score_names.append(name)

        try:
            value = item.find_element(By.CSS_SELECTOR, 'span[class=""]').text
            # value = item.text
        except NoSuchElementException:
            try:
                value = item.find_element(By.CSS_SELECTOR, '.avg-txt-highlight').text
            except NoSuchElementException:
                value = np.nan
        score_values.append(value)
    try:
        df_score = [score_values[score_names.index("Giá cả")],
                    score_values[score_names.index("Vị trí")],
                    score_values[score_names.index("Chất lượng")],
                    score_values[score_names.index("Phục vụ")],
                    score_values[score_names.index("Không gian")]]
    except:
        df_score = [np.nan] * 5
    # Phân loại cửa hàng
    try:
        res_category = driver1.find_element(By.CSS_SELECTOR, ".category-items").text
    except NoSuchElementException:
        res_category = np.nan
    # Phong cách ẩm thực
    try:
        cuisine_category = driver1.find_element(By.CSS_SELECTOR, ".microsite-cuisine").text
    except NoSuchElementException:
        cuisine_category = np.nan
    # Đối tượng quán ăn
    try:
        res_audience = driver1.find_element(By.CSS_SELECTOR, ".audiences").text
    except NoSuchElementException:
        res_audience = np.nan
    # Khu vực quán ăn
    try:
        element = driver1.find_element(By.XPATH, '//span[@itemprop="itemListElement"][meta[@itemprop="position"][number(@content) >= 3]]/a/span[@itemprop="name"]')
        area = element.text
    except NoSuchElementException:
        area = np.nan
    ## Tọa độ địa lý
    try:
        map_iframe = driver1.find_element(By.CSS_SELECTOR, ".microsite-map iframe")
        map_src = map_iframe.get_attribute("src")
        lat, long = re.search(r"q=(-?\d+\.\d+),(-?\d+\.\d+)", map_src).groups()
    except NoSuchElementException:
        lat, long = np.nan, np.nan
    # Thời gian hoạt động
    try:
        open_times_element = driver1.find_element(By.CSS_SELECTOR, '.micro-timesopen')
        open_times = open_times_element.find_element(By.XPATH, "./span[normalize-space() and not(@class)]").text
    except NoSuchElementException:
        open_times = np.nan
    # Số lượt xem
    try:
        total_views = driver1.find_element(By.CSS_SELECTOR, '.total-views span').text
    except NoSuchElementException:
        total_views = np.nan
    # Số bình luận
    try:
        total_comment = driver1.find_element(By.CSS_SELECTOR, '.microsite-review-count').text
    except NoSuchElementException:
        total_comment = np.nan

    try:
        total_like = driver1.find_element(By.CSS_SELECTOR, '._5n6j._5n6l').text
    except NoSuchElementException:
        total_comment = np.nan

    #Danh sách món ăn
    foody_url = str(driver1.current_url)
    sp_food_url = foody_url.replace('www.foody','shopeefood') + '?source_url=foody_ordernow_pc'
    driver1.get(sp_food_url)
    time.sleep(0.5)

    title_menus = []
    food_menus =[]
    prev_scr = 1
    while True:
        driver1.execute_script("window.scrollBy(0, arguments[0]);", 600)
        curr_scr = driver1.execute_script("return window.scrollY;")
        if curr_scr == prev_scr:
            break

        time.sleep(1.5)
        title_menu = driver1.find_elements(By.CSS_SELECTOR, '.title-menu')
        food_menu = driver1.find_elements(By.CSS_SELECTOR, '.item-restaurant-name')

        title_menus += [item.text for item in title_menu]
        food_menus += [item.text for item in food_menu]
        prev_scr = curr_scr
    title_menus = ','.join(list(set(title_menus)))
    food_menus = ','.join(list(set(food_menus)))

    result_list = [res_category, cuisine_category, res_audience, area, lat, long, open_times, total_views, total_comment] + df_score + [title_menus, food_menus]
    return result_list

## Setup function

In [None]:
# Phân chia workload cho các luồng xử lí
def allocate_thread(n_item, n_thread, df):
    result = pd.DataFrame()
    group_size = n_item // n_thread
    remaining = n_item % n_thread

    # Create categories with equal group sizes
    categories = []
    for i in range(1, n_thread+1):
        categories += [i]*group_size
    categories += [n_thread] * remaining

    result['link'] = df.fd_res_link
    result['name_res'] = df.name_res
    result['thread_id'] = categories
    result = result[['thread_id','link','name_res']]
    return result

----Running Thread  1 !----
-------- Opening link 1 https://foody.vn/ho-chi-minh/tocotoco-bubble-tea-le-van-viet in Thread 1
----Running Thread  2 !----
-------- Opening link 1 https://foody.vn/ho-chi-minh/lee-vu-coffee-phan-van-hon in Thread 2
----Running Thread  3 !----
-------- Opening link 1 https://foody.vn/ho-chi-minh/ca-vien-chien-ty-dai-an-vat-ga-ran-com-cuon-kimbap in Thread 3
----Running Thread  4 !----
-------- Opening link 1 https://foody.vn/ho-chi-minh/tra-sua-royaltea-truong-chinh in Thread 4
----Running Thread  5 !----
-------- Opening link 1 https://foody.vn/ho-chi-minh/ep-mix-nuoc-ep-trai-cay in Thread 5
-------- Opening link 2 https://foody.vn/ho-chi-minh/lyly-house-tra-sua-an-vat in Thread 4
-------- Opening link 2 https://foody.vn/ho-chi-minh/com-ga-chao-goi-ga-ta-tuong-vy-nguyen-van-qua in Thread 1
-------- Opening link 2 https://foody.vn/ho-chi-minh/delica-mi-y-pizza-hoang-hoa-tham in Thread 2
-------- Opening link 2 https://foody.vn/ho-chi-minh/the-st-coffee-le-d

In [None]:
# Chức năng của từng luồng:
# duyệt qua từng link của luồng, thực hiện crawl dữ liệu
def thread_function(driver,que, df, thread_order):
    page_prod_features = []
    for i in range (0, df.shape[0]):
        # mở một driver mới
        print("-------- Opening link" ,i+1 , df.iloc[i,1], "in Thread", thread_order)
        driver.get(df.iloc[i,1])
        # crawl data quán ăn
        res_ft = CrawlRestaurantData(driver)
        # trả về kết quả
        features = [thread_order, i+1, df['link'].iloc[i], df['name_res'].iloc[i]]
        feature_all = features + res_ft
        page_prod_features.append(feature_all)
    que.put(page_prod_features)

In [None]:
#FUNCTION TO RUN THREADING OF TAKE DATA AND RETURN IN A LIST
def runInParallbel(func, drivers, df):
    threads = []
    que = Queue()
    for driver in drivers:
        # Run only link_id correspond to thread_id
        thread_id = drivers.index(driver)+1
        df2 = df[df.iloc[:,0] == thread_id]
        print('----Running Thread ', thread_id, "!----")
        t = threading.Thread(target = func, args = (driver, que, df2, thread_id))
        t.start()
        threads.append(t)

    for t in threads:
        t.join()

    results = []
    while not que.empty():
        results.extend(que.get())
    return results

In [None]:
# Khơi tạo các browser
def open_multi_browser(n_browser):
    drivers = []
    for _ in range(n_browser):
        driver = webdriver.Chrome()
        driver.maximize_window()
        sleep(1)
        drivers.append(driver)
    return drivers

## Run and Save

In [None]:
# Khởi tạo tham số cho Process
## Khởi tạo số luồng
n = 5
drivers = open_multi_browser(n)
## Chia nhiệm vụ cho luồng
df = allocate_thread(df_res.shape[0], n, df_res)
## Chạy các luồng đã khởi tạo
all_prod_features = runInParallbel(thread_function, drivers, df)

# Gán tên cho các Feature collect được
data = pd.DataFrame(all_prod_features, columns=['thread_id', 'link_id', 'url','name_res','res_category'
                            ,'cuisine_category','res_audience','area','lat','long','open_time','total_views', 'total_comment'
                            ,"price_score", "position_score", "quality_score", "service_score", "space_score", 'menu_list', 'food_list'])

(3138, 20)

# III. Crawl Commment - bình luận về quán ăn

## Setup main function

In [2]:
df_res = pd.read_csv("RestaurantInfo.csv")
fd_link = [link.replace("shopeefood.vn","foody.vn") for link in df_res['url']]
df_res['url'] = fd_link

In [3]:
# def retry_element():
#     for i in range(10):
#         try:
#             wait.until(EC.presence_of_element_located(("xpath", "/html/body/div[2]/div[2]/div[2]/section/div/div/div/div/div[1]/div/div/div[1]/div/ul/li[{}]/div[1]/div[2]/div[2]/a".format(i))))
#             user_url = driver.find_element("xpath", "/html/body/div[2]/div[2]/div[2]/section/div/div/div/div/div[1]/div/div/div[1]/div/ul/li[{}]/div[1]/div[2]/div[2]/a".format(i)).get_attribute('href')
#             return user_url
#         except:
#             driver.refresh()
#             sleep(5)
#     return np.nan


def CrawlComment(driver, link_restaurant):
    wait = WebDriverWait(driver,10)
    while True:
        try:
            view_more = driver.find_element("xpath", "/html/body/div[2]/div[2]/div[2]/section/div/div/div/div/div[1]/div/div/div[1]/div/div[2]")
            view_more.click()
            sleep(1)
        except:
            break
    no_comments = driver.find_element("xpath", "/html/body/div[2]/div[2]/div[2]/section/div/div/div/div/div[1]/div/div/div[1]/div/div[1]/div/ul/li[1]/a/span").text
    no_comments = 100 if int(no_comments)>100 else int(no_comments)

    # duyệt từng comment lấy thông tin từng comment
    df_comment = pd.DataFrame()
    # print(f"- - Clawling {no_comments} comments!")

    for i in range(1, int(no_comments) + 1):
        
        try:
            wait.until(EC.presence_of_element_located(("xpath", "/html/body/div[2]/div[2]/div[2]/section/div/div/div/div/div[1]/div/div/div[1]/div/ul/li[{}]/div[1]/div[2]/div[2]/a".format(i))))
            user_url = driver.find_element("xpath", "/html/body/div[2]/div[2]/div[2]/section/div/div/div/div/div[1]/div/div/div[1]/div/ul/li[{}]/div[1]/div[2]/div[2]/a".format(i)).get_attribute('href')
            # return user_url
        except:
            user_url =np.nan

        try:
            wait.until(EC.presence_of_element_located(("xpath", "/html/body/div[2]/div[2]/div[2]/section/div/div/div/div/div[1]/div/div/div[1]/div/ul/li[{}]/div[1]/div[2]/div[2]/a".format(i))))
            name = driver.find_element("xpath", "/html/body/div[2]/div[2]/div[2]/section/div/div/div/div/div[1]/div/div/div[1]/div/ul/li[{}]/div[1]/div[2]/div[2]/a".format(i)).text
        except:
            name =np.nan
        
        try:
            wait.until(EC.presence_of_element_located(("xpath", "/html/body/div[2]/div[2]/div[2]/section/div/div/div/div/div[1]/div/div/div[1]/div/ul/li[{}]/div[1]/div[2]/div[3]/a".format(i))))
            platform = driver.find_element("xpath", "/html/body/div[2]/div[2]/div[2]/section/div/div/div/div/div[1]/div/div/div[1]/div/ul/li[{}]/div[1]/div[2]/div[3]/a".format(i)).text
        except:
            platform = np.nan
        
        try:
            wait.until(EC.presence_of_element_located(("xpath", "/html/body/div[2]/div[2]/div[2]/section/div/div/div/div/div[1]/div/div/div[1]/div/ul/li[{}]/div[1]/div[2]/div[3]/span".format(i))))
            datetime = driver.find_element("xpath", "/html/body/div[2]/div[2]/div[2]/section/div/div/div/div/div[1]/div/div/div[1]/div/ul/li[{}]/div[1]/div[2]/div[3]/span".format(i)).text
        except:
            datetime = np.nan
        
        try:
            wait.until(EC.presence_of_element_located(("xpath", "/html/body/div[2]/div[2]/div[2]/section/div/div/div/div/div[1]/div/div/div[1]/div/ul/li[{}]/div[2]/div/span".format(i))))    
            content = driver.find_element("xpath",  "/html/body/div[2]/div[2]/div[2]/section/div/div/div/div/div[1]/div/div/div[1]/div/ul/li[{}]/div[2]/div/span".format(i)).text
        except:
            content = np.nan

        try:
            wait.until(EC.presence_of_element_located(("xpath", "/html/body/div[2]/div[2]/div[2]/section/div/div/div/div/div[1]/div/div/div[1]/div/ul/li[{}]/div[4]/div/div[2]/ul".format(i))))
            interact = driver.find_element("xpath", "/html/body/div[2]/div[2]/div[2]/section/div/div/div/div/div[1]/div/div/div[1]/div/ul/li[{}]/div[4]/div/div[2]/ul".format(i)).text
        except:
            interact = np.nan
        
        sleep(1)
        try:
            user_like = driver.find_element("xpath", "/html/body/div[2]/div[2]/div[2]/section/div/div/div/div/div[1]/div/div/div[1]/div/ul/li[{}]/div[4]/div/div[2]/div/a".format(i)).text
        except:
            user_like = np.nan
            
        score = driver.find_element("xpath", "/html/body/div[2]/div[2]/div[2]/section/div/div/div/div/div[1]/div/div/div[1]/div/ul/li[{}]/div[1]/div[2]/div[1]/span".format(i)).text
    
        comment_of_this_page = pd.DataFrame([[link_restaurant, user_url, name, platform, datetime, content, interact, user_like, score]], 
                        columns = ['res_link', 'user_url', 'name','platform','datetimes', 'content','interact', 'user_like', 'score'])
        df_comment = pd.concat([df_comment, comment_of_this_page])
    return df_comment

#Check condition then Get comment
def GetCommentOfItem(driver, link):
    try:
        driver.find_element("xpath", "/html/body/div[2]/div[2]/section/div")
        return CrawlComment(driver, link)
    except:
        print(f"This place is no longer in Foody")
        return pd.DataFrame()

## Setup Multithreading

In [4]:
# Phân chia workload cho các luồng xử lí
def AllocateThread(n_thread, df):
    result = df.copy()
    n_item = df.shape[0]
    group_size = n_item // n_thread
    remaining = n_item % n_thread

    # Create categories with equal group sizes
    categories = []
    for i in range(1, n_thread+1):
        categories += [i]*group_size
    categories += [n_thread] * remaining

    result['thread_id'] = categories
    return result[['thread_id', 'url']]

# Chức năng của từng luồng:
def thread_function(driver, que, df, thread_order):
    page_prod_features = pd.DataFrame()
    for i in range (0, df.shape[0]):
        # mở một driver mới
        url = df['url'].iloc[i]
        print("----- Open link" , i+1, "in Thread", thread_order, ":", url.replace("https://foody.vn/ho-chi-minh/",""))
        driver.get(url + '/binh-luan')
        # sleep(1)
        # crawl data quán ăn
        res_ft = GetCommentOfItem(driver, url)
        res_ft['thread_order'] = thread_order
        res_ft['link_order'] = i+1
        page_prod_features = pd.concat([page_prod_features, res_ft])
    que.put(page_prod_features)


#FUNCTION TO RUN THREADING OF TAKE DATA AND RETURN IN A LIST
def runInParallbel(func, drivers, df):
    threads = []
    que = Queue()
    for driver in drivers:
        # Run only link_id correspond to thread_id
        thread_id = drivers.index(driver) + 1
        df_allocated = df[df['thread_id'] == thread_id]
        print('- Running Thread', thread_id, "!")
        t = threading.Thread(target = func, args = (driver, que, df_allocated, thread_id))
        # sleep(1)
        t.start()
        threads.append(t)

    for t in threads:
        t.join()

    results = pd.DataFrame()
    while not que.empty():
        results = pd.concat([results, que.get()])
    return results

# Khơi tạo các browser
def open_multi_browser(n_browser):
    drivers = []
    for _ in range(n_browser):
        driver = webdriver.Chrome()
        driver.maximize_window()
        sleep(1)
        drivers.append(driver)
    return drivers

# Đóng browser
def close_multi_browser(drivers):
    for driver in drivers:
        driver.close()

## Run & Save

In [16]:
df_to_crawl = df_res[750:800]
## Khởi tạo số luồng
n = 5
df_final = pd.DataFrame()
## Chỉ crawl 100 items cho mỗi process
items_crawl_each_pocess = 25
n_process = len(df_to_crawl) // items_crawl_each_pocess

## Chạy từng process
for i in range(n_process):
# for i in range(2):
    drivers = open_multi_browser(n)
    ## Chia nhiệm vụ cho luồng
    start_loc = i * items_crawl_each_pocess
    end_loc = (i+1) * items_crawl_each_pocess
    df_allocated = AllocateThread(n, df_to_crawl[start_loc: end_loc])
    # Chạy các luồng đã khởi tạo
    df_comment = runInParallbel(thread_function, drivers, df_allocated)
    close_multi_browser(drivers)
    df_final = pd.concat([df_comment, df_final])
    sleep(1)


- Running Thread 1 !
----- Open link- Running Thread  2 1 in Thread 1 : bep-cua-bul-ga-ran-sot-mi-y-78-ngo-chi-quoc
!
----- Open link- Running Thread 3  1 in Thread !
2 : bun-thit-nuong-ba-ba-304-pham-the-hien
----- Open link- Running Thread  4 1!
 in Thread 3 : ty-muoi-quan-dua-dam-hai-phong-chanh-hung
----- Open link 1 in Thread 4 : ga-ta-dung-cha-bo-da-nang
- Running Thread 5 !
----- Open link 1 in Thread 5 : co-hang-rau-cau-dua
----- Open link 2 in Thread 5 : quan-co-bay-nui-mi-xao-duong-so-17.ch1o3k
----- Open link 2 in Thread 1 : healthy-man-chay-quan-7-duong-so-10
----- Open link 2 in Thread 2 : tra-sua-luan-map-chung-cu-an-quang
----- Open link 2 in Thread 3 : banh-xep-banh-duc-mien-tay-nguyen-du
----- Open link 3 in Thread 5 : tiem-banh-ngon-hui-bakery-banh-ngot-banh-man-hau-giang
----- Open link 3 in Thread 3 : manh-anh-ga-ac-tiem-goi-cuon-vit-lon-huong-lo-2
This place is no longer in Foody
----- Open link 3 in Thread 1 : chicken-cheese-ga-ran-han-quoc-tokbokki-my-y-dong-hung

In [17]:
test = pd.read_csv("comment_data.csv")
test = pd.concat([test,df_final])
test.to_csv("comment_data.csv", index = False)
test.shape

(1911, 11)

# IV. Crawl User's Info

In [None]:
# # 1. Khai báo + mở browser
driver = webdriver.Chrome()
# driver.get("https://www.foody.vn/ho-chi-minh/nha-hang-san-fu-lou/binh-luan")
driver.get("https://www.foody.vn/ho-chi-minh/bubble-tea-tra-sua-xien-que-nguyen-chi-thanh/binh-luan")
driver.maximize_window()
# wait = WebDriverWait(driver,10)

In [None]:
#@Return the number of friends of user
def GetFriends(user_link, option):
    print(f"Get friend of {user_link}: ")
    link_to_friend = user_link + "#friends/" + option
    driver.get(link_to_friend)
    sleep(2.5)
    #Click view more cho đến khi không còn nút viewmore
    while True:
        try:
            view_more = driver.find_element("xpath", "/html/body/div[2]/div[6]/div/div[2]/div/div/div[2]/a")
            view_more.click()
            sleep(1)
        except:
            break
    html_list = driver.find_element("xpath", "/html/body/div[2]/div[6]/div/div[2]/div/div/div[2]/ul")
    number_of_items = len(html_list.find_elements(By.CSS_SELECTOR , ".ng-scope"))
    return number_of_items

In [None]:
df_all_user = pd.DataFrame(columns=['name', 'avatar_url', 'activitiy', 'following', 'follower'])
for link in df_comment['user_url']:
    driver.get(link)

    name = driver.find_element(By.CSS_SELECTOR,".u-name").text
    activities = driver.find_element("xpath" , "/html/body/div[2]/div[6]/div/div[2]/div/div/div[1]/div/div/ul").text
    avatar = driver.find_element(By.CSS_SELECTOR,".u-avatar img").get_attribute("src")
    # following = GetFriends(link, "following")
    follower = GetFriends(link, "follower")

    df_1_user = pd.DataFrame([[link, name, avatar, activities, follower]],
                            columns = ['user_url', 'name', 'avatar_url', 'activitiy', 'friend'])
    df_all_user = pd.concat([df_all_user, df_1_user])
df_all_user

Get friend of https://www.foody.vn/thanh-vien/kubihuy: 
Get friend of https://www.foody.vn/thanh-vien/kubihuy: 
Get friend of https://www.foody.vn/thanh-vien/luong.tran.1694059: 
Get friend of https://www.foody.vn/thanh-vien/luong.tran.1694059: 
Get friend of https://www.foody.vn/thanh-vien/ngoclinh.hotcold: 
Get friend of https://www.foody.vn/thanh-vien/ngoclinh.hotcold: 


Unnamed: 0,name,avatar_url,activitiy,following,follower
0,Kubi Huy,https://images.foody.vn/usr/g837/8364559/avt/c...,10 Hoạt động,40,41
0,Luong Tran,https://images.foody.vn/usr/g59/586669/avt/c20...,2 Hoạt động,75,74
0,CHT NCT Ngọc Linh,https://images.foody.vn/usr/g207/2062476/avt/c...,1 Hoạt động,506,505


# Xử lý kết quả

In [None]:
# Nối với data cũ
final = pd.concat([collected_data,data])
final.shape

In [None]:
final.to_csv("RestaurantInfo.csv",index=False)