In [None]:
import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urlparse, parse_qs

In [2]:
# Define a function to scrape data from a Tokopedia search result page
def get_tokopedia_data(datas, url, driver, timeout=10, scroll_steps=25, scroll_pause=0.5):
    """Scrape the product cards of one search-result page into ``datas``.

    Args:
        datas: List that receives one dict per product card, with the keys
            ``'name'``, ``'link'``, ``'price'`` and ``'img'``. Mutated in place.
        url: Address of the search-result page to load.
        driver: Selenium WebDriver used to load and scroll the page.
        timeout: Seconds to wait for the ``#zeus-root`` element to be present.
        scroll_steps: Number of 250-pixel scroll increments to perform.
        scroll_pause: Seconds to sleep before each scroll increment.

    Returns:
        None. Results are appended to ``datas``.

    Raises:
        selenium.common.exceptions.TimeoutException: If ``#zeus-root`` is not
            present within ``timeout`` seconds.
    """
    # Load the page and wait until its root container exists.
    driver.get(url)
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#zeus-root"))
    )
    time.sleep(2)

    # Scroll down in small steps so that content the page only renders on
    # scroll (presumably lazy-loaded cards and images) ends up in the DOM.
    for _ in range(scroll_steps):
        time.sleep(scroll_pause)
        driver.execute_script("window.scrollBy(0,250)")

    # Snapshot the DOM only AFTER scrolling; a snapshot taken earlier cannot
    # contain anything the scrolling caused the page to add.
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Find all item cards on the page and scrape name, link, price and image URL from each.
    for card in soup.find_all('div', class_='pcv3__container css-1izdl9e'):
        name = card.find('div', class_='prd_link-product-name css-3um8ox')
        link = card.find('a', class_='pcv3__info-content css-gwkf0u')
        price = card.find('div', class_='prd_link-product-price css-h66vau')
        image = card.find('img', class_='css-1q90pod')

        # Skip cards that lack any of the four elements.
        if not (name and link and price and image):
            continue

        # Skip (rather than crash on) cards whose anchor or image tag has no
        # URL attribute, so one malformed card cannot abort a long scrape.
        href = link.get('href')
        src = image.get('src')
        if href is None or src is None:
            continue

        datas.append({
            'name': name.text,
            'link': href,
            'price': price.text,
            'img': src
        })

In [3]:
def get_dataframe(query, pages):
    """Scrape ``pages`` search-result pages for ``query`` into a DataFrame.

    Args:
        query: Search term, inserted into the URL as-is, so it must already
            be URL-encoded (e.g. ``'sleeping%20bag'``).
        pages: Number of result pages to fetch, starting from page 1.

    Returns:
        pandas.DataFrame built from the records that ``get_tokopedia_data``
        appended for every fetched page.
    """
    driver = webdriver.Chrome()
    datas = []

    try:
        # Search-result pages are numbered from 1.
        for page_number in range(1, pages + 1):
            url = f'https://www.tokopedia.com/search?navsource=&page={page_number}&q={query}&srp_component_id=02.01.00.00&srp_page_id=&srp_page_title='
            get_tokopedia_data(datas, url, driver)
    finally:
        # Always shut the browser down, even when a page fails, so a crashed
        # run does not leave the driver session open.
        driver.quit()

    return pd.DataFrame(datas)

In [4]:
# Number of search-result pages fetched for every query below.
PAGES_PER_QUERY = 100

# Hiking gear. Multi-word queries are written pre-encoded ('%20' for a space).
# Each result is bound to its own name as soon as its scrape finishes.
tenda_df = get_dataframe('tenda', PAGES_PER_QUERY)
kemeja_tactical_df = get_dataframe('kemeja%20tactical', PAGES_PER_QUERY)
celana_tactical_df = get_dataframe('celana%20tactical', PAGES_PER_QUERY)
sepatu_mendaki_df = get_dataframe('sepatu%20mendaki', PAGES_PER_QUERY)
kaos_kaki_df = get_dataframe('kaos%20kaki', PAGES_PER_QUERY)
sarung_tangan_df = get_dataframe('sarung%20tangan', PAGES_PER_QUERY)
topi_gunung_df = get_dataframe('topi%20gunung', PAGES_PER_QUERY)
trekking_pole_df = get_dataframe('trekking%20pole', PAGES_PER_QUERY)
sleeping_bag_df = get_dataframe('sleeping%20bag', PAGES_PER_QUERY)
gaiter_df = get_dataframe('gaiter', PAGES_PER_QUERY)
portable_kitchenware_df = get_dataframe('alat%20masak%20portable', PAGES_PER_QUERY)
survival_kit_df = get_dataframe('survival%20kit', PAGES_PER_QUERY)

# Cosplay items.
topeng_cosplay_df = get_dataframe('cosplay%20mask', PAGES_PER_QUERY)
kostum_df = get_dataframe('costume', PAGES_PER_QUERY)
wig_df = get_dataframe('wig', PAGES_PER_QUERY)
sepatu_cosplay_df = get_dataframe('cosplay%20shoes', PAGES_PER_QUERY)
aksesoris_cosplay_df = get_dataframe('aksesoris%20cosplay', PAGES_PER_QUERY)

In [8]:
import os
# Output folders for the scraped CSV files, one per product category.
hiking_dir = '../data/fathur_scraped/hiking/'
cosplay_dir = '../data/fathur_scraped/cosplay/'

# Create both folders (including parents); existing folders are left alone.
for output_dir in (hiking_dir, cosplay_dir):
    os.makedirs(output_dir, exist_ok=True)

In [10]:
def save_df_to_csv(df_dict, category_dir):
    """Write each DataFrame in ``df_dict`` to ``<category_dir>/<key>.csv``.

    Args:
        df_dict: Mapping of base file name (without extension) to DataFrame.
        category_dir: Directory that receives the CSV files.

    The DataFrame index is not written to the files.
    """
    for label, frame in df_dict.items():
        target_path = os.path.join(category_dir, label + '.csv')
        frame.to_csv(target_path, index=False)

In [11]:
# Map each output file's base name to its scraped DataFrame, per category.
hiking_dfs = dict(
    tenda=tenda_df,
    kemeja_tactical=kemeja_tactical_df,
    celana_tactical=celana_tactical_df,
    sepatu_mendaki=sepatu_mendaki_df,
    kaos_kaki=kaos_kaki_df,
    sarung_tangan=sarung_tangan_df,
    topi_gunung=topi_gunung_df,
    trekking_pole=trekking_pole_df,
    sleeping_bag=sleeping_bag_df,
    gaiter=gaiter_df,
    portable_kitchenware=portable_kitchenware_df,
    survival_kit=survival_kit_df,
)

cosplay_dfs = dict(
    topeng_cosplay=topeng_cosplay_df,
    kostum=kostum_df,
    wig=wig_df,
    sepatu_cosplay=sepatu_cosplay_df,
    aksesoris_cosplay=aksesoris_cosplay_df,
)

In [12]:
# Save the hiking DataFrames first, then the cosplay ones, each into its own folder.
for frames_by_name, output_folder in (
    (hiking_dfs, hiking_dir),
    (cosplay_dfs, cosplay_dir),
):
    save_df_to_csv(frames_by_name, output_folder)