In [None]:
import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urlparse, parse_qs

In [None]:
# Scrape product data from a Tokopedia search result page
def get_tokopedia_data(datas, url, driver, scroll_steps=25, scroll_pause=0.5):
    """Scrape the product cards of one Tokopedia search-result page.

    Opens ``url`` with ``driver``, waits for the ``#zeus-root`` element,
    scrolls down the page step by step, and only then parses the page source.
    One dict per complete product card is appended to ``datas``.

    Args:
        datas: List extended in place with dicts holding the keys
            ``product_name``, ``rent_price``, ``url_photo`` and ``link``.
        url: Search-result page URL to open.
        driver: Selenium WebDriver used to load and scroll the page.
        scroll_steps: Number of 250px scroll steps performed before parsing.
        scroll_pause: Seconds to sleep before each scroll step.

    Returns:
        None. Results are delivered through ``datas``.

    Raises:
        selenium.common.exceptions.TimeoutException: If ``#zeus-root`` is not
            present within 10 seconds.
    """
    # Load the page and wait until the root container exists.
    driver.get(url)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#zeus-root"))
    )
    time.sleep(2)

    # Scroll BEFORE taking the HTML snapshot. Parsing first (as was done
    # previously) meant anything added to the DOM while scrolling was never
    # seen by the parser. Presumably cards/images are lazy-loaded on scroll.
    for _ in range(scroll_steps):
        time.sleep(scroll_pause)
        driver.execute_script("window.scrollBy(0,250)")

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Find all item cards on the page and extract name, link, price and image.
    for element in soup.find_all('div', class_='pcv3__container css-1izdl9e'):
        name = element.find('div', class_='prd_link-product-name css-3um8ox')
        link = element.find('a', class_='pcv3__info-content css-gwkf0u')
        harga = element.find('div', class_='prd_link-product-price css-h66vau')
        gambar = element.find('img', class_='css-1q90pod')

        # Skip cards that are missing any of the four expected elements.
        if name is None or link is None or harga is None or gambar is None:
            continue

        # A tag can exist without the attribute; skip instead of raising KeyError.
        photo_url = gambar.get('src')
        product_url = link.get('href')
        if not photo_url or not product_url:
            continue

        datas.append({
            'product_name': name.text,
            'rent_price': harga.text,
            'url_photo': photo_url,
            'link': product_url,
        })

In [None]:
def get_dataframe(query, pages):
    """Scrape ``pages`` Tokopedia search-result pages for ``query``.

    Args:
        query: Search term. It is inserted into the URL verbatim, so it must
            already be URL-encoded (e.g. ``'tenda%20gunung'``).
        pages: Number of result pages to scrape, starting at page 1.

    Returns:
        pandas.DataFrame with one row per scraped product card.

    Raises:
        Any exception raised while scraping is propagated; the browser is
        still closed first.
    """
    driver = webdriver.Chrome()
    datas = []

    try:
        for page in range(1, pages + 1):
            url = (
                'https://www.tokopedia.com/search'
                f'?navsource=&page={page}&q={query}'
                '&srp_component_id=02.01.00.00&srp_page_id=&srp_page_title='
            )
            get_tokopedia_data(datas, url, driver)
    finally:
        # Always release the browser, even when a page fails to load or parse;
        # otherwise every failed run leaves a Chrome process behind.
        driver.quit()

    return pd.DataFrame(datas)

In [None]:
# Hiking gear searches. Query strings are passed already URL-encoded
# (%20 stands for a space); three result pages are scraped per query.
tenda_gunung_df = get_dataframe(query='tenda%20gunung', pages=3)
alat_mendaki_df = get_dataframe(query='alat%20mendaki', pages=3)
tongkat_gunung_df = get_dataframe(query='tongkat%20gunung', pages=3)
matras_gunung_df = get_dataframe(query='matras%20gunung', pages=3)
alat_masak_gunung_df = get_dataframe(query='alat%20masak%20gunung', pages=3)

# Cosplay searches, same page count.
baju_cosplay_anak_df = get_dataframe(query='baju%20cosplay%20anak', pages=3)
cosplay_anime_anak_df = get_dataframe(query='cosplay%20anime%20anak', pages=3)
sepatu_cosplay_df = get_dataframe(query='sepatu%20cosplay', pages=3)
aksesoris_anak_df = get_dataframe(query='aksesoris%20anak', pages=3)
kostum_anak_df = get_dataframe(query='kostum%20anak', pages=3)


In [None]:
import os

# Output folders for the scraped CSV files, one per product category.
hiking_dir = 'scraped/hiking/'
cosplay_dir = 'scraped/cosplay/'

# Create both folders up front; exist_ok keeps re-running this cell harmless.
for _output_dir in (hiking_dir, cosplay_dir):
    os.makedirs(_output_dir, exist_ok=True)

In [None]:
def save_df_to_csv(df_dict, category_dir):
    """Write each DataFrame in ``df_dict`` to ``<category_dir>/<name>.csv``.

    Args:
        df_dict: Mapping of file stem (str) to the pandas DataFrame to save.
        category_dir: Destination directory. It is created when missing, so
            the function does not depend on a separate setup cell having run.

    Returns:
        None. One CSV file (without the index column) is written per entry.
    """
    # An empty string means "current directory"; os.makedirs('') would raise.
    if category_dir:
        os.makedirs(category_dir, exist_ok=True)

    for df_name, df in df_dict.items():
        file_name = df_name + '.csv'
        df.to_csv(os.path.join(category_dir, file_name), index=False)

In [None]:
# Each key is used by save_df_to_csv as the CSV file name (key + '.csv').
hiking_dfs = dict([
    ('tenda_gunung', tenda_gunung_df),
    ('alat_mendaki', alat_mendaki_df),
    ('tongkat_gunung', tongkat_gunung_df),
    ('matras_gunung', matras_gunung_df),
    ('alat_masak_gunung', alat_masak_gunung_df),
])

# NOTE(review): the first two keys do not match the searches that produced
# their frames ('topeng_cosplay' holds the 'baju cosplay anak' results and
# 'kostum' holds the 'cosplay anime anak' results). Keys are left unchanged
# because they become output file names; confirm whether they should be renamed.
cosplay_dfs = dict([
    ('topeng_cosplay', baju_cosplay_anak_df),
    ('kostum', cosplay_anime_anak_df),
    ('sepatu_cosplay', sepatu_cosplay_df),
    ('aksesoris_anak', aksesoris_anak_df),
    ('kostum_anak', kostum_anak_df),
])

In [None]:
# Write the hiking frames into the hiking folder
save_df_to_csv(df_dict=hiking_dfs, category_dir=hiking_dir)

# Write the cosplay frames into the cosplay folder
save_df_to_csv(df_dict=cosplay_dfs, category_dir=cosplay_dir)