In [1]:
import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import os
from urllib.parse import urlparse, parse_qs
import logging

In [2]:
# Input directories: each holds one CSV file per product category
# (hiking gear and cosplay items respectively).
hiking_dir = '../data/fathur_scraped/hiking/'
cosplay_dir = '../data/fathur_scraped/cosplay/'
# Last expression of the cell, so the list of hiking CSV files is displayed.
os.listdir(hiking_dir)

['celana_tactical.csv',
 'gaiter.csv',
 'kaos_kaki.csv',
 'kemeja_tactical.csv',
 'portable_kitchenware.csv',
 'sarung_tangan.csv',
 'sepatu_mendaki.csv',
 'sleeping_bag.csv',
 'survival_kit.csv',
 'tenda.csv',
 'topi_gunung.csv',
 'trekking_pole.csv']

In [3]:
# Display the CSV files available in the cosplay input directory.
os.listdir(cosplay_dir)

['aksesoris_cosplay.csv',
 'kostum.csv',
 'sepatu_cosplay.csv',
 'topeng_cosplay.csv',
 'wig.csv']

In [4]:
def csv_to_df(dir, file):
    """Load a single CSV file into a DataFrame.

    Args:
        dir: Directory that contains the CSV file.
        file: File name (including extension) inside ``dir``.

    Returns:
        pandas.DataFrame with the parsed contents of ``dir/file``.
    """
    csv_path = os.path.join(dir, file)
    frame = pd.read_csv(csv_path)
    return frame

In [5]:
# Hiking
celana_tactical_df = csv_to_df(hiking_dir, 'celana_tactical.csv')
gaiter_df = csv_to_df(hiking_dir, 'gaiter.csv')
kaos_kaki_df = csv_to_df(hiking_dir, 'kaos_kaki.csv')
kemeja_tactical_df = csv_to_df(hiking_dir, 'kemeja_tactical.csv')
portable_kitchenware_df = csv_to_df(hiking_dir, 'portable_kitchenware.csv')
sarung_tangan_df = csv_to_df(hiking_dir, 'sarung_tangan.csv')
sepatu_mendaki_df = csv_to_df(hiking_dir, 'sepatu_mendaki.csv')
sleeping_bag_df = csv_to_df(hiking_dir, 'sleeping_bag.csv')
survival_kit_df = csv_to_df(hiking_dir, 'survival_kit.csv')
tenda_df = csv_to_df(hiking_dir, 'tenda.csv')
topi_gunung_df = csv_to_df(hiking_dir, 'topi_gunung.csv')
trekking_pole_df = csv_to_df(hiking_dir, 'trekking_pole.csv')

# Cosplay
aksesoris_cosplay_df = csv_to_df(cosplay_dir, 'aksesoris_cosplay.csv')
kostum_df = csv_to_df(cosplay_dir, 'kostum.csv')
sepatu_cosplay_df = csv_to_df(cosplay_dir, 'sepatu_cosplay.csv')
topeng_cosplay_df = csv_to_df(cosplay_dir, 'topeng_cosplay.csv')
wig_df = csv_to_df(cosplay_dir, 'wig.csv')

In [6]:
# Column order of the records produced by get_desc; also used so that an
# empty result still carries the expected header.
_DESC_COLUMNS = ['product_name', 'rent_price', 'seller', 'img_url', 'desc', 'link']


def _text_or_na(tag):
    """Return the element's text, or 'N/A' when the element was not found."""
    return tag.text if tag else 'N/A'


def _extract_product(soup, link):
    """Build one product record (dict) from a parsed product page."""
    product_name = soup.find('h1', class_='css-ga6qsf')
    rent_price = soup.find('div', class_='price')
    seller = soup.find('h2', class_='css-1wdzqxj-unf-heading')
    img = soup.find('img', class_='css-1c345mg')
    desc = soup.find('div', attrs={'data-testid': 'lblPDPDescriptionProduk'})

    return {
        'product_name': _text_or_na(product_name),
        'rent_price': _text_or_na(rent_price),
        'seller': _text_or_na(seller),
        # .get() so an <img> without a src attribute yields 'N/A' instead of
        # raising KeyError and discarding the whole record.
        'img_url': img.get('src', 'N/A') if img else 'N/A',
        'desc': desc.get_text(separator=' ') if desc else 'N/A',
        'link': link,
    }


def get_desc(df, page_load_wait=2, request_delay=1):
    """Visit every unique product link in ``df`` and scrape its details.

    Links that carry the real product URL in an ``r`` query parameter are
    unwrapped first; each resulting URL is visited at most once per call.
    Pages that fail to load or parse are logged and skipped, so one bad link
    does not abort the run.

    Args:
        df: DataFrame with a 'link' column of product URLs.
        page_load_wait: Seconds to wait after opening a page before reading
            its HTML.
        request_delay: Seconds to pause after each successfully scraped page.

    Returns:
        pandas.DataFrame with one row per scraped page and the columns
        product_name, rent_price, seller, img_url, desc and link. Elements
        that are missing from a page are recorded as 'N/A'.

    Raises:
        KeyError: if ``df`` has no 'link' column.
    """
    logger = logging.getLogger(__name__)
    records = []
    visited_urls = set()

    # Read the column before launching Chrome so that a missing 'link'
    # column cannot leave a browser process behind.
    urls = df['link']

    driver = webdriver.Chrome()
    try:
        for link in urls:
            # Missing values cannot be visited; skip them without a round-trip
            # to the browser.
            if pd.isna(link):
                continue

            try:
                # Unwrap redirect-style links: the target is in the 'r' param.
                query_params = parse_qs(urlparse(str(link)).query)
                direct_url = query_params.get('r', [link])[0]

                if direct_url in visited_urls:
                    continue

                driver.get(direct_url)
                time.sleep(page_load_wait)
                soup = BeautifulSoup(driver.page_source, 'html.parser')

                records.append(_extract_product(soup, link))
                # Only mark as visited once the page was scraped successfully,
                # so a later duplicate of a failed link is retried.
                visited_urls.add(direct_url)
                time.sleep(request_delay)
            except Exception as exc:
                # Best-effort scraping: report the failure and move on.
                logger.warning('Skipping %s: %s', link, exc)
    finally:
        # Always release the browser, even on KeyboardInterrupt.
        driver.quit()

    return pd.DataFrame(records, columns=_DESC_COLUMNS)


In [7]:
# Scrape product details for every category. Each get_desc call starts its
# own Chrome session and, by default, sleeps about 3 seconds per page it
# visits, so this cell is long-running.

# Hiking
tenda_desc_df = get_desc(tenda_df)
kemeja_tactical_desc_df = get_desc(kemeja_tactical_df)
celana_tactical_desc_df = get_desc(celana_tactical_df)
sepatu_mendaki_desc_df = get_desc(sepatu_mendaki_df)
kaos_kaki_desc_df = get_desc(kaos_kaki_df)
sarung_tangan_desc_df = get_desc(sarung_tangan_df)
topi_gunung_desc_df = get_desc(topi_gunung_df)
trekking_pole_desc_df = get_desc(trekking_pole_df)
sleeping_bag_desc_df = get_desc(sleeping_bag_df)
gaiter_desc_df = get_desc(gaiter_df)
portable_kitchenware_desc_df = get_desc(portable_kitchenware_df)
survival_kit_desc_df = get_desc(survival_kit_df)

# Cosplay
topeng_cosplay_desc_df = get_desc(topeng_cosplay_df)
kostum_desc_df = get_desc(kostum_df)
wig_desc_df = get_desc(wig_df)
sepatu_cosplay_desc_df = get_desc(sepatu_cosplay_df)
aksesoris_cosplay_desc_df = get_desc(aksesoris_cosplay_df)

In [8]:
# Map output file stems to the scraped DataFrames. save_df_to_csv appends
# '.csv' to each key to build the file name.
hiking_desc_dfs = {
    'tenda_desc': tenda_desc_df,
    'kemeja_tactical_desc': kemeja_tactical_desc_df,
    'celana_tactical_desc': celana_tactical_desc_df,
    'sepatu_mendaki_desc': sepatu_mendaki_desc_df,
    'kaos_kaki_desc': kaos_kaki_desc_df,
    'sarung_tangan_desc': sarung_tangan_desc_df,
    'topi_gunung_desc': topi_gunung_desc_df,
    'trekking_pole_desc': trekking_pole_desc_df,
    'sleeping_bag_desc': sleeping_bag_desc_df,
    'gaiter_desc': gaiter_desc_df,
    'portable_kitchenware_desc': portable_kitchenware_desc_df,
    'survival_kit_desc': survival_kit_desc_df,
}

# Same mapping for the cosplay categories.
cosplay_desc_dfs = {
    'topeng_cosplay_desc': topeng_cosplay_desc_df,
    'kostum_desc': kostum_desc_df,
    'wig_desc': wig_desc_df,
    'sepatu_cosplay_desc': sepatu_cosplay_desc_df,
    'aksesoris_cosplay_desc': aksesoris_cosplay_desc_df,
}

In [9]:
# Output directories for the scraped descriptions. The paths are relative,
# so they resolve against the notebook's current working directory.
hiking_desc_dir = 'new_scraped_desc/hiking/'
cosplay_desc_dir = 'new_scraped_desc/cosplay/'
# exist_ok=True makes this cell safe to re-run when the folders already exist.
os.makedirs(hiking_desc_dir, exist_ok=True)
os.makedirs(cosplay_desc_dir, exist_ok=True)

In [10]:
def save_df_to_csv(df_dict, category_dir):
    """Write each DataFrame in ``df_dict`` to ``<category_dir>/<name>.csv``.

    Args:
        df_dict: Mapping of file stem (without extension) to DataFrame.
        category_dir: Destination directory. It is created, including any
            missing parents, when it does not exist yet. An empty string
            means the current working directory.

    Returns:
        None. Files are written without the DataFrame index.
    """
    # Create the target folder here so the function does not depend on
    # another cell having been run first. os.makedirs('') raises, hence the
    # guard for the "current directory" case.
    if category_dir:
        os.makedirs(category_dir, exist_ok=True)

    for df_name, df in df_dict.items():
        df.to_csv(os.path.join(category_dir, f'{df_name}.csv'), index=False)

In [11]:
# Save hiking DataFrames
save_df_to_csv(hiking_desc_dfs, hiking_desc_dir)

# Save cosplay DataFrames
save_df_to_csv(cosplay_desc_dfs, cosplay_desc_dir)