In [63]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from datetime import datetime
import time
import random
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
import os


## Get Product Link


In [64]:
# --- Run configuration ---
# Search phrase typed into the site's search box; it is also embedded in the
# name of the links CSV written at the end of this cell.
search_box_text = 'camera picture'

# Landing page opened first by the browser.
website_link = 'https://www.amazon.com/'

# Number of result pages to visit; adjust as needed.
max_pages = 10

# Seconds each explicit wait may block; lowered slightly so the run is faster
# on a good connection.
wait_timeout = 20

# Time of day (no date component) at which this scraping session started.
session_start_time = datetime.now().time()
print(f"Session Start Time: {session_start_time} --------------------------->")

# Start browser and open the landing page.
driver = webdriver.Chrome()
driver.get(website_link)

# Accept/dismiss any initial popup (best-effort): the button is not always
# shown, so a timeout or click failure here is expected and deliberately ignored.
try:
    WebDriverWait(driver, 5).until(
        EC.element_to_be_clickable((By.CLASS_NAME, "a-button-text"))
    ).click()
except Exception:
    pass

# Submit the search phrase through the site's search box.
print('Waiting for search box...')
try:
    search_input = WebDriverWait(driver, wait_timeout).until(
        EC.presence_of_element_located((By.ID, 'twotabsearchtextbox'))
    )
    print('Typing search query...')
    search_input.clear()
    search_input.send_keys(search_box_text)
    search_input.send_keys(Keys.RETURN)
except Exception:
    print("Search box not found or error interacting.")
    driver.quit()
    # Re-raise rather than calling exit(): inside a notebook exit() shuts the
    # kernel down (losing all state), whereas re-raising stops this cell, keeps
    # the kernel alive and shows the real cause in the traceback.
    raise

# Wait for the results container. This stays best-effort: every page is
# re-checked in the collection loop, so a timeout here is only reported.
print('Waiting for first results page...')
try:
    WebDriverWait(driver, wait_timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'div.s-main-slot'))
    )
except Exception:
    # `except Exception` (not a bare `except`) so Ctrl-C / kernel interrupt
    # is not swallowed while waiting.
    print("Results not loaded.")

# Build pagination URLs from the URL the browser landed on after the search.
base_url = driver.current_url
parsed = urlparse(base_url)
qs = parse_qs(parsed.query)
qs.pop('page', None)  # drop any existing page parameter

# The first results page may not carry a `page` parameter at all (page 2 has
# page=2), so the current URL is kept untouched as page 1 and only pages
# 2..max_pages are generated with an explicit parameter.
all_pagination_links = [base_url]

for page_number in range(2, max_pages + 1):
    qs['page'] = [str(page_number)]
    # doseq=True expands each list value into repeated key=value pairs, so the
    # dict returned by parse_qs can be encoded directly.
    page_query = urlencode(qs, doseq=True)
    all_pagination_links.append(urlunparse(parsed._replace(query=page_query)))

print('Pagination Links Count:', len(all_pagination_links))

print('Collecting Product Detail Page Links...')
all_product_links = []

# Links already collected, so duplicates are rejected while scraping.
seen = set()


def _normalize_product_href(href):
    """Return a canonical product URL used as the de-duplication key.

    Tracking suffixes (``/ref=...``) and the query string are removed, e.g.
    ``.../dp/B0CXX/ref=sr_1?x=1`` becomes ``.../dp/B0CXX``.

    Some result links do not point at the product directly but at a redirect
    endpoint that carries the real path in a ``url`` query parameter
    (NOTE(review): observed on sponsored results; confirm against live markup).
    Cutting such a link at ``?`` would reduce every one of them to the same
    bare endpoint, so the wrapped target is unwrapped first.
    """
    parsed_href = urlparse(href)
    wrapped_targets = parse_qs(parsed_href.query).get('url')
    if wrapped_targets:
        target = wrapped_targets[0]  # parse_qs has already percent-decoded it
        if target.startswith('/'):
            href = f"{parsed_href.scheme}://{parsed_href.netloc}{target}"
        elif target.startswith(('http://', 'https://')):
            href = target
    return href.split('/ref=')[0].split('?')[0]


for i, link in enumerate(all_pagination_links):
    print(f"Scraping Page {i+1}...")
    driver.get(link)

    # Wait for the results slot; skip the page entirely if it never appears.
    try:
        WebDriverWait(driver, wait_timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div.s-main-slot'))
        )
    except Exception:
        print(f'Page load timeout for {link}; skipping.')
        continue

    try:
        # Main container of the search results.
        results_container = driver.find_element(By.CSS_SELECTOR, 'div.s-main-slot')
        # All product cards inside the container (separators and other
        # non-result widgets do not match this selector).
        result_items = results_container.find_elements(By.CSS_SELECTOR, "div[data-component-type='s-search-result']")

        print(f"  -> Found {len(result_items)} items on page")

        for item in result_items:
            try:
                # The link inside the title block is the most reliable one.
                link_element = item.find_element(By.CSS_SELECTOR, '[data-cy="title-recipe"] a')
                href = link_element.get_attribute('href')

                if not href:
                    continue

                clean_href = _normalize_product_href(href)

                # Keep first occurrence only; list preserves discovery order.
                if clean_href not in seen:
                    seen.add(clean_href)
                    all_product_links.append(clean_href)

            except Exception:
                # Best-effort: a card without a title link is simply skipped.
                continue

    except Exception as e:
        print(f"Error processing page elements: {e}")

    # Random pause between pages to behave like a human. Kept outside the try
    # so an element error on one page does not remove the delay before the next.
    time.sleep(random.uniform(1.5, 3.5))

print('---------------------------------------------------------')
print('Total Unique Product Links Collected:', len(all_product_links))

# Create a one-column DataFrame from the collected links.
amazon_df = pd.DataFrame(all_product_links, columns=['product_link'])

# Double check: drop duplicates again at the DataFrame level (just in case).
amazon_df = amazon_df.drop_duplicates(subset=['product_link'])

# The search phrase is embedded in the file name so each query gets its own CSV.
output_csv = f'amazon_product_links_{search_box_text}.csv'
amazon_df.to_csv(output_csv, index=False)
print(f'Saved CSV: {output_csv}')

# quit() ends the whole WebDriver session (all windows plus the driver
# process); close() would only close the current window and leave the
# session behind.
driver.quit()
session_end_time = datetime.now().time()
print(f"Session End Time: {session_end_time} --------------------------->")

Session Start Time: 23:49:28.824400 --------------------------->
Waiting for search box...
Waiting for search box...
Typing search query...
Waiting for first results page...
Typing search query...
Waiting for first results page...
Pagination Links Count: 10
Collecting Product Detail Page Links...
Scraping Page 1...
Pagination Links Count: 10
Collecting Product Detail Page Links...
Scraping Page 1...
  -> Found 16 items on page
  -> Found 16 items on page
Scraping Page 2...
Scraping Page 2...
  -> Found 16 items on page
  -> Found 16 items on page
Scraping Page 3...
Scraping Page 3...
  -> Found 16 items on page
  -> Found 16 items on page
Scraping Page 4...
Scraping Page 4...
  -> Found 22 items on page
  -> Found 22 items on page
Scraping Page 5...
Scraping Page 5...
  -> Found 16 items on page
  -> Found 16 items on page
Scraping Page 6...
Scraping Page 6...
  -> Found 16 items on page
  -> Found 16 items on page
Scraping Page 7...
Scraping Page 7...
  -> Found 22 items on page
  -> 

## Get Data from product link

In [65]:
# Sanity check: re-read the links CSV written by the previous cell, report how
# many rows it holds and preview the first few.
df = pd.read_csv(output_csv)
print(df.shape[0])
df.head()

160


Unnamed: 0,product_link
0,https://www.amazon.com/4K-Digital-Camera-Photo...
1,https://www.amazon.com/Digital-Cameras-Photogr...
2,https://www.amazon.com/Autofocus-Vlogging-Phot...
3,https://www.amazon.com/KODAK-Friendly-FZ45-WH-...
4,https://www.amazon.com/Yatao-Photography-Vlogg...


In [66]:

# --- 1. Inputs: set the search phrase here ---
# As written this re-binds the name to itself, so the variable must already
# exist in the kernel (it is defined in the link-collection cell).
search_box_text = search_box_text
# ---------------------------------------------

# Time of day at which this detail-scraping session started.
session_start_time = datetime.now().time()
print(f"Session Start Time: {session_start_time} ---------------------------> ")

# Read the CSV of product links; the file name must match the one that was
# actually written for this search phrase.
links_csv_name = f'amazon_product_links_{search_box_text}.csv'
try:
    df_product_links = pd.read_csv(links_csv_name)
except FileNotFoundError:
    print("Warning: CSV file not found, creating empty DataFrame for demo.")
    df_product_links = pd.DataFrame({'product_link': []})

all_product_links = df_product_links.loc[:, 'product_link'].tolist()
print("Collecting Individual Product Detail Information")

# Start browser
driver = webdriver.Chrome()

# Accumulators filled by the loop below.
complete_product_details = []      # one row (list of field values) per parsed product
unavailable_products = []          # links that were unavailable OR failed to parse
successful_parsed_urls_count = 0
complete_failed_urls_count = 0     # pages that raised before a row could be stored
first_time = True                  # the popup is only handled on the first page

# The category is taken directly from the search phrase.
product_category = search_box_text

for product_page_link in all_product_links:
    try:
        driver.get(product_page_link)
        if first_time:
            first_time = False
            # Best-effort popup dismissal. It must not be allowed to raise:
            # when no popup appears the wait times out, and without this inner
            # try the first product would be recorded as failed and skipped.
            try:
                WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.CLASS_NAME, "a-button-text"))
                ).click()
            except Exception:
                pass

        # Wait until the page has loaded (the product title is present).
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "productTitle")))

        # NOTE: every field below uses `except Exception` rather than a bare
        # `except`, so a keyboard/kernel interrupt is not silently swallowed.

        # --- 1. Title ---
        try:
            title = driver.find_element(By.ID, 'productTitle').text.strip()
        except Exception:
            title = ""

        # --- 2. Brand ---
        try:
            brand = driver.find_element(By.ID, 'bylineInfo').text.strip()
            # Strip only the leading "Visit the" and trailing "Store" wrapper;
            # replacing these words anywhere would mangle brand names that
            # themselves contain "Store".
            if brand.startswith('Visit the'):
                brand = brand[len('Visit the'):]
            if brand.endswith('Store'):
                brand = brand[:-len('Store')]
            brand = brand.strip()
        except Exception:
            brand = ""

        # --- 3. Price ---
        try:
            # Prefer the discounted price block (priceToPay); its text spans
            # two lines, which are joined with a dot.
            price_element = driver.find_element(By.CSS_SELECTOR, '.priceToPay')
            price_text = price_element.text.strip()
            price = price_text.replace('\n', '.')
        except Exception:
            try:
                # No discount block: fall back to the hidden regular price text.
                price = driver.find_element(By.CSS_SELECTOR, '#corePriceDisplay_desktop_feature_div .a-price .a-offscreen').get_attribute("textContent").strip()
            except Exception:
                price = "N/A"

        # --- 4. Availability ---
        try:
            availability = driver.find_element(By.ID, 'availability').text.strip()
            if "Currently unavailable" in availability:
                # Still stored as a parsed product below; the link is
                # additionally logged in the unavailable list.
                unavailable_products.append(product_page_link)
                print(f"Product Unavailable: {title[:20]}...")
        except Exception:
            availability = "Unknown"

        # --- 5. Average rating ---
        try:
            visible_rating = driver.find_element(By.CSS_SELECTOR, '#averageCustomerReviews span.a-size-base.a-color-base')
            avg_rating = visible_rating.text.strip()
        except Exception:
            avg_rating = ""

        # --- 6. Total ratings (first token of the review-count text) ---
        try:
            total_ratings = driver.find_element(By.ID, 'acrCustomerReviewText').text.split(' ')[0]
            total_ratings = total_ratings.replace(',', '')
        except Exception:
            total_ratings = "0"

        # --- 7. Discount (stored as a fraction, e.g. "-25%" -> 0.25) ---
        try:
            discount = driver.find_element(By.CLASS_NAME, 'savingsPercentage').text.replace('-', '').replace('%', '')
            discount = int(discount) / 100
        except Exception:
            discount = 0.0

        successful_parsed_urls_count += 1
        print(f"URL {successful_parsed_urls_count} | Cat: {product_category} | Price: {price}")

        # Store the row; the value order must match the column list used when
        # the DataFrame is built.
        complete_product_details.append([
            product_page_link,
            title,
            brand,
            price,
            discount,
            avg_rating,
            total_ratings,
            availability,
            product_category
        ])

        # Random pause between product pages.
        time.sleep(random.uniform(1, 3))

    except Exception as e:
        print(f"Failed URL: {product_page_link} | Error: {e}")
        unavailable_products.append(product_page_link)
        complete_failed_urls_count += 1

# --- Creating DataFrames ---
# Column order mirrors the order in which values are appended per product row.
columns = [
    'product_link', 'title', 'brand',
    'price', 'discount', 'avg_rating',
    'total_ratings', 'availability', 'category',
]

# Parsed products, one row each.
df = pd.DataFrame(data=complete_product_details, columns=columns)
# Links recorded as unavailable or failed, in a single 'link' column.
df_unavailable = pd.DataFrame(data=unavailable_products, columns=['link'])

# --- Save Files (append when the file already exists, otherwise create it) ---
main_csv_file = 'amazon_product_data.csv'
error_csv_file = 'amazon_unavailable_products.csv'

# 1. Main product data: an existing file is extended without repeating the
#    header row; a missing file is created with the header.
main_file_exists = os.path.exists(main_csv_file)
df.to_csv(
    main_csv_file,
    mode='a' if main_file_exists else 'w',
    header=not main_file_exists,
    index=False,
    encoding='utf-8-sig',
)
if main_file_exists:
    print(f"--> Appended {len(df)} new rows to {main_csv_file}")
else:
    print(f"--> Created new file {main_csv_file} with {len(df)} rows")

# 2. Unavailable / errored links: same append-or-create rule, but nothing is
#    written when there are no such links.
if not df_unavailable.empty:
    error_file_exists = os.path.exists(error_csv_file)
    df_unavailable.to_csv(
        error_csv_file,
        mode='a' if error_file_exists else 'w',
        header=not error_file_exists,
        index=False,
        encoding='utf-8-sig',
    )

# --- Stats ---
print("Total product pages processed this session: ", len(all_product_links))
print("Final Total Products collected this session: ", len(df))
# Report the dedicated failure counter. len(unavailable_products) would
# over-count, because that list also receives links of products that parsed
# successfully but were marked "Currently unavailable".
print("Total Failed this session: ", complete_failed_urls_count)
print("Total unavailable/failed links logged this session: ", len(unavailable_products))

# quit() ends the whole WebDriver session (all windows plus the driver
# process); close() would only close the current window.
driver.quit()
session_end_time = datetime.now().time()
print(f"Session End Time: {session_end_time} ---------------------------> ")

Session Start Time: 23:50:30.636499 ---------------------------> 
Collecting Individual Product Detail Information
URL 1 | Cat: camera picture | Price: $82.99
URL 1 | Cat: camera picture | Price: $82.99
URL 2 | Cat: camera picture | Price: $123.48
URL 2 | Cat: camera picture | Price: $123.48
URL 3 | Cat: camera picture | Price: $46.99
URL 3 | Cat: camera picture | Price: $46.99
URL 4 | Cat: camera picture | Price: $119.99
URL 4 | Cat: camera picture | Price: $119.99
URL 5 | Cat: camera picture | Price: $59.98
URL 5 | Cat: camera picture | Price: $59.98
URL 6 | Cat: camera picture | Price: N/A
URL 6 | Cat: camera picture | Price: N/A
URL 7 | Cat: camera picture | Price: $56.99
URL 7 | Cat: camera picture | Price: $56.99
URL 8 | Cat: camera picture | Price: $83.95
URL 8 | Cat: camera picture | Price: $83.95
URL 9 | Cat: camera picture | Price: $139.99
URL 9 | Cat: camera picture | Price: $139.99
URL 10 | Cat: camera picture | Price: $104.99
URL 10 | Cat: camera picture | Price: $104.99
U

In [67]:
# Load the product data file produced by the scraping cell.
df_amazon = pd.read_csv("amazon_product_data.csv")

# Add a 1-based sequential id as the first column.
row_ids = range(1, len(df_amazon) + 1)
df_amazon.insert(loc=0, column='id', value=row_ids)

# Write the numbered table to a separate "final" file and report its size.
df_amazon.to_csv("amazon_product_data_final.csv", index=False)
print("Total records in final file:", len(df_amazon))

Total records in final file: 1033


# Problem Statement: Create a new route for the E-commerce Tool

In this exercise, you will create a new route for the E-commerce Tool. This new route will be called `small-talk` and will be used to handle colloquial conversations with the user.

### **Tasks**:
1. Create a new route called `small-talk` in the `router.py` file with the following utterances:
    - "How are you?"
    - "What is your name?"
    - "Are you a robot?"
    - "What are you?"
    - "What do you do?"

2. Create a new module called `smalltalk.py` that contains a `talk` function, which uses Groq to answer the user's question.

3. Update the `ask` function in `main.py` to handle the new route `small-talk`.