In [None]:
# Working version: successfully retrieves product details.

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from datetime import datetime
import time


def _scrape_product_detail(driver):
    """Read one product's fields from the page the driver is currently on.

    Each field is looked up independently; a field whose lookup fails for
    any reason is recorded as "N/A" so one missing element does not discard
    the whole product.

    Args:
        driver: WebDriver already switched to the product-detail window.

    Returns:
        dict with keys "Title", "Short_Desc" and "Full_desc" (all str).
    """
    # Product title.
    try:
        title = driver.find_element(By.CLASS_NAME, 'last').text
    except Exception:
        title = "N/A"

    # Short description: text of every <p> inside the 'short-desc' span,
    # one paragraph per line. find_element raises when the span is absent,
    # so no separate existence check is needed.
    try:
        span_element = driver.find_element(By.XPATH, "//span[contains(@class, 'short-desc')]")
        short_desc = "\n".join(p.text for p in span_element.find_elements(By.TAG_NAME, "p"))
    except Exception:
        short_desc = "N/A"

    # Full description: text of every <div> inside the 'tabBody' element.
    try:
        div_element = driver.find_element(By.CLASS_NAME, 'tabBody')
        full_desc = "\n".join(div.text for div in div_element.find_elements(By.TAG_NAME, "div"))
    except Exception:
        full_desc = "N/A"

    return {"Title": title, "Short_Desc": short_desc, "Full_desc": full_desc}


# Start a Chrome WebDriver session.
driver = webdriver.Chrome()

# Scraped product records; created before the try block so the name exists
# for the later processing steps even if navigation fails.
data_list = []

try:
    # Open the search-result page.
    driver.get("https://www.hktvmall.com/hktv/zh/search_a?keyword=%E9%A6%99%E6%B0%B4&page=0")

    # Explicit-wait helper with a 10 second timeout.
    wait = WebDriverWait(driver, 10)

    # Position of the next product to open within the result list
    # (used as an XPath position, so it starts at 1).
    index = 1

    # Number of the result page currently being processed.
    web_page_count = 1

    while True:
        try:
            print(f"正在處理第 {web_page_count} 頁")
            # Scroll to the bottom so the whole page gets loaded.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            # Walk through every product on the current page.
            while True:
                try:
                    open_page_btn = driver.find_element(By.XPATH, f'//*[@id="algolia-search-result-container"]/div/div/span[{index}]')

                    # Remember the current windows so we can verify that the
                    # click really opened a new one. Without this check,
                    # window_handles[-1] could be the search page itself and
                    # the close() below would shut the main window.
                    main_window = driver.current_window_handle
                    handles_before = driver.window_handles
                    open_page_btn.click()
                    wait.until(EC.new_window_is_opened(handles_before))
                    driver.switch_to.window(driver.window_handles[-1])

                    try:
                        # Give the product page a moment to render.
                        time.sleep(1)
                        data_list.append(_scrape_product_detail(driver))
                    finally:
                        # Always close the product window and return to the
                        # search results, even if scraping was interrupted.
                        driver.close()
                        driver.switch_to.window(main_window)

                    # Move on to the next product.
                    index += 1

                except Exception as e:
                    # Any failure here is treated as "no more products on
                    # this page" (it may also be a genuine error; the
                    # message prints the cause).
                    print(f"第 {web_page_count} 頁已經處理完所有產品或出現錯誤：", e)
                    break

            web_page_count += 1

            # Restart product numbering for the next page.
            index = 1

            # Find the next-page button; stop when it is missing or disabled.
            try:
                nextPageButton = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(@id, 'paginationMenu_nextBtn')]")))
                # get_attribute may return None when the attribute is absent.
                if "disabled" in (nextPageButton.get_attribute("class") or ""):
                    print("已經到達最後一頁")
                    break
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                nextPageButton.click()
                time.sleep(3)
                print("Next page GO!")

            except Exception:
                print("已經到達最後一頁")
                break

        except Exception as e:
            print("已經處理完所有頁面或出現錯誤：", e)
            break
finally:
    # Close the browser no matter how the scrape ended.
    driver.quit()

# Convert the scraped records to a DataFrame. The column list is given
# explicitly so that an empty scrape still produces the expected columns
# instead of a column-less frame that raises KeyError on the next line.
data_df = pd.DataFrame(data_list, columns=["Title", "Short_Desc", "Full_desc"])

# Merge both descriptions into a single text field (missing values become
# empty strings) so the note classifier only has to look at one column.
data_df['Combined_Desc'] = data_df['Short_Desc'].fillna('') + ' ' + data_df['Full_desc'].fillna('')


# 定義分類函數
def classify_notes(desc):
    """Group the fragrance-note fragments of a description by note tier.

    The description is split on any whitespace (spaces, newlines, tabs),
    because the scraped descriptions join their paragraphs with newlines.
    Each fragment containing a tier keyword is filed under that tier, keeping
    only the text after the last full-width colon ('：'). A fragment is filed
    under at most one tier, checked in the order top, middle, base.

    Args:
        desc: Description text. Any non-string value (None, NaN, ...) is
            treated as "no description".

    Returns:
        dict with the keys '前調' (top), '中調' (middle) and '後調' (base),
        each mapping to a list of str in order of appearance.
    """
    notes = {
        '前調': [],
        '中調': [],
        '後調': []
    }
    if not isinstance(desc, str):
        return notes

    # Tier -> keywords that mark a fragment as belonging to that tier.
    # Order matters: the first matching tier wins.
    tier_keywords = (
        ('前調', ('前調', '初調', '前味')),
        ('中調', ('中調', '中味')),
        ('後調', ('後調', '後味', '基調')),
    )

    for part in desc.split():
        for tier, keywords in tier_keywords:
            if any(keyword in part for keyword in keywords):
                notes[tier].append(part.split('：')[-1])
                break
    return notes

# Run the classifier over every combined description and store the
# resulting dict in the '分類' column.
data_df['分類'] = data_df['Combined_Desc'].apply(classify_notes)

# Show each product title with its classified notes, followed by a blank
# line between products.
for product_title, product_notes in zip(data_df['Title'], data_df['分類']):
    print(f"Title: {product_title}", f"分類: {product_notes}", "", sep="\n")

# Keep only the columns needed for the export. .copy() makes this an
# independent frame, so adding columns below does not write into a slice
# of the previous DataFrame.
data_df = data_df[['Title', '分類']].copy()

# '分類' holds the dict returned by classify_notes, so read each tier's list
# straight from the dict and flatten it into a comma-separated string
# (an empty list becomes an empty string). No string parsing is required.
for note_column in ('前調', '中調', '後調'):
    data_df[note_column] = data_df['分類'].apply(
        lambda notes, key=note_column: ",".join(notes[key])
    )

# The dict column is no longer needed once it has been split out.
data_df = data_df.drop(columns=['分類'])

# Timestamp (current date and time) used to give every export its own file name.
timestamp_format = "%Y%m%d_%H%M%S"
current_time = datetime.now().strftime(timestamp_format)

# Name of the CSV file to write.
new_filename = f'processed_perfumes_{current_time}.csv'

# Write the processed data without the index column; 'utf-8-sig' prepends a
# byte-order mark to the UTF-8 output.
data_df.to_csv(
    new_filename,
    encoding='utf-8-sig',
    index=False,
)

print(f"CSV文件已成功處理並保存為 '{new_filename}'")


Add one more column named `brand`.

In [9]:
import pandas as pd

# Input and output locations.
# NOTE(review): INPUT_CSV is an absolute, machine-specific path; point it at
# the local copy of the file before running on another machine.
INPUT_CSV = '/Users/yautakchan/Desktop/Project/success/classified_notes final.csv'
OUTPUT_CSV = 'classified_notes_with_brands.csv'


def _extract_brand(title):
    """Return the text before the first ' - ' in a title.

    Non-string values (e.g. NaN for a missing title) are returned unchanged
    instead of raising AttributeError.
    """
    if isinstance(title, str):
        return title.split(' - ')[0]
    return title


# Load the CSV file.
df = pd.read_csv(INPUT_CSV)

# Add the brand column: everything before ' - ' in the product title.
df['brand'] = df['Title'].apply(_extract_brand)

# Count products per brand (groupby leaves out rows whose brand is missing),
# then order the brands from most to least frequent.
# NOTE(review): brands are compared case-sensitively, so spellings such as
# "Dior" and "DIOR" are counted as different brands — confirm whether they
# should be merged.
brand_counts = (
    df.groupby('brand')
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
    .reset_index(drop=True)
)

# Dense rank: brands with equal counts share a rank and no rank is skipped.
brand_counts['rank'] = brand_counts['count'].rank(method='dense', ascending=False).astype(int)

# Save the data with the new brand column.
df.to_csv(OUTPUT_CSV, index=False)

# Show the 30 most frequent brands.
print(brand_counts.head(30))



                brand  count  rank
0              Hermès     52     1
1          Dream Skin     46     2
2              Chanel     30     3
3             VERSACE     29     4
4                香薰之家     27     5
5        Blossom By H     24     6
6            BURBERRY     21     7
7                #N/A     20     8
8             BVLGARI     18     9
9               Gucci     17    10
10               Dior     17    10
11       Calvin Klein     17    10
12               全城熱賣     16    11
13           ANNA SUI     15    12
14    Elizabeth Arden     15    12
15                瑟路菲     14    13
16            Foellie     14    13
17              Chloé     13    14
18    Maison Margiela     13    14
19           Diptyque     12    15
20        W.Dressroom     12    15
21              Loewe     12    15
22           TOM FORD     11    16
23     GIORGIO ARMANI     11    16
24  Narciso Rodriguez     11    16
25         Moody Mood     11    16
26               DIOR     11    16
27              COAC