# 筆記
發現 Amazon 購物網站用來切換下一頁的 token，只能取得當前頁面往後 1~2 頁的 URL 資料。如此一來，多線程爬蟲便無法同時對不同分頁發出請求。

既然如此，我們可以將商品類別拆分成盡可能小的細項再進行爬取，讓每個子類別中所有資料呈現的頁數小於 3 頁。儘管對於大類別可能比較無效，但依舊可以增加爬取效率。

其次，對 Amazon 進行 HTTP request 時，對方用來判定請求的標準是 Accept-Language、User-Agent、Cookie 這三個標頭。



https://www.amazon.com/s?k=%E6%89%8B%E6%A9%9F&i=mobile&rh=n%3A7072561011%2Cp_36%3A5000-%2Cp_89%3AMotorola&dc&page=4&language=zh_TW&crid=2FI25L7UURJFL&qid=1707746824&rnid=14674871011&sprefix=%2Caps%2C452&ref=sr_nr_p_36_7

https://www.amazon.com/-/zh_TW/s?k=%E6%89%8B%E6%A9%9F&i=mobile&rh=n%3A2335752011%2Cp_89%3AGoogle&dc&language=zh_TW&crid=2FI25L7UURJFL&qid=1707740831&rnid=2528832011&sprefix=%2Caps%2C452&ref=sr_nr_p_89_2&ds=v1%3A6jGo6ILVHiAtZr3yRDtYYfDe7B0QEcSQ3CFbbZBVylo

https://www.amazon.com/s?k=%E6%89%8B%E6%A9%9F&i=mobile&rh=n%3A2335752011%2Cp_89%3AMotorola&dc&page=4&language=zh_TW&crid=2FI25L7UURJFL&qid=1707740751&rnid=2528832011&sprefix=%2Caps%2C452&ref=sr_pg_3

網址範例 : 
https://www.amazon.com/-/zh_TW/s?k=%E6%89%8B%E6%A9%9F&i=mobile&rh=n%3A7072561011%2Cp_89%3A{品牌名稱}&dc&page={頁數}&language=zh_TW&crid=2FI25L7UURJFL&qid=1707744035&rnid=2528832011&sprefix=%2Caps%2C452&ref=sr_nr_p_89_1&ds=v1%3AuaXq%2Bl5LxI7JYQYIyLFKAtBBH8qDOd6ZzwT6RZsgv%2BA


# 爬取，URL素材資料

In [106]:
# 頁面爬取
from bs4 import BeautifulSoup
import pandas, traceback
import requests

# Search page whose sidebar is read below to collect the brand filter entries.
url = "https://www.amazon.com/-/zh_TW/s?k=%E6%89%8B%E6%A9%9F&i=mobile&rh=n%3A7072561011&dc&language=zh_TW&crid=2FI25L7UURJFL&qid=1707743372&rnid=113334702011&sprefix=%2Caps%2C452&ref=sr_nr_p_n_feature_thirty-nine_browse-bin_5&ds=v1%3Ax2zp%2BgVbZzVa1f%2BghTAstSKozXK%2B%2FFtqHOUytl3PcDY"

# Request headers.
# NOTE(review): the 'Cookie' value looks like a placeholder - confirm it is
# meant to be replaced with a real cookie string before running.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cookie': 'Cookie',
}

# Send the GET request. The timeout keeps the script from hanging forever on
# a stalled connection, and raise_for_status() reports a rejected request
# here rather than as an AttributeError while parsing.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

# Container of the brand filter; every brand entry is a span inside it.
smartphone_brands_level_one = soup.find("div", id="brandsRefinements")
if smartphone_brands_level_one is None:
    raise RuntimeError(
        "brand filter container (div#brandsRefinements) not found in the response"
    )
smartphone_brands = smartphone_brands_level_one.find_all(
    'span', {'data-csa-c-type': 'element', "data-csa-c-slot-id": "nav-pkr"}
)

smartphone_brand_list = []
# Collect the display name and the filter URL of every brand entry.
for smartphone_brand in smartphone_brands:
    # The name span comes in a bold and a regular variant; try bold first.
    label = smartphone_brand.find("span", {"class": "a-size-base a-color-base a-text-bold"})
    if label is None:
        label = smartphone_brand.find("span", {"class": "a-size-base a-color-base"})

    link = smartphone_brand.find("a", {"class": "a-link-normal s-navigation-item"})
    href = link.get("href") if link is not None else None

    # Skip an entry that lacks a name or a link instead of aborting the run.
    if label is None or href is None:
        print("skipped a brand entry without a name or link")
        continue

    smartphone_brand_text = label.text
    smartphone_brand_url = href
    smartphone_brand_list.append({
        "smartphone_brand": smartphone_brand_text,
        "smartphone_brand_url": "https://www.amazon.com" + smartphone_brand_url,
    })

# Explicit columns keep the CSV header present even when no brand was found,
# so the file can still be read back with pandas.read_csv.
smartphone_brand_df = pandas.DataFrame(
    smartphone_brand_list, columns=["smartphone_brand", "smartphone_brand_url"]
)
smartphone_brand_df.to_csv("smartphone_brand_df.csv", index=False)

# 爬取商品內容

In [1]:
def target_carwler(target, smartphone_brand):
    """Extract the fields of one product from a search-result element.

    Every field is best-effort: when the element or attribute a field is read
    from is missing, that field is ``None`` and no exception is raised.
    Unrelated exceptions (for example ``KeyboardInterrupt``) propagate.

    Args:
        target: Element exposing ``find`` and ``find_all``; the caller in this
            file passes BeautifulSoup tags.
        smartphone_brand: Brand label copied unchanged into the result.

    Returns:
        dict with the keys ``smartphone_brand``, ``target_title``,
        ``target_url``, ``target_price``, ``target_star_text`` and
        ``quantity_bought_mouth``.
    """
    # Errors that mean "not present": find() returned None (AttributeError on
    # .text, TypeError on ["href"]) or the tag has no href (KeyError).
    missing = (AttributeError, TypeError, KeyError)

    def _text_or_none(*args, **kwargs):
        # Text of the first element matching the find() arguments, or None.
        try:
            return target.find(*args, **kwargs).text
        except missing:
            return None

    # Product title
    target_title = _text_or_none("span", "a-size-medium a-color-base a-text-normal")

    # Product URL (the href is site-relative, so the host is prepended)
    try:
        target_url = "https://www.amazon.com" + target.find(
            "a",
            "a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal",
        )["href"]
    except missing:
        target_url = None

    # Product price
    target_price = _text_or_none("span", class_="a-offscreen")

    # Product rating text
    target_star_text = _text_or_none("span", class_="a-icon-alt")

    # Purchase-count text: the first secondary span whose text contains
    # "過去" ("past") - presumably the "bought in past month" badge.
    quantity_bought_mouth = None
    try:
        for element in target.find_all("span", class_="a-size-base a-color-secondary"):
            if "過去" in element.text:
                quantity_bought_mouth = element.text
                break
    except missing:
        print(traceback.format_exc())
        quantity_bought_mouth = None

    # Assemble the output record
    result = {
        "smartphone_brand": smartphone_brand,
        "target_title": target_title,
        "target_url": target_url,
        "target_price": target_price,
        # TODO: split this field into average-stars and maximum-stars columns.
        "target_star_text": target_star_text,
        "quantity_bought_mouth": quantity_bought_mouth,
    }
    return result

In [2]:
# 頁面爬取
from bs4 import BeautifulSoup
import pandas, traceback

import requests

# Local import: percent-encoding helper for the brand filter value below.
from urllib.parse import quote

# Example brand-filtered search URL; it is overwritten inside the loop below
# before any request is sent.
url = "https://www.amazon.com/-/zh_TW/s?k=%E6%89%8B%E6%A9%9F&i=mobile&rh=n%3A2335752011%2Cp_89%3AGoogle&dc&language=zh_TW&crid=29W3KAS2ZBRMW&qid=1707734674&rnid=2528832011&sprefix=%E6%89%8B%E6%A9%9F%2Caps%2C341&ref=sr_nr_p_89_1&ds=v1%3ASCKE79iOEdipJDMA5zbSfO25z34mFrCb2N9Tn%2Fe9PVU"

# Request headers.
# NOTE(review): the 'Cookie' value looks like a placeholder - confirm it is
# meant to be replaced with a real cookie string before running.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cookie': 'Cookie',
}

# Brand names collected by the brand-list scrape.
smartphone_brand_df = pandas.read_csv(r"C:\Users\Sigolon1315\Desktop\程式專案包\amzon_crawler\smartphone_brand_df.csv")

result_list = []
for smartphone_brand in smartphone_brand_df["smartphone_brand"]:
    # Percent-encode the brand so a name containing spaces, '&' or other
    # reserved characters cannot break the query string. str() guards
    # against pandas parsing a cell as a number.
    encoded_brand = quote(str(smartphone_brand), safe="")

    for page in range(1, 11):  # fetch at most 10 pages per brand
        try:
            # Build the URL of this brand / page combination.
            url = f"https://www.amazon.com/-/zh_TW/s?k=%E6%89%8B%E6%A9%9F&i=mobile&rh=n%3A7072561011%2Cp_89%3A{encoded_brand}%2Cp_36%3A5000-&dc&page={page}&language=zh_TW&crid=2FI25L7UURJFL&qid=1707744035&rnid=2528832011&sprefix=%2Caps%2C452&ref=sr_nr_p_89_1&ds=v1%3AuaXq%2Bl5LxI7JYQYIyLFKAtBBH8qDOd6ZzwT6RZsgv%2BA"
            print(url)
            # Send the GET request; the timeout prevents one stalled
            # connection from blocking the whole crawl.
            response = requests.get(url, headers=headers, timeout=30)

            # Report which brand / page was rejected, then move on.
            if response.status_code != 200:
                print("-------------------request 問題---------")
                print(smartphone_brand)
                print(page)
                print("---------------------------------------")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')

            # Container of the search results.
            div_level_one = soup.find('div', class_="s-main-slot s-result-list s-search-results sg-row")

            # Candidate product elements inside the container.
            target_information = div_level_one.find_all('div', class_="sg-col-inner")

            # Exactly one element is treated as "no more products": stop
            # paging through this brand.
            if len(target_information) == 1:
                break

            # Keep only the elements that contain the product detail column.
            for target in target_information:
                if target.find('div', class_="puisg-col puisg-col-4-of-12 puisg-col-8-of-16 puisg-col-12-of-20 puisg-col-12-of-24 puis-list-col-right") is not None:
                    result = target_carwler(target, smartphone_brand)
                    result_list.append(result)

        # Exception (not a bare except) so Ctrl+C can still stop the crawl;
        # any other failure is reported and the next page is tried.
        except Exception:
            print("-----------------報錯----------------")
            print(traceback.format_exc())
            print(smartphone_brand)
            print(page)
            print("--------------------------")


df = pandas.DataFrame(result_list)
df.to_csv("database.csv", index=False)

https://www.amazon.com/-/zh_TW/s?k=%E6%89%8B%E6%A9%9F&i=mobile&rh=n%3A7072561011%2Cp_89%3AMotorola%2Cp_36%3A5000-&dc&page=1&language=zh_TW&crid=2FI25L7UURJFL&qid=1707744035&rnid=2528832011&sprefix=%2Caps%2C452&ref=sr_nr_p_89_1&ds=v1%3AuaXq%2Bl5LxI7JYQYIyLFKAtBBH8qDOd6ZzwT6RZsgv%2BA
https://www.amazon.com/-/zh_TW/s?k=%E6%89%8B%E6%A9%9F&i=mobile&rh=n%3A7072561011%2Cp_89%3AMotorola%2Cp_36%3A5000-&dc&page=2&language=zh_TW&crid=2FI25L7UURJFL&qid=1707744035&rnid=2528832011&sprefix=%2Caps%2C452&ref=sr_nr_p_89_1&ds=v1%3AuaXq%2Bl5LxI7JYQYIyLFKAtBBH8qDOd6ZzwT6RZsgv%2BA
https://www.amazon.com/-/zh_TW/s?k=%E6%89%8B%E6%A9%9F&i=mobile&rh=n%3A7072561011%2Cp_89%3AMotorola%2Cp_36%3A5000-&dc&page=3&language=zh_TW&crid=2FI25L7UURJFL&qid=1707744035&rnid=2528832011&sprefix=%2Caps%2C452&ref=sr_nr_p_89_1&ds=v1%3AuaXq%2Bl5LxI7JYQYIyLFKAtBBH8qDOd6ZzwT6RZsgv%2BA
https://www.amazon.com/-/zh_TW/s?k=%E6%89%8B%E6%A9%9F&i=mobile&rh=n%3A7072561011%2Cp_89%3AMotorola%2Cp_36%3A5000-&dc&page=4&language=zh_TW&crid=2FI25L7