# **oDCM Resit:** Python Code Script $-$ _Scraping AliExpress: Unveiling Market Trends in Consumer Electronics_

## Step 1: Install Necessary Packages

In [None]:
# Downloading Google Chrome
!wget https://storage.googleapis.com/chrome-for-testing-public/122.0.6261.94/linux64/chrome-linux64.zip

# Unzipping the binary file
!unzip chrome-linux64.zip

# Downloading latest Chromedriver
!wget https://storage.googleapis.com/chrome-for-testing-public/122.0.6261.94/linux64/chromedriver-linux64.zip

# Unzipping the binary file
!unzip chromedriver-linux64.zip

# Installing Selenium and webdriver_manager
!python3 -m pip install selenium webdriver_manager httpx parsel jmespath pandas

# Removing archive files
!rm chrome-linux64.zip  chromedriver-linux64.zip


--2024-04-30 16:34:40--  https://storage.googleapis.com/chrome-for-testing-public/122.0.6261.94/linux64/chrome-linux64.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.141.207, 142.251.162.207, 74.125.134.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.141.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 149157879 (142M) [application/zip]
Saving to: ‘chrome-linux64.zip’


2024-04-30 16:34:41 (141 MB/s) - ‘chrome-linux64.zip’ saved [149157879/149157879]

Archive:  chrome-linux64.zip
  inflating: chrome-linux64/ABOUT    
  inflating: chrome-linux64/MEIPreload/manifest.json  
  inflating: chrome-linux64/MEIPreload/preloaded_data.pb  
  inflating: chrome-linux64/chrome   
  inflating: chrome-linux64/chrome-wrapper  
  inflating: chrome-linux64/chrome_100_percent.pak  
  inflating: chrome-linux64/chrome_200_percent.pak  
  inflating: chrome-linux64/chrome_crashpad_handler  
  inflating: chrome-linux64/chrome_s

## Step 2: Import Required Python Libraries

In [None]:
# Import required libraries
import os
import selenium
import json
import httpx
import pandas as pd
import time
from parsel import Selector
from typing import Dict
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

## Step 3: Configure Selenium WebDriver for Google Colab

In [None]:
# Build the Chrome launch options that the WebDriver will be started with.
chrome_options = Options()
for launch_flag in (
    "--headless",  # run Chrome without a visible window (GUI off)
    "--no-sandbox",
    "--window-size=1600,900",
):
    chrome_options.add_argument(launch_flag)

In [None]:
# Locations of the Chrome binary and the ChromeDriver executable.

# Both paths point at the folders unzipped in Step 1 and assume that cell was run
# with /content as the working directory; adjust them for any other setup.
chrome_binary_path = "/content/chrome-linux64/chrome"
chromedriver_path = "/content/chromedriver-linux64/chromedriver"

In [None]:
# Wire the configured paths into Selenium: the service points at the ChromeDriver
# executable, and the options record which Chrome binary to launch.
webdriver_service = Service(executable_path=chromedriver_path)
chrome_options.binary_location = chrome_binary_path

## Step 4: Initialise the Google Chrome WebDriver

In [None]:
# Start the Chrome WebDriver session from the prepared options and driver service.
driver = webdriver.Chrome(
    options=chrome_options,
    service=webdriver_service,
)

## Step 5: Define Target URL (Make Sure to Verify the URL is Correct and Up-to-date)

In [None]:
# Address of the AliExpress consumer-electronics category page to scrape, kept in `url`.
url = "https://www.aliexpress.com/category/44/consumer-electronics.html"

In [None]:
# Open the page whose address is stored in `url` in the Selenium-controlled browser.
driver.get(url)

# Pause for a fixed 5 seconds to give the dynamically loaded content time to appear.
time.sleep(5)

## Step 6: Identify and Extract Product Data

In [None]:
# Empty container for the product records extracted from the AliExpress Consumer Electronics category page.
products_list = list()

In [None]:
# Define the `extract_search(response)` function.
def extract_search(response) -> Dict:
    """Extract the embedded JSON listing data from a search/category page.

    The page carries its data inside a ``<script>`` tag as a JavaScript
    assignment of the form ``_init_data_ = { data: {...} }``. This function
    finds that script, cuts the object literal out with a regular expression
    and decodes it as JSON.

    Args:
        response: Object exposing the page HTML through a ``text`` attribute
            (the scraping loop passes the result of ``httpx.get``).

    Returns:
        The ``["data"]["root"]["fields"]`` sub-dictionary of the decoded payload.

    Raises:
        ValueError: If the page contains no ``_init_data_`` payload, e.g.
            because the site served a different page than expected.
            (``json.JSONDecodeError``, raised when the payload is not valid
            JSON, is also a ``ValueError``.)
        KeyError: If the decoded payload lacks the ``data/root/fields`` path.
    """
    sel = Selector(response.text)

    # Find the script tag that holds the `_init_data_=` assignment.
    script_with_data = sel.xpath('//script[contains(.,"_init_data_=")]')

    # Cut the page data out of the JavaScript variable with a regex.
    matches = script_with_data.re(r'_init_data_\s*=\s*{\s*data:\s*({.+}) }')
    if not matches:
        # Without this guard the failure is an opaque "list index out of range".
        raise ValueError(
            "Could not find the `_init_data_` payload in the page; "
            "the response is not the expected search/category page."
        )

    data = json.loads(matches[0])
    return data['data']['root']['fields']

In [None]:
# Define the `parse_search` function to include "store" details.
def parse_search(response):
    """Convert a search/category page response into flat product records.

    The listing entries are read from ``mods -> itemList -> content`` of the
    data returned by ``extract_search``; each entry becomes one dictionary
    holding the product fields plus the details of the selling store.

    Args:
        response: Page response, passed through to ``extract_search``.

    Returns:
        A list with one dictionary per listed product.
    """
    listing = extract_search(response)["mods"]["itemList"]["content"]

    def build_record(item):
        # The store entry is looked up first, then the fields in output order.
        shop = item["store"]
        return {
            "id": item["productId"],
            "url": f"https://www.aliexpress.com/item/{item['productId']}.html",
            "type": item["productType"],
            "title": item["title"]["displayTitle"],
            "price": item["prices"]["salePrice"]["minPrice"],
            "currency": item["prices"]["salePrice"]["currencyCode"],
            # "trade" is optional; the value is None when it is absent.
            "trade": item.get("trade", {}).get("tradeDesc"),
            # Drop the leading slashes of the image URL.
            "thumbnail": item["image"]["imgUrl"].lstrip("/"),
            "store_url": shop["storeUrl"],
            "store_name": shop["storeName"],
            "store_id": shop["storeId"],
            "store_ali_id": shop["aliMemberId"],
        }

    return [build_record(item) for item in listing]

if __name__ == "__main__":

    # Number of listing pages to request from the category.
    num_pages = 18

    # Accumulates the product records of every page.
    all_product_data = []

    # Request and parse each page in turn.
    # NOTE: the loop rebinds `url`, replacing the value assigned in Step 5.
    for page_num in range(1, num_pages + 1):
        url = f"https://www.aliexpress.com/category/44/consumer-electronics.html?page={page_num}"
        resp = httpx.get(url, follow_redirects=True)

        # Stop with a clear HTTP error for 4xx/5xx responses instead of handing
        # an error page to the parser, where it would fail with a misleading
        # parsing exception.
        resp.raise_for_status()

        product_data = parse_search(resp)
        all_product_data.extend(product_data)

    # Convert the combined records to the ``df_all_product_data`` DataFrame.
    df_all_product_data = pd.json_normalize(all_product_data)

    # Save the DataFrame to ``Resit_AliExpress_Consumer_Electronics_Product_Data.csv`` without the index column.
    df_all_product_data.to_csv("Resit_AliExpress_Consumer_Electronics_Product_Data.csv", index=False)

    # Show the first few rows as a quick sanity check of the scraped data.
    print(df_all_product_data.head())

                 id                                                url  \
0  3256806102778480  https://www.aliexpress.com/item/32568061027784...   
1  3256805975404053  https://www.aliexpress.com/item/32568059754040...   
2  3256805546949310  https://www.aliexpress.com/item/32568055469493...   
3  3256806535734250  https://www.aliexpress.com/item/32568065357342...   
4  3256804176518715  https://www.aliexpress.com/item/32568041765187...   

      type                                              title   price  \
0  natural  X15 Wholesale Tws Earphone Bluetooth Wireless ...    4.86   
1  natural  X15Pro TWS Wireless Bluetooth Earphones Stereo...    0.99   
2  natural  E6S TWS Wireless Bluetooth Headset Waterproof ...    1.99   
3  natural  Air Pro Bluetooth 5.0 Earphone Wireless Headph...    0.99   
4       ad  LiFePO4 48V 150Ah 100Ah 200Ah Battery Pack 600...  666.86   

  currency         trade                                          thumbnail  \
0      USD  10,000+ sold  ae01.alicdn