In [1]:
from selenium import webdriver
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.common.by import By
import re
import pandas as pd

In [4]:
class Scrape:
    driver: webdriver.Chrome

    def __init__(self):
        self.driver = webdriver.Chrome()

    def goto(self, link):
        self.driver.get(link)

    def extract_df(self, name, mapper):
        items = self.driver.find_elements(By.CLASS_NAME, name)
        # Same as using map
        # parsed = []
        # for item in items:
        #     parsed.append(mapper(item))
        parsed = map(mapper, items)
        return pd.DataFrame(parsed)
    
    def extract_pages(self, num_pages, url_mapper, root_class, mapper):
        try:
            dfs = []
            for i in range(num_pages):
                url = url_mapper(i + 1)
                self.goto(url)
                df = self.extract_df(root_class, mapper)
                dfs.append(df)
            return pd.concat(dfs)
        finally:
            self.close()
    
    def close(self):
        self.driver.close()

In [5]:
s = Scrape()

toText = lambda x: x.text

def parseAlibabaElement(el: WebElement):
    price = toText(el.find_element(By.CLASS_NAME, "search-card-e-price-main"))
    title = toText(el.find_element(By.CLASS_NAME, "search-card-e-title"))
    company = next(map(toText, el.find_elements(By.CLASS_NAME, "search-card-e-company")), None)
    review = next(map(toText, el.find_elements(By.CLASS_NAME, "search-card-e-review")), None)
    features = list(map(toText, el.find_elements(By.CLASS_NAME, "search-card-m-sale-features__item")))
    year = next(map(toText, el.find_elements(By.CLASS_NAME, "search-card-e-supplier__year")), None)
    return {
        "price": price,
        "title": title,
        "company": company,
        "review": review,
        "features": features,
        "numFeatures": len(features),
        "year": year
    }

df = s.extract_pages(
    num_pages=10,
    url_mapper=lambda idx: f"https://www.alibaba.com/trade/search?fsb=y&IndexArea=product_en&categoryId=201394605&keywords=business+in+online&page={idx}",
    root_class="J-search-card-wrapper", 
    mapper=parseAlibabaElement
)

In [134]:
def extract_review_data(s):
    if s is not None:
        result = re.match(r"([\d|.]+)\n\/([\d|.]+).+(\d+)\b", s, re.M)
        return result.groups()
    else:
        return None
    
def extract_supplier_data(s):
    if s is not None:
        result = re.match(r"(\d+)\syrs?\n(\w+)\b", s, re.M)
        return result.groups()
    else:
        return None


In [119]:
re.match(r"([\d|.]+)\n\/([\d|.]+).+(\d+)\b", '2.3\n/5.0 (3 reviews)', re.M).groups()
re.match(r"(\d+)\syrs?\n(\w+)\b", '1 yr\nCN Supplier', re.M).groups()

('1', 'CN')

In [135]:
review_tupled = df["review"].apply(extract_review_data)
df["rating"] = review_tupled.apply(lambda x: float(x[0]) if x is not None else None)
df["max_rating"] = review_tupled.apply(lambda x: float(x[1]) if x is not None else None)
df["num_reviews"] = review_tupled.apply(lambda x: int(x[2]) if x is not None else None)

In [136]:
supplier_tupled = df["year"].apply(extract_supplier_data)
df["years_of_experience"] = supplier_tupled.apply(lambda x: int(x[0]) if x is not None else None)
df["supplier_country"] = supplier_tupled.apply(lambda x: str(x[1]) if x is not None else None)

In [137]:
clean_df = df.drop(columns=["review", "year"])

In [140]:
clean_df

Unnamed: 0,price,title,company,features,numFeatures,rating,max_rating,num_reviews,years_of_experience,supplier_country
0,"$2,688.00 - $2,999.00",Business Ideas For Online Business Umbrella Ve...,,[Min. order: 1 set],1,2.3,5.0,3.0,1,CN
1,$11.10 - $13.50,High-Quality to Sell Online 2023 Set With Ther...,Yiwu Tianshu E-Commerce Firm,[Min. order: 100 pieces],1,4.3,5.0,1.0,1,CN
2,$11.10 - $13.20,2023 unique products to sell online High Quali...,Yiwu Fanchuang Trading Firm,[Min. order: 50 pieces],1,,,,1,CN
3,$1.60 - $3.40,Chinese A5 PU Leather Notebook Gift Box Set Cr...,"Yiwu Yucheng Import&Export Co., Ltd.",[Min. order: 100.0 sets],1,4.8,5.0,6.0,2,CN
4,$10.90 - $12.00,Gift promotion customized Business business an...,"Changsha Jinwo Electronic Commerce Co., Ltd.",[Min. order: 100 sets],1,4.8,5.0,4.0,3,CN
...,...,...,...,...,...,...,...,...,...,...
43,$1.23 - $1.89,Magnetic Golf Divot Tool Multi-Functional Golf...,"Deer Gifts Co., Ltd.",[Min. order: 2 pieces],1,5.0,5.0,3.0,1,CN
44,$10.79 - $12.89,New Product Valentine Wedding Gift Immortal Pr...,"Deer Gifts Co., Ltd.",[Min. order: 2 boxes],1,5.0,5.0,3.0,1,CN
45,$0.47 - $0.56,Canvas drawstring bags red onion sack mesh net...,"Deer Gifts Co., Ltd.",[Min. order: 2 pieces],1,5.0,5.0,3.0,1,CN
46,$10.99 - $13.59,Year-end gifts staff employee client notebook ...,"Deer Gifts Co., Ltd.",[Min. order: 2 boxes],1,5.0,5.0,3.0,1,CN


In [84]:
# results_price_range = driver.find_elements(By.CLASS_NAME, "search-card-e-price-main")
# results_price_range

# results_product_name = driver.find_elements(By.CLASS_NAME, "search-card-e-title")
# results_product_name

# results_supplier = driver.find_elements(By.CLASS_NAME, "search-card-e-company")
# results_supplier

# results_supplier_rating = driver.find_elements(By.CLASS_NAME, "search-card-e-review")
# results_supplier_rating

# results_min_order_quantity = driver.find_elements(By.CLASS_NAME, "search-card-m-sale-features__item")
# results_min_order_quantity

# results_supplier_experience = driver.find_elements(By.CLASS_NAME, "search-card-e-supplier__year")
# results_supplier_experience[0].text




# results_complete = driver.find_elements(By.CLASS_NAME, "J-search-card-wrapper")
# results_complete

[<selenium.webdriver.remote.webelement.WebElement (session="7d80b10b0c49b40e3b759f0e483e765f", element="0C8E687589CF117125196EF237889BE0_element_194")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7d80b10b0c49b40e3b759f0e483e765f", element="0C8E687589CF117125196EF237889BE0_element_195")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7d80b10b0c49b40e3b759f0e483e765f", element="0C8E687589CF117125196EF237889BE0_element_196")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7d80b10b0c49b40e3b759f0e483e765f", element="0C8E687589CF117125196EF237889BE0_element_197")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7d80b10b0c49b40e3b759f0e483e765f", element="0C8E687589CF117125196EF237889BE0_element_198")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7d80b10b0c49b40e3b759f0e483e765f", element="0C8E687589CF117125196EF237889BE0_element_199")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7d80b10b0c49b40e3b759f

In [96]:
# print(len(results_price_range))
# print(len(results_product_name))
# print(len(results_supplier))
# print(len(results_supplier_rating))
# print(len(results_min_order_quantity))
# print(len(results_supplier_experience))





48
48
47
44
66
48


In [89]:
# prices_list = []

# # Iterate over each result
# for result_element in results_complete:
#     # Extract prices using the specified class name
#     price_elements = result_element.find_elements(By.CLASS_NAME, "search-card-e-price-main")

#     # Check if any price element is found
#     if price_elements:
#         # Extract the text from the first price element (you can adjust this if there are multiple price elements)
#         price_text = price_elements[0].text

#         # Append the extracted price to the list
#         prices_list.append(price_text)

# # Print the extracted prices
# for idx, price in enumerate(prices_list, start=1):
#     print(f"Price {idx}: {price}")

Price 1: $2,500.00 - $2,980.00
Price 2: $11.10 - $13.50
Price 3: $11.10 - $13.20
Price 4: $10.90 - $12.00
Price 5: $1.60 - $3.40
Price 6: $12.00 - $13.80
Price 7: $6.29 - $7.52
Price 8: $0.45 - $1.88
Price 9: $16.94 - $28.52
Price 10: $2.99 - $5.89
Price 11: $8.70
Price 12: $8.36
Price 13: $1.71 - $6.27
Price 14: $5.63 - $6.98
Price 15: $0.35 - $0.50
Price 16: $0.38 - $0.43
Price 17: $0.95 - $1.10
Price 18: $10.90 - $13.60
Price 19: $1.55 - $1.75
Price 20: $0.12 - $2.99
Price 21: $5.98 - $9.97
Price 22: $5.99 - $12.90
Price 23: $1.40 - $2.50
Price 24: $4.75 - $5.50
Price 25: $3.10 - $4.00
Price 26: $14.40 - $16.00
Price 27: $14.80 - $19.50
Price 28: $16.50 - $21.00
Price 29: $6.53 - $10.87
Price 30: $1.28 - $1.60
Price 31: $0.01 - $0.02
Price 32: $17.75 - $17.99
Price 33: $19.80 - $25.00
Price 34: $6.70 - $9.00
Price 35: $20.13 - $22.90
Price 36: $0.07 - $0.18
Price 37: $0.50 - $1.50
Price 38: $1.17 - $3.50
Price 39: $11.50 - $12.50
Price 40: $2.90 - $5.80
Price 41: $19.78 - $28.77
Pri

In [90]:
# # Function to extract price range
# def extract_price_range(text):
#     # Use regular expression to match price range pattern
#     match = re.search(r'\$\s*([\d.,]+)\s*-\s*\$\s*([\d.,]+)', text)
#     if match:
#         return f"${match.group(1)} - ${match.group(2)}"
#     else:
#         return None


In [91]:
# results_list = []

# # Iterate over each result
# for result_element in results_complete:
#     # Split the text into a list based on newline character '\n'
#     result_text = result_element.text
#     result_list = result_text.split('\n')

#     # Extracting price range using the function
#     price_range = extract_price_range(result_list[1])

#     # Create a dictionary to store the extracted data for the current result
#     result_dict = {
#         "Product Name": result_list[0],
#         "Price Range": price_range,
#         "Min. Order Quantity": next((x for x in result_list if "Min. order" in x), None),
#         "Supplier Name": next((x for x in result_list if "Supplier" in x), None),
#         "Supplier Experience": next((x for x in result_list if "yr" in x), None),
#         "Supplier Location": next((x for x in result_list if "CN" in x or "US" in x), None),
#         "Supplier Ratings": next((x for x in result_list if "/5.0" in x), None)
#     }

#     # Append the dictionary to the list
#     results_list.append(result_dict)

# # Create a pandas DataFrame from the list of dictionaries
# df = pd.DataFrame(results_list)
# df

Unnamed: 0,Product Name,Price Range,Min. Order Quantity,Supplier Name,Supplier Experience,Supplier Location,Supplier Ratings
0,Refinecolor Newest Technology RF-CO1 Gold Coin...,"$2,500.00 - $2,980.00",Min. order: 1 set,CN Supplier,6 yrs,CN Supplier,/5.0 (75)
1,High-Quality to Sell Online 2023 Set With Ther...,$11.10 - $13.50,Min. order: 100 pieces,CN Supplier,1 yr,CN Supplier,/5.0 (1)
2,2023 unique products to sell online High Quali...,$11.10 - $13.20,Min. order: 50 pieces,CN Supplier,1 yr,CN Supplier,
3,Gift promotion customized Business business an...,$10.90 - $12.00,Min. order: 100 sets,CN Supplier,3 yrs,CN Supplier,/5.0 (34)
4,Chinese A5 PU Leather Notebook Gift Box Set Cr...,$1.60 - $3.40,Min. order: 100.0 sets,CN Supplier,2 yrs,CN Supplier,/5.0 (6)
5,Easy ReturnReady to Ship,,Min. order: 10 pieces,CN Supplier,1 yr,CN Supplier,
6,High Quality Lovely Multimodal Gift Box Contai...,$6.29 - $7.52,Min. order: 2 boxes,CN Supplier,1 yr,CN Supplier,/5.0 (3)
7,wholesale high quality promotional business gi...,$0.45 - $1.88,Min. order: 50 pieces,CN Supplier,2 yrs,CN Supplier,/5.0 (10)
8,Easy ReturnReady to Ship,,Min. order: 10 sets,CN Supplier,5 yrs,CN Supplier,/5.0 (21)
9,Promotional ActivitiesNew Business Giveaways,,Min. order: 100 pieces,CN Supplier,3 yrs,CN Supplier,/5.0 (39)
