# Introduction and Set Up

This project is an Alibaba web scraper that retrieves information about items for sale on Alibaba and about their suppliers. The scraped data is then organized into a structured table (a pandas DataFrame that is later exported as a CSV file). Using the requests, BeautifulSoup, and pandas libraries, the script carries out the web scraping and data manipulation tasks.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas

In [3]:
# Request headers passed to requests.get when fetching the example page.
# Open https://httpbin.org/get in your browser to see your own User-Agent,
# then add it to this dict.
custom_headers ={}

# Breakdown of Scraping

In [None]:
# Example URL for an Alibaba product page
url1 = "https://www.alibaba.com/product-detail/Commercial-Coffee-Grinder-Electric-Coffee-Grinder_62551610449.html"

# Request the page with the custom headers. requests applies no timeout by
# default, so set one to keep the cell from hanging if the server never answers.
page = requests.get(url1, headers=custom_headers, timeout=10)

# Display the response; it should read <Response [200]>
page

In [None]:
# Create a BeautifulSoup object for parsing the downloaded page
soup1 = BeautifulSoup(page.content, "html.parser")


# Find the title: the text of the <h1> inside the title container.
# Fall back to a placeholder when the container or its <h1> is missing.
title = soup1.find(class_='product-title-container')
title = title.h1.text.strip() if title and title.h1 else "Title not found"

print(title)


In [None]:
# Find the price; fall back to a placeholder when the page has no 'price'
# element instead of failing on None.getText()
price = soup1.find(class_='price')
price = price.getText() if price else "Price not found"
print(price)

In [None]:
# Supplier name: text of the 'company-name' element ("N/A" when it is missing)
supplier_container = soup1.find(class_='company-name')
suppliers = supplier_container.getText() if supplier_container else "N/A"
print(suppliers)

# Supplier link: href of the <a> inside the same 'company-name' element
supplier_href = supplier_container.a if supplier_container else None
supplier_href = supplier_href.get('href') if supplier_href else "N/A"
print(supplier_href)

# Supplier rating: inside 'company-basicCapacity', locate the 'Store rating'
# label and read the title attribute of the 'attr-content' div that follows it.
# Each step is guarded because any of these elements may be absent.
supplier_rating = soup1.find(class_='company-basicCapacity')
store_rating_div = (
    supplier_rating.find('div', {'class': 'attr-title'}, string='Store rating')
    if supplier_rating else None
)
store_rating_content = (
    store_rating_div.find_next_sibling('div', {'class': 'attr-content'})
    if store_rating_div else None
)
store_rating_value = store_rating_content.get('title') if store_rating_content else "N/A"
print(store_rating_value)

# Alibaba web scraping: condensed function

In [132]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

def extract_attributes_and_more_from_url(url, headers=None, timeout=10):
    """Scrape one Alibaba product page into a single-row DataFrame.

    Args:
        url: Address of the product-detail page to download.
        headers: Optional dict of HTTP headers (for example a User-Agent)
            sent with the request. Defaults to None, which leaves the
            headers to requests.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests.get can block indefinitely.

    Returns:
        A pandas DataFrame with one row and the columns Title, Price, Link,
        Place of Origin, Product, Supplier, Supply Ability, Supplier Link and
        Store Rating. Elements missing from the page are reported with
        placeholder values ("N/A", "Price not found" or None). Returns None
        when the page could not be downloaded.
    """
    # Download the page. A network failure yields None so that a caller
    # looping over many links can skip this one instead of crashing.
    try:
        page = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as error:
        print(f"Request failed for {url}: {error}")
        return None
    soup = BeautifulSoup(page.content, "html.parser")

    # Title: text of the <h1> inside the title container
    title_container = soup.find(class_='product-title-container')
    title_heading = title_container.h1 if title_container else None
    title = title_heading.text.strip() if title_heading else "N/A"

    # Price
    price_container = soup.find(class_='price')
    price = price_container.getText().strip() if price_container else "Price not found"

    # Attributes of interest start out as None and are filled in when found
    extracted_attributes = {attribute: None for attribute in ["Place of Origin", "Product", "Supply Ability"]}

    attribute_layout_container = soup.find(class_='attribute-layout')
    if attribute_layout_container:
        for attribute_item in attribute_layout_container.find_all(class_='attribute-item'):
            left_text = attribute_item.find(class_='left')
            right_text = attribute_item.find(class_='right')

            # An item needs both a label (left) and a value (right)
            if not (left_text and right_text):
                continue

            label = left_text.text.strip()
            value = right_text.text.strip()

            # Substring match: a label containing the attribute name counts,
            # and a later matching item overwrites an earlier one
            for attribute in extracted_attributes:
                if attribute in label:
                    extracted_attributes[attribute] = value
    else:
        print(f"Attribute-layout container not found: {url}")

    # Supplier name and link both come from the 'company-name' element
    suppliers_container = soup.find(class_='company-name')
    suppliers = suppliers_container.getText().strip() if suppliers_container else "N/A"

    supplier_href_container = suppliers_container.a if suppliers_container else None
    supplier_href = supplier_href_container.get('href') if supplier_href_container else "N/A"

    # Store rating: title attribute of the 'attr-content' div that follows the
    # 'Store rating' label; stays "N/A" when any element on that path is missing
    store_rating_value = "N/A"
    supplier_rating_container = soup.find(class_='company-basicCapacity')
    if supplier_rating_container:
        store_rating_div = supplier_rating_container.find('div', {'class': 'attr-title'}, string='Store rating')
        store_rating_content = (
            store_rating_div.find_next_sibling('div', {'class': 'attr-content'})
            if store_rating_div else None
        )
        if store_rating_content:
            store_rating_value = store_rating_content.get('title')

    data = {
        'Title': title,
        'Price': price,
        'Link': url,
        'Place of Origin': extracted_attributes["Place of Origin"],
        'Product': extracted_attributes["Product"],
        'Supplier': suppliers,
        'Supply Ability': extracted_attributes["Supply Ability"],
        'Supplier Link': supplier_href,
        'Store Rating': store_rating_value,
    }

    # One dict -> one row
    return pd.DataFrame([data])


# Example usage: scrape a single Alibaba product page with the function above
url1 = "https://www.alibaba.com/product-detail/Commercial-Coffee-Grinder-Electric-Coffee-Grinder_62551610449.html"
result_df = extract_attributes_and_more_from_url(url1)

# Last expression of the cell, so the notebook displays the resulting DataFrame
result_df



Unnamed: 0,Title,Price,Link,Place of Origin,Product,Supplier,Supply Ability,Supplier Link,Store Rating
0,Commercial Coffee Grinder Electric Coffee Grin...,$100.00/piece,https://www.alibaba.com/product-detail/Commerc...,"Guangdong, China",Coffee Grinder,"Jiangmen OuHuiTe Hardware Products Co., Ltd",10000 Set/Sets per Month,https://wanhuimanufacturing.en.alibaba.com/min...,4.4/5


# Creating a list of product links

In [None]:
from googlesearch import search

# Define the query for Google search. "site:alibaba.com inurl:product-detail"
# restricts the results to Alibaba product pages.
query = "White Vinegar site:alibaba.com inurl:product-detail"

# Perform a Google search (stopping at result 50, pausing 2 s between
# requests) and collect the URLs
alibaba_links = list(search(query, num=50, stop=50, pause=2))

# Display the links, numbered from 1
for i, link in enumerate(alibaba_links, start=1):
    print(f"{i}. {link}")


In [None]:
# Explanation: Since many links may not work, we need to filter and retain only the valid ones.

import requests
from googlesearch import search

# Function to filter valid links
def filter_valid_links(links, timeout=5):
    """Return the links that answer an HTTP GET with status code 200.

    Args:
        links: Iterable of URL strings to check.
        timeout: Seconds to wait for each response before treating the link
            as unreachable. Defaults to 5.

    Returns:
        A list with the links that responded with status 200, in input order.
        Links that raise a requests.RequestException (connection error,
        timeout, invalid URL, ...) or return any other status are left out.
    """
    valid_links = []

    for link in links:
        try:
            response = requests.get(link, timeout=timeout)
        except requests.RequestException:
            # Best effort: a link that cannot be fetched is simply not valid
            continue

        if response.status_code == 200:
            valid_links.append(link)

    return valid_links


# Keep only the search results that filter_valid_links reports as valid
valid_links = filter_valid_links(alibaba_links)


# Create a Pandas DataFrame with a single "Valid Links" column
validlinks_df = pd.DataFrame({"Valid Links": valid_links})
validlinks_df

# Creating the final data set

In [None]:
# Collect one single-row DataFrame per successfully scraped link
scraped_data_list = []

# Iterate through each valid link in the DataFrame
for url in validlinks_df['Valid Links']:
    # Scrape the Alibaba product details from the valid URL
    scraped_data = extract_attributes_and_more_from_url(url)

    # Skip links for which the scraper returned None
    if scraped_data is not None:
        scraped_data_list.append(scraped_data)

# Combine the scraped rows into one DataFrame. pd.concat raises ValueError on
# an empty list, so fall back to an empty DataFrame when nothing was scraped.
if scraped_data_list:
    result_df = pd.concat(scraped_data_list, ignore_index=True)
else:
    result_df = pd.DataFrame()
result_df

# Downloading

In [50]:
# Mount Google Drive at /content/drive
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [139]:
import os

# Destination of the exported dataset on the mounted Google Drive
csv_path = '/content/drive/MyDrive/Colab Notebooks/data/WhiteV_AliB.csv'

# to_csv does not create missing folders, so make sure the target one exists
os.makedirs(os.path.dirname(csv_path), exist_ok=True)

# Save the scraped dataset to a CSV file (no index column)
result_df.to_csv(csv_path, index=False, sep=',')

# Trigger a browser download of the saved file
from google.colab import files
files.download(csv_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>