In [4]:

!pip install pandas openpyxl beautifulsoup4 requests selenium




In [5]:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time


In [3]:
class Product:
    """Record pairing an MPN with the price found for it.

    Both constructor arguments are stored unchanged on the instance as
    ``mpn`` and ``price``; ``price`` is ``None`` when it is not supplied.
    """

    def __init__(self, mpn, price=None):
        self.mpn, self.price = mpn, price

    def __repr__(self):
        # Same text layout for a single object and for objects shown in a list.
        part_number, current_price = self.mpn, self.price
        return f"Product(MPN={part_number}, Price={current_price})"

In [None]:
def convert_excel_to_csv(input_excel_file, output_csv_file):
    """Read an Excel workbook with pandas and write its contents as CSV.

    Args:
        input_excel_file: Path of the workbook to read (pandas reads its
            default sheet).
        output_csv_file: Path of the CSV file to create or overwrite. The
            DataFrame index is not written.
    """
    # dtype=str keeps every cell as text. Without it pandas infers numeric
    # dtypes, so a numeric-looking part-number column containing a blank cell
    # becomes float64 and is written out as e.g. "12345.0".
    df = pd.read_excel(input_excel_file, dtype=str)
    df.to_csv(output_csv_file, index=False)

In [None]:
def read_mpn_from_csv(file_path):
    """Build a list of ``Product`` objects from the first column of a CSV file.

    The first row is treated as a header and skipped. Rows that are empty, or
    whose first cell is blank after stripping whitespace, are ignored so they
    never become a search term.

    Args:
        file_path: Path of a UTF-8 encoded CSV file.

    Returns:
        A list of ``Product`` instances, one per usable row, in file order.
        An empty file yields an empty list.
    """
    products = []
    with open(file_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        # The None default prevents StopIteration when the file has no rows.
        next(reader, None)
        for row in reader:
            # csv.reader yields [] for a blank line, so guard before indexing.
            mpn = row[0].strip() if row else ""
            if not mpn:
                continue
            products.append(Product(mpn=mpn))
    return products

In [6]:
def scrape_price_for_mpn_raw(mpn, timeout=10):
    """Fetch a search page for ``mpn`` over HTTP and return the price text.

    Args:
        mpn: Search term sent as the ``q`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The stripped text of the first ``<span class="price">`` element, or
        ``None`` when that element is missing or any error occurs (the error
        is printed, not raised).
    """
    url = "https://example.com/search"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    try:
        # Passing the MPN through `params` lets requests URL-encode it, so
        # characters such as '#', '&', '/' or spaces cannot corrupt the query.
        response = requests.get(url, params={'q': mpn}, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        price_element = soup.find('span', class_='price')  # Update with actual structure
        if not price_element:
            print(f"Price element not found for MPN {mpn}")
            return None
        return price_element.text.strip()
    except Exception as e:
        # Deliberately best-effort: one failed lookup must not stop a batch.
        print(f"Error scraping {mpn}: {e}")
        return None

In [7]:
def scrape_price_for_mpn_selenium(mpn):
    """
    Uses Selenium to find the search bar, enter the MPN, open the first search
    result, and read the price from the product page.

    A fresh headless Chrome session is started for every call and is always
    closed before returning.

    Args:
        mpn: Text typed into the site's search box.

    Returns:
        The stripped text of the price element, or ``None`` if any step after
        the browser has started fails (the error is printed, not raised).
        Failures while starting the browser itself propagate to the caller.
    """
    # Imported locally so this cell does not depend on an edit to the
    # notebook's import cell.
    from selenium.webdriver.chrome.service import Service

    # Replace with the URL of the website you are scraping
    url = "https://www.avantorsciences.com/us/en/"

    # Path to your WebDriver executable
    driver_path = "/chromedriver"  # Replace with the actual path to your WebDriver

    # Initialize the WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run browser in headless mode
    options.add_argument('--disable-gpu')  # Disable GPU for headless
    options.add_argument('--no-sandbox')

    # Selenium 4 takes the driver location through a Service object; the old
    # `executable_path=` keyword of webdriver.Chrome is no longer accepted.
    driver = webdriver.Chrome(service=Service(executable_path=driver_path), options=options)

    try:
        # Navigate to the website
        driver.get(url)

        # Wait for the search bar to load
        wait = WebDriverWait(driver, 10)
        search_bar = wait.until(EC.presence_of_element_located((By.NAME, "search")))  # Replace `name="search"` with the actual locator

        # Enter the MPN into the search bar and submit
        search_bar.clear()
        search_bar.send_keys(mpn)
        search_bar.send_keys(Keys.RETURN)

        # Wait until the first result can actually be clicked. The element
        # returned by the wait is used directly rather than looked up again.
        first_result = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "result-item")))  # Replace with the actual class name
        first_result.click()

        # Wait for the product page to show a price and read it
        price_element = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "price")))  # Replace with the actual class for price
        return price_element.text.strip()
    except Exception as e:
        # Deliberately best-effort: one failed lookup must not stop a batch.
        print(f"Error with MPN {mpn}: {e}")
        return None
    finally:
        # Close the browser
        driver.quit()

In [None]:
def scrape_prices(products, scraper=None, delay_seconds=0):
    """Fill in ``price`` on every product by calling a scraper with its ``mpn``.

    Products are updated in place; nothing is returned.

    Args:
        products: Iterable of objects with ``mpn`` and ``price`` attributes.
        scraper: Callable taking an MPN and returning a price (or ``None``).
            Defaults to ``scrape_price_for_mpn_selenium``, so existing calls
            behave as before.
        delay_seconds: Pause inserted between consecutive lookups to avoid
            hammering the target site. The default ``0`` means no pause.
    """
    if scraper is None:
        # Resolved at call time so this cell can be defined in any order.
        scraper = scrape_price_for_mpn_selenium
    for index, product in enumerate(products):
        # Sleep between lookups only, never before the first one.
        if index and delay_seconds > 0:
            time.sleep(delay_seconds)
        product.price = scraper(product.mpn)

In [None]:
def write_prices_to_csv(products, output_file):
    """Write a two-column CSV (``MPN``, ``Price``) with one row per product.

    The file at ``output_file`` is created or overwritten using UTF-8. The
    header row is always written, even when ``products`` is empty.
    """
    header = ["MPN", "Price"]
    with open(output_file, mode='w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        out.writerow(header)
        # The generator is consumed row by row, in the products' own order.
        out.writerows((item.mpn, item.price) for item in products)

In [None]:
def write_prices_to_excel(products, output_file):
    """Write the products to an Excel workbook with ``MPN`` and ``Price`` columns.

    Args:
        products: Iterable of objects with ``mpn`` and ``price`` attributes.
        output_file: Path of the workbook to create or overwrite. The
            DataFrame index is not written.
    """
    data = [{"MPN": product.mpn, "Price": product.price} for product in products]
    # Explicit columns keep the header row even when there are no products,
    # matching write_prices_to_csv, which always writes its header.
    df = pd.DataFrame(data, columns=["MPN", "Price"])
    df.to_excel(output_file, index=False)

In [None]:

# Pipeline driver: runs the whole Excel -> CSV -> scrape -> CSV/Excel flow.
# The steps must run in this order because each one consumes what the previous
# step produced.

# Input and output file paths (all relative to the current working directory).
input_excel_file = "mpns.xlsx"  # Source workbook; replace with your input Excel file
temp_csv_file = "mpns.csv"  # Intermediate CSV written in step 1 and read in step 2
output_csv_file = "prices.csv"  # Result file written in step 4
output_excel_file = "prices.xlsx"  # Result file written in step 5

# Step 1: Convert the Excel workbook to the intermediate CSV
convert_excel_to_csv(input_excel_file, temp_csv_file)
print("Excel converted to CSV.")

# Step 2: Build Product objects from the first column of that CSV
products = read_mpn_from_csv(temp_csv_file)
print("Products read from CSV:", products)

# Step 3: Scrape prices; this updates each product's `price` in place
scrape_prices(products)
print("Prices scraped:", products)

# Step 4: Write the MPN/price pairs to CSV
write_prices_to_csv(products, output_csv_file)
print(f"Prices written to {output_csv_file}")

# Step 5: Write the same MPN/price pairs to Excel
write_prices_to_excel(products, output_excel_file)
print(f"Prices written to {output_excel_file}")
