# In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import json

# Function to scrape data from a single URL
def scrape_url(url, timeout=10):
    """Download *url* and extract its title, text, links and image sources.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so that a
            single unresponsive host cannot stall the whole run.

    Returns:
        A dict with the keys ``'title'``, ``'text'``, ``'links'`` and
        ``'images'``, or ``None`` when the page could not be fetched or
        parsed (the error is printed).  ``'title'`` is ``None`` when the
        page has no ``<title>`` element or it holds no single string;
        ``'links'`` / ``'images'`` list the ``href`` / ``src`` values of
        every ``<a>`` / ``<img>`` tag that has one.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of scraping the
        # server's error page as if it were content.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract relevant information from the webpage.
        # A missing <title> must not throw away the rest of the page.
        title_tag = soup.title
        title_text = title_tag.string if title_tag is not None else None
        # Store a plain str rather than the BeautifulSoup string object.
        title = str(title_text) if title_text is not None else None
        text = soup.get_text()
        links = [link.get('href') for link in soup.find_all('a', href=True)]
        images = [img.get('src') for img in soup.find_all('img', src=True)]

        return {'title': title, 'text': text, 'links': links, 'images': images}
    except Exception as e:
        # Deliberate best-effort boundary: report the failure and let the
        # caller carry on with the remaining URLs.
        print("Error scraping:", url, e)
        return None

# List of URLs to scrape.
# Each page is listed once and without a "#..." fragment: the fragment is a
# browser-side hint that is not sent with the HTTP request, so entries that
# differ only by fragment would fetch (and store) the same page twice.
urls = [
    "https://products.basf.com/global/en/ci/n-vinyl-2-pyrrolidone.html",
    "https://pubchem.ncbi.nlm.nih.gov/compound/N-Vinyl-2-pyrrolidone",
    "https://www.shokubai.co.jp/en/products/detail/nvp/",
    "https://www.sciencedirect.com/topics/pharmacology-toxicology-and-pharmaceutical-science/1-vinyl-2-pyrrolidinone",
    "https://www.ncbi.nlm.nih.gov/books/NBK498761/",
    "https://www.sciencedirect.com/topics/agricultural-and-biological-sciences/polyvinylpyrrolidone",
    "https://adhesives.specialchem.com/product/m-basf-n-vinyl-pyrrolidone-nvp",
    "https://www.science.gov/topicpages/n/n-vinyl+pyrrolidone+nvp",
    "https://shdexiang.en.made-in-china.com/product/tXfQDioPsKVn/China-N-Vinylpyrrolidone-CAS-No-88-12-0-C6h9no.html",
    "https://www.cphi-online.com/nvp-n-vinylpyrrolidone-prod1288298.html",
    "https://www.mdpi.com/2073-4360/11/6/1079",
    # These are all the URLs.
]

# Run the scraper over every URL in list order and keep only the pages that
# produced a result (scrape_url returns None for a page it could not scrape).
scraped_data = [page for page in map(scrape_url, urls) if page]

# Storing the scraped data in JSON format.
# The encoding is set explicitly (as the CSV export already does) instead of
# relying on the platform default, and ensure_ascii=False keeps non-ASCII page
# text readable rather than turning it into \uXXXX escapes.
with open('scraped_data.json', 'w', encoding='utf-8') as outfile:
    json.dump(scraped_data, outfile, indent=4, ensure_ascii=False)

# Storing the scraped data in CSV format: one header row, then one row per
# scraped page. The 'links' and 'images' lists are not strings, so the csv
# module writes them in their str() form.
fieldnames = ['title', 'text', 'links', 'images']
with open('scraped_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(scraped_data)

print("Scraping completed!")


# Output: Scraping completed!
