In [None]:
pip install beautifulsoup4 requests



In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
import csv
import json

# Common prefix of every record URL scraped by this script
_BIB_BASE_URL = 'https://plus.cobiss.net/cobiss/si/sl/bib/'

# Trailing path segment of each record URL, in scraping order
_RECORD_IDS = (
    '290172928',
    '196613891',
    '208255747',
    '142292227',
    '142306563',
    '142311427',
    '150827779',
    '158763267',
    '129897731',
    '129925379',
    '129980931',
    '48990211',
    '65704707',
    '80536835',
    '38008323',
    '38819075',
    '304017408',
    '299066624',
)

# List of URLs to scrape
urls = [_BIB_BASE_URL + record_id for record_id in _RECORD_IDS]

def save_essential_data_to_json(results, filename='essential_book_data.json'):
    """Write a trimmed-down JSON file with title, author and image URL per book.

    Args:
        results: Iterable of dicts. Entries that contain an 'error' key are
            left out; for the rest, 'record_title', 'record_author' and
            'image_src' are read (a missing key becomes ``null`` in the file).
        filename: Path of the JSON file to (over)write, encoded as UTF-8.

    Returns:
        None. A confirmation line is printed once the file has been written.
    """
    trimmed = [
        {
            'title': entry.get('record_title'),
            'author': entry.get('record_author'),
            'image_url': entry.get('image_src'),
        }
        for entry in results
        if 'error' not in entry
    ]

    # ensure_ascii=False keeps non-ASCII characters readable in the output
    with open(filename, 'w', encoding='utf-8') as out_file:
        json.dump(trimmed, out_file, ensure_ascii=False, indent=2)

    print(f"\nEssential book data saved to {filename}")

# Output list to store results: one dict per URL, holding either the scraped
# fields or an 'error' entry describing why the page could not be processed
results = []

# Headers to mimic a browser visit
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Upper bound (seconds) on how long one request may block. Without an explicit
# timeout, requests waits indefinitely and a single stalled connection would
# hang the whole run.
REQUEST_TIMEOUT = 30

# Pause (seconds) inserted after every request, successful or not
REQUEST_DELAY = 2

for url in urls:
    try:
        print(f"Scraping: {url}")

        # Fetch the page; raise_for_status() turns HTTP error codes into exceptions
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()

        # Parse the HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the target message div
        message_div = soup.find('div', class_='message')

        if message_div:
            # Extract image source and title (None when the tag or attribute is absent)
            img_tag = message_div.find('img', class_='cover')
            img_src = img_tag.get('src') if img_tag else None
            img_title = img_tag.get('title') if img_tag else None

            # Make image URL absolute if it's relative
            if img_src and not img_src.startswith(('http:', 'https:')):
                img_src = urljoin(url, img_src)

            # Extract record title and author, looking each element up only once
            title_div = message_div.find('div', class_='recordTitle')
            record_title = title_div.text.strip() if title_div else None
            author_div = message_div.find('div', class_='recordAuthor')
            record_author = author_div.text.strip() if author_div else None

            # Extract additional details from recordPrompt divs
            details = {}
            for prompt in message_div.find_all('div', class_='recordPrompt'):
                span = prompt.find('span')
                if span:
                    key = span.text.strip()
                    # Strip only the first occurrence of the label so a value
                    # that happens to contain the label text stays intact
                    value = prompt.text.replace(span.text, '', 1).strip(' -;').strip()
                    details[key] = value

            # Add to results. The scraped details are spread first so that a
            # label such as 'url' can never overwrite one of the core fields.
            results.append({
                **details,
                'url': url,
                'image_src': img_src,
                'image_title': img_title,
                'record_title': record_title,
                'record_author': record_author,
            })
        else:
            print(f"Message div not found on {url}")
            results.append({
                'url': url,
                'error': 'Message div not found'
            })

    except Exception as e:
        # Best-effort scraping: record the failure and carry on with the next URL
        print(f"Error scraping {url}: {str(e)}")
        results.append({
            'url': url,
            'error': str(e)
        })

    # Be polite - add delay between requests. This sits outside the try block
    # so the pause also applies after a failed request instead of immediately
    # hitting the server again.
    time.sleep(REQUEST_DELAY)

# Save all data to CSV (original functionality)
if results:
    # The header row is the alphabetically sorted union of every key that
    # appears in any record; DictWriter leaves missing cells empty.
    fieldnames = sorted({key for row in results for key in row})

    with open('book_details.csv', 'w', newline='', encoding='utf-8') as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        csv_writer.writeheader()
        for row in results:
            csv_writer.writerow(row)

    print("\nComplete data saved to book_details.csv")

# Save essential data to JSON (written to the function's default file name)
save_essential_data_to_json(results)

# Print essential results summary by reading back the file that was just written
print("\nEssential Data Summary:")
with open('essential_book_data.json', 'r', encoding='utf-8') as jsonfile:
    essential_data = json.load(jsonfile)

# The file is fully loaded above, so it can be closed before printing
for position, book in enumerate(essential_data, start=1):
    print(
        f"\nBook {position}:",
        f"Title: {book['title']}",
        f"Author: {book['author']}",
        f"Image URL: {book['image_url']}",
        sep="\n",
    )

Scraping: https://plus.cobiss.net/cobiss/si/sl/bib/290172928
Scraping: https://plus.cobiss.net/cobiss/si/sl/bib/196613891
Scraping: https://plus.cobiss.net/cobiss/si/sl/bib/208255747
Scraping: https://plus.cobiss.net/cobiss/si/sl/bib/142292227
Scraping: https://plus.cobiss.net/cobiss/si/sl/bib/142306563
Scraping: https://plus.cobiss.net/cobiss/si/sl/bib/142311427
Scraping: https://plus.cobiss.net/cobiss/si/sl/bib/150827779
Scraping: https://plus.cobiss.net/cobiss/si/sl/bib/158763267
Scraping: https://plus.cobiss.net/cobiss/si/sl/bib/129897731
Scraping: https://plus.cobiss.net/cobiss/si/sl/bib/129925379
Scraping: https://plus.cobiss.net/cobiss/si/sl/bib/129980931
Scraping: https://plus.cobiss.net/cobiss/si/sl/bib/48990211
Scraping: https://plus.cobiss.net/cobiss/si/sl/bib/65704707
Scraping: https://plus.cobiss.net/cobiss/si/sl/bib/80536835
Scraping: https://plus.cobiss.net/cobiss/si/sl/bib/38008323
Scraping: https://plus.cobiss.net/cobiss/si/sl/bib/38819075
Scraping: https://plus.cobiss