# 1: Import Libraries

In [None]:
# Import required libraries
import requests
import pandas as pd
import time
import json
from typing import List, Dict

print("Libraries imported successfully!")

# 2: Question 1 - Fetch Books from API

In [2]:
def fetch_books_by_subject(subject: str, limit: int = 100, offset: int = 0) -> dict:
    """
    Fetches books from Open Library API by subject.
    
    Args:
        subject: The subject/topic to search for (e.g., 'science', 'fiction')
        limit: Maximum number of books per request (max 100)
        offset: Starting position for pagination
        
    Returns:
        JSON response as dictionary
    """
    # Construct API URL with parameters
    # Open Library Subjects API endpoint
    base_url = f"https://openlibrary.org/subjects/{subject}.json"
    
    # Set parameters for the request
    params = {
        'limit': limit,      # Number of results per page
        'offset': offset,    # Starting position for pagination
        'details': 'true'    # Get detailed information
    }
    
    # Set headers to identify our application
    headers = {
        'User-Agent': 'Educational-WebScraping-Project/1.0'
    }
    
    # Make GET request to API
    response = requests.get(base_url, params=params, headers=headers, timeout=15)
    
    # Check if request was successful
    response.raise_for_status()
    
    # Return JSON response as dictionary
    return response.json()


# Test the function with sample request
print("QUESTION 1: Fetching JSON content from Open Library API...")
print("=" * 70)

sample_response = fetch_books_by_subject('science', limit=5, offset=0)

print(f"API Response Keys: {list(sample_response.keys())}")
print(f"\nTotal books available in 'science' category: {sample_response.get('work_count', 0)}")
print(f"Books in this response: {len(sample_response.get('works', []))}")

if sample_response.get('works'):
    first_book = sample_response['works'][0]
    print(f"\nFirst book example:")
    print(f"  Title: {first_book.get('title')}")
    print(f"  Authors: {first_book.get('authors', [])}")
    print(f"  First publish year: {first_book.get('first_publish_year', 'N/A')}")
    
    print(f"\nJSON structure of first book (first 500 characters):")
    print(json.dumps(first_book, indent=2)[:500])


QUESTION 1: Fetching JSON content from Open Library API...
API Response Keys: ['key', 'name', 'subject_type', 'solr_query', 'work_count', 'works', 'ebook_count', 'subjects', 'places', 'people', 'times', 'authors', 'publishers', 'languages', 'publishing_history']

Total books available in 'science' category: 100661
Books in this response: 5

First book example:
  Title: Frankenstein or The Modern Prometheus
  Authors: [{'key': '/authors/OL25342A', 'name': 'Mary Shelley'}]
  First publish year: 1818

JSON structure of first book (first 500 characters):
{
  "key": "/works/OL450063W",
  "title": "Frankenstein or The Modern Prometheus",
  "edition_count": 2184,
  "cover_id": 12356249,
  "cover_edition_key": "OL35649409M",
  "subject": [
    "Frankenstein (Fictitious character)",
    "Frankenstein's monster (Fictitious character)",
    "Fiction",
    "Victor Frankenstein (Fictitious character)",
    "Scientists",
    "Monsters",
    "Fiction, horror",
    "Frankenstein (fictitious character)

# 3: Question 2 - Extract Book Information

In [3]:
def extract_book_info(book_data: dict) -> dict:
    """
    Extracts relevant information from a single book entry.
    
    Args:
        book_data: Dictionary containing book information from API
        
    Returns:
        Dictionary with cleaned book information
    """
    # Extract title (required field)
    title = book_data.get('title', 'Unknown Title')
    
    # Extract authors (may be list of dictionaries)
    authors = book_data.get('authors', [])
    # Join author names into single string
    author_names = ', '.join([author.get('name', 'Unknown') for author in authors])
    if not author_names:
        author_names = 'Unknown Author'
    
    # Extract first publish year
    first_publish_year = book_data.get('first_publish_year', 'N/A')
    
    # Extract number of editions available
    edition_count = book_data.get('edition_count', 0)
    
    # Extract subject tags (limited to first 5)
    subjects = book_data.get('subject', [])
    subject_list = ', '.join(subjects[:5]) if subjects else 'N/A'
    
    # Extract Open Library key/identifier
    key = book_data.get('key', 'N/A')
    
    # Construct full URL to book page
    book_url = f"https://openlibrary.org{key}" if key != 'N/A' else 'N/A'
    
    # Extract availability information
    availability = book_data.get('availability', {})
    is_readable = availability.get('is_readable', False)
    is_lendable = availability.get('is_lendable', False)
    
    # Determine availability status
    if is_readable:
        status = 'Readable Online'
    elif is_lendable:
        status = 'Lendable'
    else:
        status = 'Metadata Only'
    
    # Extract cover ID if available
    cover_id = book_data.get('cover_id', None)
    cover_url = f"https://covers.openlibrary.org/b/id/{cover_id}-M.jpg" if cover_id else 'N/A'
    
    # Create dictionary with all extracted information
    book_info = {
        'Title': title,
        'Authors': author_names,
        'First_Publish_Year': first_publish_year,
        'Edition_Count': edition_count,
        'Subjects': subject_list,
        'Availability': status,
        'URL': book_url,
        'Cover_URL': cover_url
    }
    
    return book_info


# Test extraction function
print("Testing book information extraction...")
test_book = sample_response['works'][0]
extracted_info = extract_book_info(test_book)

print("\nExtracted information:")
for key, value in extracted_info.items():
    print(f"  {key}: {value}")


Testing book information extraction...

Extracted information:
  Title: Frankenstein or The Modern Prometheus
  Authors: Mary Shelley
  First_Publish_Year: 1818
  Edition_Count: 2184
  Subjects: Frankenstein (Fictitious character), Frankenstein's monster (Fictitious character), Fiction, Victor Frankenstein (Fictitious character), Scientists
  Availability: Readable Online
  URL: https://openlibrary.org/works/OL450063W
  Cover_URL: https://covers.openlibrary.org/b/id/12356249-M.jpg


# 4: Scrape All Books from Multiple Subjects

In [4]:
def scrape_books_from_multiple_subjects(subjects: List[str], 
                                        books_per_subject: int = 100,
                                        total_target: int = 800) -> List[Dict]:
    """
    Scrapes books from multiple subjects until target is reached.
    
    Args:
        subjects: List of subject names to scrape
        books_per_subject: Number of books to fetch per subject
        total_target: Target number of total books
        
    Returns:
        List of dictionaries containing book information
    """
    all_books = []
    books_collected = 0
    
    print("\nQUESTION 2: Extracting book information from API...")
    print("=" * 70)
    
    # Loop through each subject
    for subject in subjects:
        # Check if we reached target
        if books_collected >= total_target:
            print(f"\nReached target of {total_target} books. Stopping.")
            break
        
        print(f"\nFetching books from subject: '{subject}'...")
        
        # Calculate how many books we still need
        books_needed = total_target - books_collected
        books_to_fetch = min(books_per_subject, books_needed)
        
        # Fetch books in batches (API limit is 100 per request)
        offset = 0
        subject_books = []
        
        while len(subject_books) < books_to_fetch:
            # Calculate batch size (max 100 per request)
            batch_size = min(100, books_to_fetch - len(subject_books))
            
            try:
                # Fetch data from API
                response_data = fetch_books_by_subject(subject, limit=batch_size, offset=offset)
                
                # Extract books from response
                works = response_data.get('works', [])
                
                if not works:
                    print(f"  No more books available for subject '{subject}'")
                    break
                
                # Process each book
                for work in works:
                    book_info = extract_book_info(work)
                    subject_books.append(book_info)
                
                print(f"  Fetched {len(works)} books (offset: {offset})")
                
                # Update offset for next batch
                offset += batch_size
                
                # Add delay to be respectful to API (0.5 seconds between requests)
                time.sleep(0.5)
                
            except Exception as e:
                print(f"  Error fetching books: {str(e)}")
                break
        
        # Add subject books to main list
        all_books.extend(subject_books)
        books_collected = len(all_books)
        
        print(f"  Total books from '{subject}': {len(subject_books)}")
        print(f"  Running total: {books_collected} books")
    
    return all_books


# Define subjects to scrape - diverse topics to get variety
subjects_to_scrape = [
    'science',
    'fiction',
    'history',
    'technology',
    'philosophy',
    'mathematics',
    'biology',
    'programming',
    'literature',
    'psychology'
]

# Execute the scraping process
print("Starting the scraping process...")
print("This may take several minutes depending on your internet connection...")
print("=" * 70)

all_books_data = scrape_books_from_multiple_subjects(
    subjects=subjects_to_scrape,
    books_per_subject=100,
    total_target=800
)

print("\n" + "=" * 70)
print(f"Scraping completed! Total books collected: {len(all_books_data)}")
print("=" * 70)


Starting the scraping process...
This may take several minutes depending on your internet connection...

QUESTION 2: Extracting book information from API...

Fetching books from subject: 'science'...
  Fetched 100 books (offset: 0)
  Total books from 'science': 100
  Running total: 100 books

Fetching books from subject: 'fiction'...
  Fetched 100 books (offset: 0)
  Total books from 'fiction': 100
  Running total: 200 books

Fetching books from subject: 'history'...
  Error fetching books: 500 Server Error: Internal Server Error for url: https://openlibrary.org/subjects/history.json?limit=100&offset=0&details=true
  Total books from 'history': 0
  Running total: 200 books

Fetching books from subject: 'technology'...
  Fetched 100 books (offset: 0)
  Total books from 'technology': 100
  Running total: 300 books

Fetching books from subject: 'philosophy'...
  Fetched 100 books (offset: 0)
  Total books from 'philosophy': 100
  Running total: 400 books

Fetching books from subject: 'mat

# 5: Display Sample Results

In [5]:
# Display first 5 books
print("\nFirst 5 books from the scraped data:")
print("=" * 70)

for i, book in enumerate(all_books_data[:5], 1):
    print(f"\n{i}. {book['Title']}")
    print(f"   Authors: {book['Authors']}")
    print(f"   Year: {book['First_Publish_Year']}")
    print(f"   Editions: {book['Edition_Count']}")
    print(f"   Availability: {book['Availability']}")
    print(f"   Subjects: {book['Subjects'][:80]}...")  # First 80 characters
    print(f"   URL: {book['URL']}")



First 5 books from the scraped data:

1. Frankenstein or The Modern Prometheus
   Authors: Mary Shelley
   Year: 1818
   Editions: 2184
   Availability: Readable Online
   Subjects: Frankenstein (Fictitious character), Frankenstein's monster (Fictitious characte...
   URL: https://openlibrary.org/works/OL450063W

2. Brave New World
   Authors: Aldous Huxley
   Year: 1932
   Editions: 696
   Availability: Lendable
   Subjects: Utopias, Brainwashing, Moral and ethical aspects of Science, Fiction, Science an...
   URL: https://openlibrary.org/works/OL64365W

3. The Invisible Man
   Authors: H. G. Wells
   Year: 0
   Editions: 563
   Availability: Metadata Only
   Subjects: Ciencia-ficción, Classic Literature, Fiction, Mentally ill, Science Fiction & Fa...
   URL: https://openlibrary.org/works/OL52266W

4. De rerum natura
   Authors: Titus Lucretius Carus
   Year: 1486
   Editions: 537
   Availability: Readable Online
   Subjects: Latin Didactic poetry, Ancient Philosophy, Poetry, Transla

# 6: Question 3 - Save to CSV

In [6]:
def save_books_to_csv(books_data: List[Dict], filename: str = 'open_library_books.csv'):
    """
    Saves the scraped book data to a CSV file using pandas.
    
    Args:
        books_data: List of dictionaries containing book information
        filename: Name of output CSV file
    """
    print("\nQUESTION 3: Saving data to CSV file...")
    print("=" * 70)
    
    # Convert list of dictionaries to pandas DataFrame
    df = pd.DataFrame(books_data)
    
    # Save to CSV file
    # index=False: Don't include row numbers
    # encoding='utf-8-sig': Proper encoding for international characters
    df.to_csv(filename, index=False, encoding='utf-8-sig')
    
    print(f"\nData successfully saved to: {filename}")
    print(f"File location: {os.path.abspath(filename)}")
    print(f"Total records: {len(df)}")
    
    return df


# Import os to get file path
import os

# Save the scraped data
df_books = save_books_to_csv(all_books_data, 'open_library_books.csv')

print("\nFile saved successfully!")



QUESTION 3: Saving data to CSV file...

Data successfully saved to: open_library_books.csv
File location: C:\Users\Rade\Desktop\TP\BData\open_library_books.csv
Total records: 800

File saved successfully!


# 7: Data Analysis and Statistics

In [7]:
# Display DataFrame information
print("\nDataFrame Structure:")
print("=" * 70)
df_books.info()

print("\n\nFirst 10 rows of the DataFrame:")
print("=" * 70)
display(df_books.head(10))

print("\n\nBasic Statistics:")
print("=" * 70)
print(f"Total books: {len(df_books)}")
print(f"Columns: {list(df_books.columns)}")
print(f"Unique authors: {df_books['Authors'].nunique()}")

# Filter out N/A values for year statistics
years = df_books[df_books['First_Publish_Year'] != 'N/A']['First_Publish_Year']
if len(years) > 0:
    print(f"Publication year range: {years.min()} to {years.max()}")



DataFrame Structure:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Title               800 non-null    object
 1   Authors             800 non-null    object
 2   First_Publish_Year  800 non-null    int64 
 3   Edition_Count       800 non-null    int64 
 4   Subjects            800 non-null    object
 5   Availability        800 non-null    object
 6   URL                 800 non-null    object
 7   Cover_URL           800 non-null    object
dtypes: int64(2), object(6)
memory usage: 50.1+ KB


First 10 rows of the DataFrame:


Unnamed: 0,Title,Authors,First_Publish_Year,Edition_Count,Subjects,Availability,URL,Cover_URL
0,Frankenstein or The Modern Prometheus,Mary Shelley,1818,2184,"Frankenstein (Fictitious character), Frankenst...",Readable Online,https://openlibrary.org/works/OL450063W,https://covers.openlibrary.org/b/id/12356249-M...
1,Brave New World,Aldous Huxley,1932,696,"Utopias, Brainwashing, Moral and ethical aspec...",Lendable,https://openlibrary.org/works/OL64365W,https://covers.openlibrary.org/b/id/8231823-M.jpg
2,The Invisible Man,H. G. Wells,0,563,"Ciencia-ficción, Classic Literature, Fiction, ...",Metadata Only,https://openlibrary.org/works/OL52266W,https://covers.openlibrary.org/b/id/6419199-M.jpg
3,De rerum natura,Titus Lucretius Carus,1486,537,"Latin Didactic poetry, Ancient Philosophy, Poe...",Readable Online,https://openlibrary.org/works/OL1548597W,https://covers.openlibrary.org/b/id/566208-M.jpg
4,Le Tour du Monde en Quatre-Vingts Jours,Jules Verne,1872,426,"Viajes alrededor del mundo, Translations into ...",Readable Online,https://openlibrary.org/works/OL1100007W,https://covers.openlibrary.org/b/id/6976035-M.jpg
5,Two years before the mast,Richard Henry Dana,1000,337,"Alert (Brig : 1843-1862), Large-type books, Pi...",Readable Online,https://openlibrary.org/works/OL1815550W,https://covers.openlibrary.org/b/id/8245243-M.jpg
6,Houghton Mifflin Science California,Houghton Mifflin Company Staff,2006,301,"Science, Study and teaching (Elementary), Read...",Lendable,https://openlibrary.org/works/OL27468204W,https://covers.openlibrary.org/b/id/12746894-M...
7,The Island of Dr. Moreau,H. G. Wells,1896,295,"Islands, Survival after airplane accidents, sh...",Readable Online,https://openlibrary.org/works/OL381550W,https://covers.openlibrary.org/b/id/968312-M.jpg
8,History,Herodotus,1494,282,"Ancient History, History, Early works to 1800,...",Readable Online,https://openlibrary.org/works/OL15678068W,https://covers.openlibrary.org/b/id/9829028-M.jpg
9,Houghton Mifflin Science Leveled Readers,Houghton Mifflin Company Staff,2005,242,"Science, Juvenile literature, Readers, Science...",Lendable,https://openlibrary.org/works/OL27467443W,https://covers.openlibrary.org/b/id/12917356-M...




Basic Statistics:
Total books: 800
Columns: ['Title', 'Authors', 'First_Publish_Year', 'Edition_Count', 'Subjects', 'Availability', 'URL', 'Cover_URL']
Unique authors: 558
Publication year range: 0 to 2023


# 8: Availability Distribution

In [8]:
# Display availability distribution
print("\nAvailability Distribution:")
print("=" * 70)
availability_counts = df_books['Availability'].value_counts()
print(availability_counts)

# Calculate percentages
print("\nAvailability Percentages:")
print("=" * 70)
availability_percentages = (df_books['Availability'].value_counts(normalize=True) * 100).round(2)
print(availability_percentages)



Availability Distribution:
Availability
Readable Online    335
Metadata Only      271
Lendable           194
Name: count, dtype: int64

Availability Percentages:
Availability
Readable Online    41.88
Metadata Only      33.88
Lendable           24.25
Name: proportion, dtype: float64


# 9: Top Publication Years

In [9]:
# Display top 15 publication years
print("\nTop 15 Publication Years:")
print("=" * 70)

# Filter out N/A values
valid_years = df_books[df_books['First_Publish_Year'] != 'N/A']
year_counts = valid_years['First_Publish_Year'].value_counts().head(15)
print(year_counts)



Top 15 Publication Years:
First_Publish_Year
1800    21
1900    17
1983    16
2003    11
1999    11
1986    11
1991    11
1990    11
1987    11
2004    10
1980    10
2010    10
2009     9
2006     9
1993     9
Name: count, dtype: int64


# 10: Verify CSV File

In [11]:
# Read the CSV file to verify it was saved correctly
print("\nVerifying saved CSV file:")
print("=" * 70)

df_verify = pd.read_csv('open_library_books.csv')
print(f"Successfully read CSV file")
print(f"Rows in file: {len(df_verify)}")
print(f"Columns in file: {len(df_verify.columns)}")
print(f"\nColumn names: {list(df_verify.columns)}")

print("\nFirst 3 rows from CSV file:")
display(df_verify.head(3))

print("\n" + "=" * 70)
print("TP1 COMPLETED !")
print("=" * 70)



Verifying saved CSV file:
Successfully read CSV file
Rows in file: 800
Columns in file: 8

Column names: ['Title', 'Authors', 'First_Publish_Year', 'Edition_Count', 'Subjects', 'Availability', 'URL', 'Cover_URL']

First 3 rows from CSV file:


Unnamed: 0,Title,Authors,First_Publish_Year,Edition_Count,Subjects,Availability,URL,Cover_URL
0,Frankenstein or The Modern Prometheus,Mary Shelley,1818,2184,"Frankenstein (Fictitious character), Frankenst...",Readable Online,https://openlibrary.org/works/OL450063W,https://covers.openlibrary.org/b/id/12356249-M...
1,Brave New World,Aldous Huxley,1932,696,"Utopias, Brainwashing, Moral and ethical aspec...",Lendable,https://openlibrary.org/works/OL64365W,https://covers.openlibrary.org/b/id/8231823-M.jpg
2,The Invisible Man,H. G. Wells,0,563,"Ciencia-ficción, Classic Literature, Fiction, ...",Metadata Only,https://openlibrary.org/works/OL52266W,https://covers.openlibrary.org/b/id/6419199-M.jpg



TP1 COMPLETED !
