# Test de acceso a los datos

In [None]:
# Cargamos librerias necesarias
pip install request pandas scikit-learn

In [2]:
import pandas as pd
import requests
import time

In [None]:
# Genres list
# We are going to search by this genres and transform the column subject
MY_GENRES = {
    'fantasy', 'science fiction', 'romance', 'mystery', 'horror',
    'historical fiction', 'biography', 'nonfiction', 'young adult',
    'children', 'thriller', 'dystopian', 'adventure', 'magic realism'
}

def assign_genres(subjects, my_genres=MY_GENRES):
    """ Filter variable subjects. We want to extract genres from it. It is mixed with other type of information
    
    Args:
        subjects: variable with multiple information
        my_genres: list with genres that we are looking for in subjects variable
        
    Returns:
        Normalized list of genres
    """
    assigned_genres= set()
    # Loop MY_GENRES check what is 
    for subject in subjects:
        # Remove blanks and capital letters
        normalized = ''.join(c for c in subject.lower() if c.isalnum() or c == ' ')
        # Check if normalize genre is in list
        if normalized in my_genres:
    
            assigned_genres.add(normalized)

    return list(assigned_genres)

def get_work_ratings(work_id):
    """Obtiene puntuaciones y reseñas"""
    url = f"https://openlibrary.org/works/{work_id}/ratings.json"
    response = requests.get(url)
    return response.json() if response.status_code == 200 else {}

def fetch_books_by_subject(subject_query, max_books=100):
    """ Fecth books based on subject

    Args:
        subject_query: Genre to search
        max_books: Maximum books it is going to search for
    
    Returns:
        Dataframe with all info retrieved
    """
    limit = 100  # Max limit per page
    offset = 0
    books = []
   
    
    while offset < max_books:
        url = f"https://openlibrary.org/subjects/{subject_query}.json?limit={limit}"
        response = requests.get(url)

        if response.status_code == 200:
            # For each book
            data = response.json()
            works = data.get('works', [])
            if not works:
                break  # No more results
            for work in works:
                
                authors = [author.get('name', 'N/A') for author in work.get('authors', [])]
                work_subjects = work.get('subject', [])
                work_id = work.get('key', '').split('/')[-1]
                ratings_url = f"https://openlibrary.org/works/{work_id}/ratings.json"
                ratings_data = requests.get(ratings_url).json() if work_id else {}
                # Create Book Info Structure
                book_info = {
                    #Basic info
                    'openlibrary_key': work.get('get','').split('/')[-1],
                    'isbn': work.get('availability', {}).get('isbn', 'N/A'),
                    'title': work.get('title', 'N/A'),
                    'author': ', '.join(authors) if authors else 'N/A',
                    'first_publish_year': work.get('first_publish_year', 'N/A'),
                    'edition_count': work.get('edition_count', 0),
                    'original_subjects': work_subjects,
                    'assigned_genres': assign_genres(work_subjects),
                    'language': work.get('language', ['N/A'])[0],
                    'number_of_pages': work.get('number_of_pages', 0),
                    'cover_id': work.get('cover_id', 'N/A'), 
                    'availability_status': work.get('availability', {}).get('status', 'N/A'),
                    'last_modified': work.get('last_modified', {}).get('value', 'N/A'),
                    'average_rating': ratings_data.get('summary', {}).get('average', 0),
                    'rating_count': ratings_data.get('summary', {}).get('count', 0),
                    'reviews': [
                        {
                            'text': r.get('review', {}).get('value', ''),
                            'rating': r.get('rating', 0),
                            'date': r.get('created', '').split('T')[0]  
                        } for r in ratings_data.get('reviews', [])[:3] # Max 3 reviews
                    ]
                }

                # Add book to books
                books.append(book_info)

            offset += limit
        else:
            print(f"Error en offset {offset}")
            break

        time.sleep(1)
    return pd.DataFrame(books)


def retrieveBooks(genres, books_per_genre):
    """ Retrieve books information within a list of genres. Books_per_genre indicates a stimated max value to retieve

    Args:
        genres: List with all the genres we want to retrieve books
        books_per_genre: number of books per genre to retrieve
     
    Returns:
        DataFrame with all the information
    """
    # Aux list to append data for each genre
    df_list = []
    # Loop to retrieve info for each genre
    for genre in genres:
        df_list.append(fetch_books_by_subject(genre,books_per_genre))
        time.sleep(3)
    # Transform list to dataframe
    df = pd.concat(df_list, ignore_index=True)
    return df

In [41]:
test_genres = {
    'science fiction'
}
test = retrieveBooks(test_genres,100)

In [43]:
test.to_csv("openlibraryTest.csv")

In [16]:
test_genres = {
    'fantasy', 'science fiction'
}

Books= retrieveBooks(test_genres,100)

NameError: name 're' is not defined

In [17]:
Books

Unnamed: 0,openlibrary_key,isbn,title,author,first_publish_year,edition_count,original_subjects,assigned_genres,language,number_of_pages,cover_id,availability_status,last_modified
0,,,Alice's Adventures in Wonderland,Lewis Carroll,1865,3546,"[Alice (fictitious character : carroll), ficti...","[children, science fiction, fantasy]",,0,10527843,open,
1,,,The Wonderful Wizard of Oz,L. Frank Baum,1899,2052,"[Witches, Toy and movable books, Spanish langu...","[children, science fiction, fantasy]",,0,552443,open,
2,,,Treasure Island,Robert Louis Stevenson,1880,1984,"[Fiction, Treasure Island (Imaginary place), T...","[nonfiction, fantasy, thriller, historical fic...",,0,13859660,open,
3,,,Gulliver's Travels,Jonathan Swift,1726,1809,"[YA, Young adult, Juvenile, Fiction, Fantasy, ...","[adventure, historical fiction, fantasy, young...",,0,12717083,open,
4,,,The Prince,Niccolò Machiavelli,1515,1406,"[Political science, early works to 1800, Machi...",[fantasy],,0,12726168,open,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,,,"Looking Backward, 2000-1887",Edward Bellamy,1888,68,"[Fiction, Utopias, Time travel, Two thousand, ...",[science fiction],,0,8246102,error,
196,,,Dragonflight,Anne McCaffrey,1968,67,"[Pern (Imaginary place), Science Fiction, Fant...","[fantasy, science fiction]",,0,10306696,error,
197,,,Triplanetary,"E. D. Smith, Edward Elmer Smith, Frederick E. ...",1948,67,"[Fiction, general, Science fiction, Juvenile l...",[science fiction],,0,4627686,error,
198,,,Sphere,"Michael Crichton, Jacques Polanis",1980,67,"[space ships, space vehicles, squid, psycholog...",[science fiction],,0,9254423,error,


In [None]:
import requests
import pandas as pd
import re
from tqdm import tqdm 
import time

def enrich_with_google_books(df, delay=1, api_key=None):
    """
    Enriches a books DataFrame with data from Google Books API.

    Args:
        df: DataFrame containing columns 'isbn', 'title', and 'author'
        delay: Seconds between requests (to prevent rate limiting) 
        api_key: Optional Google Books API key
        
    Returns:
        Enriched DataFrame with new columns:
        - google_rating
        - google_ratings_count  
        - google_description
    """
    # Copy Dataframe
    enriched_df = df.copy()
    
    # Init new columns
    enriched_df['google_rating'] = None
    enriched_df['google_ratings_count'] = None
    enriched_df['google_description'] = None
    
    for idx, row in tqdm(enriched_df.iterrows(), total=len(enriched_df)):
        result = None
        # Check if isbn exist
        if pd.notna(row['isbn']) and row['isbn'] != 'N/A':
            result = get_google_books_data(row['isbn'], search_by='isbn', api_key=api_key)
        
        # If not
        if not result and pd.notna(row['title']) and pd.notna(row['author']):
            result = get_google_books_data(
                f"{row['title']} {row['author'].split(',')[0]}", 
                search_by='title',
                api_key=api_key
            )
        
        # Add data
        if result:
            enriched_df.at[idx, 'google_rating'] = result.get('averageRating')
            enriched_df.at[idx, 'google_ratings_count'] = result.get('ratingsCount')
            enriched_df.at[idx, 'google_description'] = result.get('description')
        
        time.sleep(delay)
    
    return enriched_df

def get_google_books_data(query, search_by='isbn', api_key=None):
    """
    Searches the Google Books API by ISBN or title/author.

    Args:
        query: ISBN or search string
        search_by: 'isbn' or 'title'
        api_key: Optional API key
        
    Returns:
        Dictionary with book data or None if not found
    """
    base_url = "https://www.googleapis.com/books/v1/volumes"
    params = {"maxResults": 1}
    
    if api_key:
        params['key'] = api_key
    
    if search_by == 'isbn':
        params['q'] = f"isbn:{query}"
    else:

        clean_query = re.sub(r'\([^)]*\)', '', query).strip()
        params['q'] = clean_query
    
    try:
        response = requests.get(base_url, params=params)
        if response.status_code == 200:
            data = response.json()
            if data.get('totalItems', 0) > 0:
                volume_info = data['items'][0]['volumeInfo']
                return {
                    'averageRating': volume_info.get('averageRating'),
                    'ratingsCount': volume_info.get('ratingsCount'),
                    'description': volume_info.get('description')
                }
    except Exception as e:
        print(f"Error with  {query} in Google Books: {e}")
    
    return None

In [22]:
df_enriched = enrich_with_google_books(Books, delay=1)

100%|██████████| 200/200 [05:32<00:00,  1.66s/it]


In [28]:
df_enriched.to_csv("datostest.csv",index=False)

In [30]:
def get_bookwyrm_reviews(book_title, author, max_reviews=3):
    """
    Extracts reviews from BookWyrm without sentiment analysis
    
    Args:
        book_title (str): Book title 
        author (str): Book author
        max_reviews (int): Maximum reviews to return (default: 3)
        
    Returns:
        list: List of dictionaries with review text and ratings
    """
    # Search for the book
    search_url = f"https://bookwyrm.social/search?q={book_title}+{author}"
    headers = {'User-Agent': 'Mozilla/5.0'}
    
    try:
        response = requests.get(search_url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        book_link = soup.find('a', class_='book-title')
        
        reviews = []
        if book_link:
            book_url = f"https://bookwyrm.social{book_link['href']}"
            book_page = requests.get(book_url, headers=headers)
            book_soup = BeautifulSoup(book_page.text, 'html.parser')
            
            # Extract review elements
            for review in book_soup.select('.review')[:max_reviews]:
                content = review.select_one('.review-content').text.strip()
                rating = len(review.select('.star.full'))  # Count full stars (1-5)
                
                reviews.append({
                    'review_text': content,
                    'review_rating': rating
                })
        
        return reviews
    
    except Exception as e:
        print(f"Error fetching reviews for {book_title}: {str(e)}")
        return []
    
def add_reviews_to_dataframe(df, delay=2):
    """
    Adds BookWyrm reviews to DataFrame
    
    Args:
        df (pd.DataFrame): Input DataFrame with 'title' and 'author' columns
        delay (int): Seconds between requests (default: 2)
        
    Returns:
        pd.DataFrame: Enriched DataFrame with review columns
    """
    # Initialize new columns
    df = df.copy()
    df['reviews'] = None
    df['avg_review_rating'] = None
    
    for idx, row in df.iterrows():
        reviews = get_bookwyrm_reviews(row['title'], row['author'])
        
        if reviews:
            df.at[idx, 'reviews'] = reviews
            df.at[idx, 'avg_review_rating'] = sum(r['review_rating'] for r in reviews)/len(reviews)
        
        time.sleep(delay)  # Respect rate limits
    
    return df


In [31]:
df_enriched2 = add_reviews_to_dataframe(df_enriched)

In [36]:
df_enriched2.avg_review_rating.unique()

array([None], dtype=object)