# Test de acceso a los datos

In [None]:
# Cargamos librerias necesarias
pip install request pandas scikit-learn

In [81]:
import pandas as pd
import requests
import time

In [95]:
# Genres list
# We are going to search by this genres and transform the column subject
MY_GENRES = {
    'fantasy', 'science fiction', 'romance', 'mystery', 'horror',
    'historical fiction', 'biography', 'nonfiction', 'young adult',
    'children', 'thriller', 'dystopian', 'adventure', 'magic realism'
}

def assign_genres(subjects, my_genres=MY_GENRES):
    """ Filter variable subjects. We want to extract genres from it. It may contain actors
    Returns normalized list of genres
    """
    assigned_genres= set()
    # Loop MY_GENRES check what is 
    for subject in subjects:
        # Remove blanks and capital letters
        normalized = ''.join(c for c in subject.lower() if c.isalnum() or c == ' ')
        # Check if normalize genre is in list
        if normalized in my_genres:
    
            assigned_genres.add(normalized)

    return list(assigned_genres)

def fetch_books_by_subject(subject_query, max_books=100):
    """Devuelve un conjunto con todas las claves únicas encontradas en cualquier nivel.
    Fecth books based on subject
    Returns a dataframe with all info retrieved
    """
    limit = 100  # Límite máximo por página
    offset = 0
    books = []
   
    
    while offset < max_books:
        url = f"https://openlibrary.org/subjects/{subject_query}.json?limit={limit}"
        response = requests.get(url)

        if response.status_code == 200:
            # For each book
            data = response.json()
            works = data.get('works', [])
            if not works:
                break  # No hay más resultados
            for work in works:
                
                authors = [author.get('name', 'N/A') for author in work.get('authors', [])]
                work_subjects = work.get('subject', [])
                # Create Book Info Structure
                book_info = {
                    #Basic info
                    'openlibrary_key': work.get('get','').split('/')[-1],
                    'isbn': work.get('availability', {}).get('isbn', 'N/A'),
                    'title': work.get('title', 'N/A'),
                    'author': ', '.join(authors) if authors else 'N/A',
                    'first_publish_year': work.get('first_publish_year', 'N/A'),
                    'edition_count': work.get('edition_count', 0),
                    'original_subjects': work_subjects,
                    'assigned_genres': assign_genres(work_subjects),
                    'language': work.get('language', ['N/A'])[0],
                    'number_of_pages': work.get('number_of_pages', 0),
                    'cover_id': work.get('cover_id', 'N/A'), 
                    'availability_status': work.get('availability', {}).get('status', 'N/A'),
                    'last_modified': work.get('last_modified', {}).get('value', 'N/A')
                }
                # Add book to books
                books.append(book_info)

            offset += limit
        else:
            print(f"Error en offset {offset}")
            break

        time.sleep(1)
    return pd.DataFrame(books)


def retrieveBooks(genres, books_per_genre):
    """ Retrieve books information within a list of genres. Books_per_genre indicates a stimated max value to retieve
    Returns a Dataframe with all the information
    """
    df_list = []
    for genre in genres:
        df_list.append(fetch_books_by_subject(genre,books_per_genre))
        time.sleep(3)
    df = pd.concat(df_list, ignore_index=True)
    return df

In [96]:
test_genres = {
    'fantasy', 'science fiction'
}

Books= retrieveBooks(test_genres,100)

In [None]:
Books

Unnamed: 0,openlibrary_key,isbn,title,author,first_publish_year,edition_count,original_subjects,assigned_genres,language,number_of_pages,cover_id,availability_status,last_modified
0,,,Alice's Adventures in Wonderland,Lewis Carroll,1865,3546,"[Alice (fictitious character : carroll), ficti...","[fantasy, children, science fiction]",,0,10527843,open,
1,,,The Wonderful Wizard of Oz,L. Frank Baum,1899,2052,"[Witches, Toy and movable books, Spanish langu...","[fantasy, children, science fiction]",,0,552443,open,
2,,,Treasure Island,Robert Louis Stevenson,1880,1984,"[Fiction, Treasure Island (Imaginary place), T...","[nonfiction, fantasy, thriller, historical fic...",,0,13859660,open,
3,,,Gulliver's Travels,Jonathan Swift,1726,1809,"[YA, Young adult, Juvenile, Fiction, Fantasy, ...","[biography, fantasy, historical fiction, young...",,0,12717083,error,
4,,,The Prince,Niccolò Machiavelli,1515,1406,"[Political science, early works to 1800, Machi...",[fantasy],,0,12726168,open,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,,9785237041484,Dune Messiah,Frank Herbert,1969,68,"[American Science fiction, Dune (Imaginary pla...",[science fiction],,0,2421405,borrow_available,
196,,2266028790,Dragonflight,Anne McCaffrey,1968,67,"[Pern (Imaginary place), Science Fiction, Fant...","[fantasy, science fiction]",,0,10306696,private,
197,,1416504087,Triplanetary,"E. D. Smith, Edward Elmer Smith, Frederick E. ...",1948,67,"[Fiction, general, Science fiction, Juvenile l...",[science fiction],,0,4627686,borrow_available,
198,,9024525314,Sphere,"Michael Crichton, Jacques Polanis",1980,67,"[space ships, space vehicles, squid, psycholog...",[science fiction],,0,9254423,borrow_available,


In [None]:
Books.isbn

<bound method Series.unique of 0               None
1               None
2               None
3                N/A
4               None
           ...      
195    9785237041484
196       2266028790
197       1416504087
198       9024525314
199       0425033805
Name: isbn, Length: 200, dtype: object>