In [3]:
"""Works with OpenLibrary API, we want to improve it with google books api"""

'Works with OpenLibrary API, we want to improve it with google books api'

In [4]:
import pandas as pd 
import numpy as np

In [5]:
# Load the book list; the preview in the next cell shows the columns `Unnamed: 0`, `ISBN` and `book_id`.
# NOTE(review): the previewed ISBN values have fewer than 10 digits (e.g. 2005018), so leading
# zeros appear to have been dropped -- confirm before using them as API lookup keys.
books = pd.read_csv('../data/books_fixed.csv')

In [6]:
# Preview the first five rows of the loaded table.
books.head(5)

Unnamed: 0,ISBN,book_id
0,2005018,1
1,374157065,3
2,399135782,5
3,440234743,18
4,452264464,19


In [7]:
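# NOTE: superseded first draft, kept for reference only. The active version in the next cell
# reads books_fixed.csv, resolves author keys to names and writes extended_books_openlibrary.csv.
# import pandas as pd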
# from requests import Session
# from requests_ratelimiter import LimiterAdapter
# from tqdm import tqdm

# def fetch_book_data():
#     # Set up rate-limited session
#     session = Session()
#     adapter = LimiterAdapter(per_second=5)  # Adjust rate limit as needed
#     session.mount("https://openlibrary.org/", adapter)
    
#     # Read input data
#     books_df = pd.read_csv('../data/books.csv')
    
#     extended_data = []
#     for isbn in tqdm(books_df['ISBN'], desc="Fetching book data"):
#         try:
#             # Fetch book data
#             response = session.get(f"https://openlibrary.org/isbn/{isbn}.json")
#             if response.status_code == 200:
#                 book_data = response.json()
                
#                 # Extract features
#                 features = {
#                     'ISBN': isbn,
#                     'number_of_pages': book_data.get('number_of_pages'),
#                     'genres': ','.join(book_data.get('genres', [])),
#                     'publish_date': book_data.get('publish_date'),
#                     'authors': ','.join([author.get('key', '') for author in book_data.get('authors', [])]),
#                     'publishers': ','.join(book_data.get('publishers', [])),
#                     'languages': ','.join([language.get('key', '') for language in book_data.get('languages', [])]),
#                     'subjects': ','.join(book_data.get('subjects', []))
#                 }
#                 extended_data.append(features)
#             else:
#                 extended_data.append({'ISBN': isbn})
                
#         except Exception as e:
#             print(f"Error with ISBN {isbn}: {e}")
#             extended_data.append({'ISBN': isbn})
    
#     extended_df = pd.DataFrame(extended_data)
#     books_df = books_df.merge(extended_df, on='ISBN', how='left')
#     books_df.to_csv('../data/extended_books.csv', index=False)
#     print("Extended dataset created and saved.")
    
#     session.close()

# fetch_book_data()

In [None]:
import pandas as pd
from requests import Session
from requests_ratelimiter import LimiterAdapter
from tqdm import tqdm

def fetch_author_name(session, author_key, timeout=30):
    """Look up an author's display name on Open Library.

    Args:
        session: Object exposing ``get(url, ...)`` in the style of
            ``requests.Session``; it is used as-is, so any mounted adapters
            (e.g. a rate limiter) apply.
        author_key: Open Library author key including its leading path,
            e.g. ``"/authors/OL123A"``. It is appended directly to
            ``https://openlibrary.org``.
        timeout: Seconds to wait for the server before the request is
            abandoned. Without it a single stalled connection would block
            the caller indefinitely.

    Returns:
        The ``name`` field of the author record, or ``None`` when the key is
        missing/empty, the response status is not 200, or the record has no
        ``name``.

    Raises:
        Whatever ``session.get`` or ``response.json`` raise (connection
        errors, timeouts, invalid JSON); they are not handled here.
    """
    # Book records can list authors without a key; building a URL from None
    # would request ".../orgNone.json", so bail out before touching the network.
    if not author_key:
        return None

    author_url = f"https://openlibrary.org{author_key}.json"
    response = session.get(author_url, timeout=timeout)
    if response.status_code != 200:
        return None
    return response.json().get('name')

def fetch_book_data(input_path='../data/books_fixed.csv',
                    output_path='../data/extended_books_openlibrary.csv',
                    per_second=5,
                    timeout=30):
    """Enrich the book list with Open Library metadata and save it as CSV.

    Every distinct ISBN in ``input_path`` is looked up through
    ``https://openlibrary.org/isbn/<isbn>.json``. The extracted fields
    (page count, genres, publish date, author names, publishers, languages,
    subjects) are left-merged onto the input table by ``ISBN`` and written
    to ``output_path``. ISBNs that cannot be fetched keep their row with
    empty metadata columns.

    Args:
        input_path: CSV file containing at least an ``ISBN`` column.
        output_path: Destination CSV for the merged table.
        per_second: Request rate passed to ``LimiterAdapter`` for
            ``https://openlibrary.org/``.
        timeout: Seconds to wait for each ISBN lookup.

    Returns:
        None. The result is written to ``output_path``.
    """
    books_df = pd.read_csv(input_path)

    # The same author appears on many books; remember resolved names so each
    # author record is requested at most once per run. Failed lookups are not
    # cached, so they are retried on the next book by that author.
    author_names = {}

    def resolve_author(session, author_key):
        if not author_key:
            return None
        if author_key not in author_names:
            name = fetch_author_name(session, author_key)
            if not name:
                return None
            author_names[author_key] = name
        return author_names[author_key]

    extended_data = []
    # The context manager closes the session even if a request loop or the
    # final write raises.
    with Session() as session:
        session.mount("https://openlibrary.org/", LimiterAdapter(per_second=per_second))

        # Duplicate ISBNs would be fetched twice and then multiply rows in the
        # merge below, so each ISBN is looked up once.
        for isbn in tqdm(books_df['ISBN'].drop_duplicates(), desc="Fetching book data"):
            try:
                # ISBNs parsed as integers lose their leading zeros; pad back
                # to the 10 characters of an ISBN-10 for the lookup only. The
                # original value stays the merge key. (ISBN-13 values are
                # already longer than 10 characters and pass through unchanged.)
                lookup_isbn = str(isbn).strip().zfill(10)
                response = session.get(
                    f"https://openlibrary.org/isbn/{lookup_isbn}.json",
                    timeout=timeout,
                )
                if response.status_code == 200:
                    book_data = response.json()

                    authors = [
                        resolve_author(session, author.get('key'))
                        for author in book_data.get('authors', [])
                    ]

                    features = {
                        'ISBN': isbn,
                        'number_of_pages': book_data.get('number_of_pages'),
                        'genres': ','.join(book_data.get('genres', [])),
                        'publish_date': book_data.get('publish_date'),
                        'authors': ','.join(filter(None, authors)),  # Join only non-None authors
                        'publishers': ','.join(book_data.get('publishers', [])),
                        # Language entries look like {'key': '/languages/eng'}; keep only the code.
                        'languages': ','.join([
                            language.get('key', '').replace('/languages/', '')
                            for language in book_data.get('languages', [])
                        ]),
                        'subjects': ','.join(book_data.get('subjects', []))
                    }
                    extended_data.append(features)
                else:
                    extended_data.append({'ISBN': isbn})
            except Exception as e:
                # Best effort: one bad record must not abort a multi-hour run.
                print(f"Error with ISBN {isbn}: {e}")
                extended_data.append({'ISBN': isbn})

    # With an empty input there is nothing to merge (and no 'ISBN' column in
    # the would-be right-hand frame), so the input is written back unchanged.
    if extended_data:
        extended_df = pd.DataFrame(extended_data)
        books_df = books_df.merge(extended_df, on='ISBN', how='left')
    books_df.to_csv(output_path, index=False)
    print("Extended dataset created and saved.")

# Run the Open Library fetch. The progress bar recorded below shows 16599 ISBNs taking about 4.5 hours.
fetch_book_data()

Fetching book data: 100%|██████████| 16599/16599 [4:31:38<00:00,  1.02it/s]   


Extended dataset created and saved.


In [None]:
import pandas as pd
from requests import Session
from tqdm import tqdm
import time
import os

def _google_volume_features(isbn, item):
    """Flatten one Google Books volume resource into a CSV-friendly row."""
    book_data = item.get('volumeInfo', {})
    return {
        'ISBN': isbn,
        'title': book_data.get('title'),
        'subtitle': book_data.get('subtitle'),
        'authors': ','.join(book_data.get('authors', [])),
        'publisher': book_data.get('publisher'),
        'publishedDate': book_data.get('publishedDate'),
        'description': book_data.get('description'),
        'pageCount': book_data.get('pageCount'),
        'maturityRating': book_data.get('maturityRating'),
        'language': book_data.get('language'),
        'categories': ','.join(book_data.get('categories', [])),
        'ratingsCount': book_data.get('ratingsCount'),
        'averageRating': book_data.get('averageRating'),
        # searchInfo is a sibling of volumeInfo on the volume resource, not a
        # child of it; reading it from volumeInfo always produced None.
        'textSnippet': item.get('searchInfo', {}).get('textSnippet')
    }


def fetch_book_data_google(input_path='../data/books_fixed.csv',
                           output_path='../data/extended_books_google.csv',
                           delay=0.5,
                           timeout=30):
    """Fetch Google Books metadata for every ISBN not yet in the output CSV.

    The function is resumable: ISBNs already present in ``output_path`` are
    skipped, and rows collected during this run are appended to the existing
    file. Rows are written even when the run is interrupted, so a later call
    continues where this one stopped.

    An ISBN is recorded when the API answered with status 200: with its
    metadata if a volume was found, or as a row holding only the ISBN if
    none was. ISBNs whose request failed (non-200 status or an exception)
    are NOT recorded, so the next run retries them.

    Args:
        input_path: CSV file containing at least an ``ISBN`` column.
        output_path: CSV file that accumulates the fetched rows.
        delay: Seconds to sleep before each request.
        timeout: Seconds to wait for each response.

    Returns:
        None. Results are written to ``output_path``.
    """
    books_df = pd.read_csv(input_path)

    # Load earlier results, tolerating a missing or column-less file.
    extended_books_df = pd.DataFrame()
    if os.path.exists(output_path):
        try:
            extended_books_df = pd.read_csv(output_path)
        except pd.errors.EmptyDataError:
            pass
    if 'ISBN' in extended_books_df.columns:
        fetched_isbns = set(extended_books_df['ISBN'].dropna())
    else:
        fetched_isbns = set()

    pending = ~books_df['ISBN'].isin(fetched_isbns)
    new_isbns = books_df.loc[pending, 'ISBN'].drop_duplicates()

    extended_data = []
    failed = 0
    try:
        with Session() as session:
            for isbn in tqdm(new_isbns, desc="Fetching book data"):
                time.sleep(delay)
                try:
                    # ISBNs parsed as integers lose their leading zeros; pad
                    # back to ISBN-10 length for the query only. The original
                    # value stays the key written to the CSV.
                    lookup_isbn = str(isbn).strip().zfill(10)
                    response = session.get(
                        f"https://www.googleapis.com/books/v1/volumes?q=isbn:{lookup_isbn}",
                        timeout=timeout,
                    )
                    if response.status_code != 200:
                        print(f"Failed to fetch data for ISBN {isbn}")
                        failed += 1
                        continue

                    # 'items' can be absent even when totalItems is positive,
                    # so test the list itself rather than the counter.
                    items = response.json().get('items') or []
                    if items:
                        extended_data.append(_google_volume_features(isbn, items[0]))
                    else:
                        # Definitive "not found": record it so it is not re-queried.
                        extended_data.append({'ISBN': isbn})
                except Exception as e:
                    # Best effort: keep going, but leave this ISBN unrecorded
                    # so that a later run retries it.
                    print(f"Error with ISBN {isbn}: {e}")
                    failed += 1
    finally:
        # Runs on normal completion and on interruption alike, so partial
        # progress is never lost.
        if extended_data:
            new_extended_df = pd.DataFrame(extended_data)
            if not extended_books_df.empty:
                combined_df = pd.concat([extended_books_df, new_extended_df], ignore_index=True)
            else:
                combined_df = new_extended_df
            combined_df.to_csv(output_path, index=False)
            print("Extended dataset created and saved with Google Books API.")
        else:
            # Writing an empty frame would produce a CSV without columns that
            # read_csv cannot parse on the next run.
            print("No new Google Books records to save.")
        if failed:
            print(f"{failed} ISBNs could not be fetched and will be retried on the next run.")

# Start the Google Books fetch. The last progress line recorded below shows 4210 of 16599 ISBNs after about an hour.
fetch_book_data_google()

Fetching book data:   0%|          | 0/16599 [00:00<?, ?it/s]

Fetching book data:  25%|██▌       | 4210/16599 [1:01:22<3:00:21,  1.14it/s]