In [None]:
# Install dependencies from environment.yaml
!conda env update -f environment.yaml --prune

In [1]:
#start
# Cell 1: Load and track CSV data
import pandas as pd
import requests
import json
from datetime import datetime
from reprolab.archive_file import save_compact, read_compact, persistio, upload_to_cloud, download_from_cloud
#end



In [8]:
#start
@persistio(only_local=True)
def get_books():
    books_csv = pd.read_csv('books.csv')
    return books_csv

books = get_books()
#end


[persistio] Function: get_books
[persistio] Hash: 7b78f88836b51f9d1265134a35a776c0
[persistio] Attempting to load from local cache...
[persistio] Local cache miss
[persistio] Cache miss - executing function...
[persistio] Saving result of type DataFrame to cache...
Data saved compactly to reprolab_data/7b78f88836b51f9d1265134a35a776c0.DataFrame.parquet (2810 bytes)
[persistio] Successfully saved to local cache!


In [8]:
#start
for col in books_csv.select_dtypes(include='object').columns:
    books_csv[col] = books_csv[col].str.capitalize()
books_csv.head()
#end

NameError: name 'books_csv' is not defined

In [None]:
#start
# Cell 2: Fetch and track API data
import requests
import pandas as pd

def get_book_details(title):
    url = f"https://openlibrary.org/search.json?title={title}"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        if data['docs']:
            book = data['docs'][0]
            return {
                'isbn': book.get('isbn', ['N/A'])[0],
                'page_count': book.get('number_of_pages_median', 0),
                'first_publish': book.get('first_publish_year', None)
            }
    return {'isbn': 'N/A', 'page_count': 0, 'first_publish': None}

api_data = books_csv['title'].apply(get_book_details)
api_df = pd.DataFrame(api_data.tolist())
print("\nAPI fetched data:")
print(api_df.head())


# Some transformations
api_df = api_df.iloc[:-1]


#end

In [None]:
#start
# Cell 3: Create and track dictionary data
book_categories = {
    "The Great Gatsby": {"genre": "Fiction", "rating": 4.5, "price": 12.99},
    "1984": {"genre": "Dystopian", "rating": 4.8, "price": 15.99},
    "To Kill a Mockingbird": {"genre": "Literary Fiction", "rating": 4.7, "price": 14.99}
}

category_df = pd.DataFrame.from_dict(book_categories, orient='index')
category_df.index.name = 'title'
category_df = category_df.reset_index()
print("\nDictionary data:")
print(category_df.head())

# No new inputs originating from ths cell

#end

In [None]:
#start
# Cell 4: Merge all data sources and track intermediate result
combined_df = pd.concat([books_csv, api_df], axis=1)
final_df = pd.merge(combined_df, category_df, on='title', how='left')
final_df
#end

In [None]:
#start
# Cell 5: Perform transformations
current_year = datetime.now().year
final_df['book_age'] = current_year - final_df['publication_year']
final_df['price_per_page'] = final_df['price'] / final_df['page_count'].replace(0, 1)

def rating_category(rating):
    if rating >= 4.5:
        return 'Excellent'
    elif rating >= 4.0:
        return 'Good'
    else:
        return 'Average'

final_df['rating_category'] = final_df['rating'].apply(rating_category)
final_df['rating_category']
#end

In [None]:
#start
# Cell 7: Final analysis and visualization
print("\nDataset Statistics:")
print(final_df.describe())

genre_summary = final_df.groupby('genre').agg({
    'rating': 'mean',
    'price': 'mean',
    'page_count': 'mean'
}).round(2)

print("\nGenre Summary:")
print(genre_summary)

import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
final_df.plot(kind='scatter', x='book_age', y='price', s=final_df['rating']*100, alpha=0.5)
plt.title('Book Age vs Price (size = rating)')
plt.xlabel('Book Age (years)')
plt.ylabel('Price ($)')
plt.show()
#end

In [None]:
#start

#end