In [3]:
# from dotenv import load_dotenv
# import os, json, time
# import pandas as pd
# from sqlalchemy import create_engine, text
# from pymongo import MongoClient
# from pymongo.server_api import ServerApi
# import numpy as np
# import math

# # ----------------- Load environment -----------------
# load_dotenv(override=True)

# mdb_user = os.getenv("MDB_USER")
# mdb_password = os.getenv("MDB_PASSWORD")
# msql_user = os.getenv("MSQL_USER")
# msql_password = os.getenv("MSQL_PASSWORD")

# # ----------------- Load CSV -----------------
# df_users = pd.read_csv("../data/interim/Users_clean.csv", sep = ",")
# df_books = pd.read_csv("../data/interim/Books_clean.csv", sep = ",")
# df_ = pd.read_csv("../data/interim/Users_clean.csv", sep = ",")
# df = pd.read_csv("dias_catalogue.csv")
# df = df.replace('', None)
# df.replace([np.inf, -np.inf], np.nan, inplace=True)

# # ----------------- MongoDB -----------------
# uri = f"mongodb+srv://{mdb_user}:{mdb_password}@cluster0.aoaan0f.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"
# client = MongoClient(uri, server_api=ServerApi('1'))
# db = client[db_name]
# collection = db[col_name]

# # Drop collection safely for repeated runs
# if col_name in db.list_collection_names():
#     collection.drop()

# # Load JSON and insert
# with open("dias_catalogue_filtered.json", "r") as f:
#     stars_data = json.load(f)
# if stars_data:
#     collection.insert_many(stars_data)

# # ----------------- MySQL -----------------
# engine = create_engine(f"mysql+mysqlconnector://{msql_user}:{msql_password}@localhost:3306/")

# # Create database if not exists
# with engine.connect() as conn:
#     conn.execute(text(f"CREATE DATABASE IF NOT EXISTS {db_name}"))
#     conn.commit()

# # Connect to the DB
# engine = create_engine(f"mysql+mysqlconnector://{msql_user}:{msql_password}@localhost:3306/{db_name}")

# # Write DataFrame safely
# df.to_sql(name=col_name, con=engine, if_exists='replace', index=False)

# # Create index for performance (MySQL)
# with engine.connect() as conn:
#     conn.execute(text(f"CREATE INDEX IF NOT EXISTS idx_Feh_Diam ON {col_name}(FeH, Diam_pc)"))
#     conn.commit()

# # ----------------- Define FeH bins -----------------
# minFeH = math.floor(df['FeH'].min())
# maxFeH = math.ceil(df['FeH'].max())
# feh_bins = list(np.arange(minFeH, maxFeH + 0.5, 0.5))  # +0.5 to include last bin

# # ----------------- MongoDB Query -----------------
# # Match & aggregate
# pipeline = [
#     {"$match": {"features.FeH": {"$lt": 0}, "features.Diam_pc": {"$gt": 10}}},
#     {"$bucket": {
#         "groupBy": "$features.FeH",
#         "boundaries": feh_bins,
#         "default": "Other",
#         "output": {
#             "avg_age": {"$avg": "$features.age"},
#             "max_DE": {"$max": "$position.DE_ICRS"},
#             "cluster_count": {"$sum": 1},
#             "rows": {"$push": "$$ROOT"}  # push all rows into an array
#         }
#     }},
#     {"$sort": {"avg_age": -1}}
# ]

# time_start = time.time()
# agg_result = list(collection.aggregate(pipeline))
# time_end = time.time()

# print("\nMongoDB results per bin:")
# for r in agg_result:
#     print(f"Bin {r['_id']}: count={r['cluster_count']}, avg_age={r['avg_age']:.2f}, max_DE={r['max_DE']:.2f}")
#     # Optionally print each row
#     for row in r['rows']:
#         print(row)
# print("MongoDB query time:", time_end - time_start)

# # ----------------- MySQL Query -----------------
# mysql_query = f"""
# SELECT
#     FLOOR(FeH*2)/2 AS FeH_bin,
#     age, DE_ICRS, FeH, Diam_pc
# FROM {col_name}
# WHERE FeH < 0 AND Diam_pc > 10
# ORDER BY FeH_bin, age DESC;
# """

# time_start = time.time()
# with engine.connect() as conn:
#     rows = conn.execute(text(mysql_query)).fetchall()
# time_end = time.time()

# # Group rows per bin and compute summary
# from collections import defaultdict

# bins_dict = defaultdict(list)
# for r in rows:
#     bin_val = float(r[0])
#     bins_dict[bin_val].append(r[1:])  # age, DE_ICRS, FeH, Diam_pc

# print("\nMySQL results per bin:")
# for bin_val, bin_rows in sorted(bins_dict.items(), reverse=True):
#     ages = [row[0] for row in bin_rows]
#     de_vals = [row[1] for row in bin_rows]
#     print(f"Bin {bin_val}: count={len(bin_rows)}, avg_age={np.mean(ages):.2f}, max_DE={np.max(de_vals):.2f}")
#     # Optionally print each row
#     for row in bin_rows:
#         print(row)

# print("MySQL query time:", time_end - time_start)

# User enrichment preview: age category, gender, and geocoding

This section enriches `Users_clean.csv` with:
- `age_category` buckets
- deterministic `gender` assignment (stable per user)
- optional geocoding of `Location` into longitude/latitude (GeoJSON-compatible)

It only previews the enriched DataFrame (and can optionally save a preview CSV) before loading data into MySQL/MongoDB.

In [None]:
from pathlib import Path
import pandas as pd

# Project utilities
from bookrec.utils.demographics import age_to_category, assign_gender
from bookrec.utils.geographic import GeographicTransformer

# Paths
users_csv = Path("../data/interim/Users_clean.csv").resolve()

# Load users
users = pd.read_csv(users_csv)
users.columns = [c.strip().lower() for c in users.columns]  # normalize headers
users = users.rename(columns={"user-id": "user_id"})

# Enrich: age_category and gender
users["age_category"] = users["age"].apply(age_to_category)
users["gender"] = users["user_id"].apply(lambda uid: assign_gender(int(uid)))

# Display age category and gender percentage distribution
print("\n=== Age Category Distribution ===")
print(users["age_category"].value_counts(normalize=True) * 100)
print("\n=== Gender Distribution ===")
print(users["gender"].value_counts(normalize=True) * 100)




=== Age Category Distribution ===
age_category
young-adult    30.054179
40-60          29.572590
30-40          28.348550
60+             6.481389
juvenile        4.820909
child           0.722384
Name: proportion, dtype: float64

=== Gender Distribution ===
gender
female        40.167553
male          40.012040
non-binary    19.820407
Name: proportion, dtype: float64

=== User Distribution by Country ===

Total unique countries: 97
Total users: 19934

Top 20 countries by user count:
country
United States     14307
Canada             1885
United Kingdom      967
Australia           541
Germany             448
Spain               347
Portugal            164
France              157
Netherlands         150
New Zealand         122
Italy               120
Malaysia             83
Switzerland          68
Austria              45
Brazil               39
Finland              39
Singapore            38
Ireland              34
Sweden               33
Philippines          26
Name: count, dtype: in

In [None]:
#lets check if all users have country location info
#first extract location info from user location strings
#Note sometimes the location string is malformed and has more than 3 loaction parts separated by commas,
#  and country is always the last part
def extract_country(location_str):
    if isinstance(location_str, str) and location_str.strip():
        parts = [part.strip() for part in location_str.split(",")]
        return parts[-1]  # last part is country
    return None

# Apply the function to extract countries
users["country"] = users["location"].apply(extract_country)

# Dictionary to normalize country names
country_mapping = {
    # USA variants
    'usa': 'United States',
    'u.s.a.': 'United States',
    'united states': 'United States',
    'united state': 'United States',
    'america': 'United States',
    
    # UK variants
    'united kingdom': 'United Kingdom',
    'u.k.': 'United Kingdom',
    'england': 'United Kingdom',
    'wales': 'United Kingdom',
    'guernsey': 'United Kingdom',
    
    # New Zealand variants
    'new zealand': 'New Zealand',
    'new zealand"': 'New Zealand',
    'nz': 'New Zealand',
    
    # Turkey variants
    'turkey': 'Turkey',
    'turkey"': 'Turkey',
    
    # Portugal variants
    'portugal': 'Portugal',
    'portugal"': 'Portugal',
    
    # UAE variants
    'u.a.e': 'United Arab Emirates',
    'united arab emirates': 'United Arab Emirates',
    
    # Uruguay variants
    'uruguay': 'Uruguay',
    'urugua': 'Uruguay',
    
    # Philippines variants
    'philippines': 'Philippines',
    'phillipines': 'Philippines',
    
    # Trinidad and Tobago
    'tobago': 'Trinidad and Tobago',
    'trinidad and tobago': 'Trinidad and Tobago',
    
    # Spain regional variants
    'galiza': 'Spain',
    'catalonia': 'Spain',
    'euskal herria': 'Spain',
    
    # Myanmar
    'burma': 'Myanmar',
    
    # Invalid/unresolvable locations - assign to USA (most common country)
    '': 'United States',
    '"': 'United States',
    'n/a - on the road': 'United States',
    'far away...': 'United States',
    'universe': 'United States',
    'everywhere and anywhere': 'United States',
    'quit': 'United States',
    'x': 'United States',
    'alachua': 'United States',  # city in USA
    'burlington': 'United States',  # assume USA
}

# Add proper capitalization for countries already correctly named
proper_names = [
    'portugal', 'india', 'germany', 'finland', 'canada', 'romania', 'france',
    'australia', 'malaysia', 'taiwan', 'italy', 'spain', 'ireland', 'iran',
    'hong kong', 'japan', 'switzerland', 'netherlands', 'belgium', 'austria',
    'denmark', 'sweden', 'thailand', 'brazil', 'china', 'argentina',
    'singapore', 'qatar', 'mexico', 'albania', 'moldova', 'iceland', 'andorra',
    'luxembourg', 'south africa', 'slovenia', 'bulgaria', 'zimbabwe', 'iraq',
    'norway', 'lithuania', 'costa rica', 'israel', 'papua new guinea', 'grenada',
    'south korea', 'nepal', 'chile', 'belize', 'poland', 'kenya', 'solomon islands',
    'malta', 'zambia', 'lebanon', 'ecuador', 'czech republic', 'kuwait',
    'cayman islands', 'indonesia', 'laos', 'paraguay', 'russia', 'saudi arabia',
    'cyprus', 'pakistan', 'bermuda', 'dominican republic', 'panama', 'cuba',
    'egypt', 'bahrain', 'slovakia', 'afghanistan', 'haiti', 'hungary', 'peru',
    'guatemala', 'sri lanka', 'honduras', 'madagascar', 'tanzania', 'croatia',
    'puerto rico', 'venezuela', 'barbados', 'angola', 'bahamas'
]

for name in proper_names:
    if name not in country_mapping:
        country_mapping[name] = name.title()

# Normalize country names
users["country"] = users["country"].str.lower().str.strip().map(country_mapping).fillna('United States')

# Display distribution of users per country
print("\n=== User Distribution by Country ===")
country_counts = users["country"].value_counts()
print(f"\nTotal unique countries: {users['country'].nunique()}")
print(f"Total users: {len(users)}\n")
print("Top 20 countries by user count:")
print(country_counts.head(20))
print(f"\n... and {len(country_counts) - 20} more countries" if len(country_counts) > 20 else "")

# Show percentage distribution for top 10
print("\n=== Top 10 Countries (with percentages) ===")
top10 = country_counts.head(10)
for country, count in top10.items():
    percentage = (count / len(users)) * 100
    print(f"{country:25s}: {count:5d} users ({percentage:5.2f}%)")

# Add geolocation data using geopy
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError
import time

print("\n=== Adding Geolocation Data ===")
geolocator = Nominatim(user_agent="book_recommendation_app")

# Create a dictionary to cache country coordinates
country_coords = {}

# Get unique countries
unique_countries = users["country"].unique()

print(f"Fetching coordinates for {len(unique_countries)} unique countries...")

for country in unique_countries:
    try:
        # Add delay to respect API rate limits
        time.sleep(1)
        location = geolocator.geocode(country, exactly_one=True, timeout=10)
        if location and hasattr(location, 'latitude') and hasattr(location, 'longitude'):
            country_coords[country] = (location.latitude, location.longitude)
            print(f"✓ {country}: ({location.latitude:.4f}, {location.longitude:.4f})")
        else:
            print(f"✗ {country}: Could not geocode")
            country_coords[country] = (None, None)
    except (GeocoderTimedOut, GeocoderServiceError) as e:
        print(f"✗ {country}: Error - {e}")
        country_coords[country] = (None, None)
    except Exception as e:
        print(f"✗ {country}: Unexpected error - {e}")
        country_coords[country] = (None, None)

# Apply coordinates to users dataframe using vectorized operations
def get_lat(country):
    coords = country_coords.get(country, (None, None))
    return coords[0]

def get_lon(country):
    coords = country_coords.get(country, (None, None))
    return coords[1]

users["latitude"] = users["country"].apply(get_lat)
users["longitude"] = users["country"].apply(get_lon)

# Display summary
print("\n=== Geolocation Summary ===")
print(f"Total users with coordinates: {users['latitude'].notna().sum()}")
print(f"Users missing coordinates: {users['latitude'].isna().sum()}")


In [5]:
# Load ratings data
df_ratings = pd.read_csv("../data/interim/Ratings_clean.csv", sep=",")
df_ratings.columns = [c.strip().lower() for c in df_ratings.columns]
df_ratings = df_ratings.rename(columns={"user-id": "user_id", "book-rating": "rating"})

print("=== Rating Data Overview ===")
print(f"Total ratings: {len(df_ratings)}")
print(f"Unique users: {df_ratings['user_id'].nunique()}")
print(f"Unique books: {df_ratings['isbn'].nunique()}")
print("\nRating value distribution:")
display(df_ratings['rating'].value_counts().sort_index())

# Analyze ratings per user
user_rating_counts = df_ratings.groupby('user_id').size()

print("\n=== Ratings Distribution Per User ===")
print(f"Mean ratings per user: {user_rating_counts.mean():.2f}")
print(f"Median ratings per user: {user_rating_counts.median():.0f}")
print(f"Min ratings per user: {user_rating_counts.min()}")
print(f"Max ratings per user: {user_rating_counts.max()}")

# Distribution buckets
print("\n=== User Buckets by Rating Count ===")
buckets = [
    (1, 1, "Single rating"),
    (2, 5, "2-5 ratings"),
    (6, 10, "6-10 ratings"),
    (11, 20, "11-20 ratings"),
    (21, 50, "21-50 ratings"),
    (51, 100, "51-100 ratings"),
    (101, float('inf'), "100+ ratings")
]

for min_r, max_r, label in buckets:
    count = ((user_rating_counts >= min_r) & (user_rating_counts <= max_r)).sum()
    pct = (count / len(user_rating_counts)) * 100
    print(f"{label:20s}: {count:6d} users ({pct:5.2f}%)")

# Analyze rating patterns per user
user_stats = df_ratings.groupby('user_id').agg({
    'rating': ['mean', 'std', 'min', 'max', 'count']
}).round(2)
user_stats.columns = ['avg_rating', 'std_rating', 'min_rating', 'max_rating', 'total_ratings']
user_stats = user_stats.reset_index()

print("\n=== Rating Pattern Analysis ===")
print(f"Users with std=0 (same rating always): {(user_stats['std_rating'] == 0).sum()}")
print(f"Users with full range (0-10): {((user_stats['min_rating'] == 0) & (user_stats['max_rating'] == 10)).sum()}")

# Categorize users by rating behavior
def categorize_rater(row):
    """Categorize user rating behavior"""
    if row['total_ratings'] == 1:
        return 'insufficient_data'
    elif row['std_rating'] == 0:
        return 'uniform_rater'
    elif row['avg_rating'] >= 8:
        return 'lenient_rater'
    elif row['avg_rating'] <= 4:
        return 'harsh_rater'
    else:
        return 'moderate_rater'

user_stats['rater_type'] = user_stats.apply(categorize_rater, axis=1)

print("\n=== Rater Type Distribution ===")
rater_distribution = user_stats['rater_type'].value_counts()
for rater_type, count in rater_distribution.items():
    pct = (count / len(user_stats)) * 100
    print(f"{rater_type:20s}: {count:6d} users ({pct:5.2f}%)")

# Merge with user data
user_stats_enriched = users.merge(user_stats, on='user_id', how='left')

# Handle users with no ratings
user_stats_enriched['total_ratings'] = user_stats_enriched['total_ratings'].fillna(0).astype(int)
user_stats_enriched['rater_type'] = user_stats_enriched['rater_type'].fillna('no_ratings')

print("\n=== Users Without Ratings ===")
users_no_ratings = (user_stats_enriched['total_ratings'] == 0).sum()
print(f"Users with no ratings: {users_no_ratings} ({(users_no_ratings/len(users))*100:.2f}%)")

# Display sample of enriched data
print("\n=== Sample Enriched User Data ===")
display(user_stats_enriched[['user_id', 'country', 'age_category', 'total_ratings', 'avg_rating', 'rater_type']].head(10))


# Analyze ratings per book
book_rating_counts = df_ratings.groupby('isbn').size()

print("\n=== Ratings Distribution Per Book ===")
print(f"Mean ratings per book: {book_rating_counts.mean():.2f}")
print(f"Median ratings per book: {book_rating_counts.median():.0f}")
print(f"Min ratings per book: {book_rating_counts.min()}")
print(f"Max ratings per book: {book_rating_counts.max()}")

# Distribution buckets for books
print("\n=== Book Buckets by Rating Count ===")
book_buckets = [
    (1, 1, "Single rating"),
    (2, 5, "2-5 ratings"),
    (6, 10, "6-10 ratings"),
    (11, 20, "11-20 ratings"),
    (21, 50, "21-50 ratings"),
    (51, 100, "51-100 ratings"),
    (101, float('inf'), "100+ ratings")
]

for min_r, max_r, label in book_buckets:
    count = ((book_rating_counts >= min_r) & (book_rating_counts <= max_r)).sum()
    pct = (count / len(book_rating_counts)) * 100
    print(f"{label:20s}: {count:6d} books ({pct:5.2f}%)")

# Analyze rating statistics per book
book_stats = df_ratings.groupby('isbn').agg({
    'rating': ['mean', 'std', 'min', 'max', 'count']
}).round(2)
book_stats.columns = ['avg_rating', 'std_rating', 'min_rating', 'max_rating', 'total_ratings']
book_stats = book_stats.reset_index()

print("\n=== Book Rating Pattern Analysis ===")
print(f"Books with std=0 (same rating always): {(book_stats['std_rating'] == 0).sum()}")
print(f"Books with full range (0-10): {((book_stats['min_rating'] == 0) & (book_stats['max_rating'] == 10)).sum()}")
print(f"Books with high avg (>=8): {(book_stats['avg_rating'] >= 8).sum()}")
print(f"Books with low avg (<=4): {(book_stats['avg_rating'] <= 4).sum()}")

# Display top and bottom rated books
print("\n=== Top 10 Books by Average Rating (min 5 ratings) ===")
top_books = book_stats[book_stats['total_ratings'] >= 5].nlargest(10, 'avg_rating')
display(top_books)

print("\n=== Bottom 10 Books by Average Rating (min 5 ratings) ===")
bottom_books = book_stats[book_stats['total_ratings'] >= 5].nsmallest(10, 'avg_rating')
display(bottom_books)

print("\n=== Most Rated Books (Top 10) ===")
most_rated = book_stats.nlargest(10, 'total_ratings')
display(most_rated)


=== Rating Data Overview ===
Total ratings: 121257
Unique users: 19934
Unique books: 24751

Rating value distribution:


rating
0     78527
1       120
2       236
3       500
4       812
5      4108
6      3211
7      7079
8     10649
9      7415
10     8600
Name: count, dtype: int64


=== Ratings Distribution Per User ===
Mean ratings per user: 6.08
Median ratings per user: 1
Min ratings per user: 1
Max ratings per user: 950

=== User Buckets by Rating Count ===
Single rating       :  10738 users (53.87%)
2-5 ratings         :   5913 users (29.66%)
6-10 ratings        :   1384 users ( 6.94%)
11-20 ratings       :    848 users ( 4.25%)
21-50 ratings       :    645 users ( 3.24%)
51-100 ratings      :    239 users ( 1.20%)
100+ ratings        :    167 users ( 0.84%)

=== Rating Pattern Analysis ===
Users with std=0 (same rating always): 1769
Users with full range (0-10): 2289

=== Rater Type Distribution ===
insufficient_data   :  10738 users (53.87%)
harsh_rater         :   4022 users (20.18%)
moderate_rater      :   2764 users (13.87%)
uniform_rater       :   1769 users ( 8.87%)
lenient_rater       :    641 users ( 3.22%)

=== Users Without Ratings ===
Users with no ratings: 0 (0.00%)

=== Sample Enriched User Data ===


Unnamed: 0,user_id,country,age_category,total_ratings,avg_rating,rater_type
0,51,United States,30-40,1,9.0,insufficient_data
1,56,United States,young-adult,1,7.0,insufficient_data
2,75,United States,30-40,2,0.0,uniform_rater
3,85,United Kingdom,40-60,1,5.0,insufficient_data
4,99,United States,40-60,2,8.0,lenient_rater
5,114,United States,40-60,4,7.5,moderate_rater
6,133,United States,40-60,1,10.0,insufficient_data
7,139,United States,30-40,1,0.0,insufficient_data
8,165,United States,60+,1,0.0,insufficient_data
9,178,United States,young-adult,1,6.0,insufficient_data



=== Ratings Distribution Per Book ===
Mean ratings per book: 4.90
Median ratings per book: 2
Min ratings per book: 1
Max ratings per book: 472

=== Book Buckets by Rating Count ===
Single rating       :  11766 books (47.54%)
2-5 ratings         :   8758 books (35.38%)
6-10 ratings        :   2035 books ( 8.22%)
11-20 ratings       :   1176 books ( 4.75%)
21-50 ratings       :    712 books ( 2.88%)
51-100 ratings      :    213 books ( 0.86%)
100+ ratings        :     91 books ( 0.37%)

=== Book Rating Pattern Analysis ===
Books with std=0 (same rating always): 3455
Books with full range (0-10): 3309
Books with high avg (>=8): 3419
Books with low avg (<=4): 17380

=== Top 10 Books by Average Rating (min 5 ratings) ===


Unnamed: 0,isbn,avg_rating,std_rating,min_rating,max_rating,total_ratings
22118,1551669056,9.6,0.55,9,10,5
21424,0931580587,9.4,1.34,7,10,5
6599,0385324138,9.38,1.19,7,10,8
21367,0920668364,8.89,1.69,6,10,9
18486,0805013407,8.86,1.07,7,10,7
9486,0446391069,8.6,1.52,7,10,5
22730,1566192951,8.6,2.19,5,10,5
22982,157082262X,8.6,2.07,5,10,5
23213,1576739627,8.6,0.89,8,10,5
1535,0091891965,8.57,1.4,6,10,7



=== Bottom 10 Books by Average Rating (min 5 ratings) ===


Unnamed: 0,isbn,avg_rating,std_rating,min_rating,max_rating,total_ratings
46,0006170056,0.0,0.0,0,0,5
149,0020545401,0.0,0.0,0,0,7
342,0060149205,0.0,0.0,0,0,6
422,0060179333,0.0,0.0,0,0,6
713,0060735430,0.0,0.0,0,0,5
1008,0061002798,0.0,0.0,0,0,6
1011,006100331X,0.0,0.0,0,0,7
1071,0061013668,0.0,0.0,0,0,7
1079,0061014230,0.0,0.0,0,0,5
1209,0061087017,0.0,0.0,0,0,8



=== Most Rated Books (Top 10) ===


Unnamed: 0,isbn,avg_rating,std_rating,min_rating,max_rating,total_ratings
3236,312195516,3.97,4.29,0,10,472
4563,345337662,3.55,4.07,0,10,400
8719,440214041,2.49,3.74,0,10,384
4667,345370775,3.23,3.95,0,10,364
9692,446605239,3.13,4.06,0,10,337
11262,452282152,4.05,4.16,0,10,332
3993,316769487,4.64,4.13,0,10,296
8776,440220602,2.73,3.77,0,10,276
9346,446310786,5.09,4.59,0,10,274
4617,345353145,2.77,3.72,0,10,262


In [None]:
#lets check if all users have country location info
#first extract location info from user location strings
#Note sometimes the location string is malformed and has more than 3 loaction parts separated by commas,
#  and country is always the last part
def extract_country(location_str):
    if isinstance(location_str, str) and location_str.strip():
        parts = [part.strip() for part in location_str.split(",")]
        return parts[-1]  # last part is country
    return None

# Apply the function to extract countries
users["country"] = users["location"].apply(extract_country)

# Dictionary to normalize country names
country_mapping = {
    # USA variants
    'usa': 'United States',
    'u.s.a.': 'United States',
    'united states': 'United States',
    'united state': 'United States',
    'america': 'United States',
    
    # UK variants
    'united kingdom': 'United Kingdom',
    'u.k.': 'United Kingdom',
    'england': 'United Kingdom',
    'wales': 'United Kingdom',
    'guernsey': 'United Kingdom',
    
    # New Zealand variants
    'new zealand': 'New Zealand',
    'new zealand"': 'New Zealand',
    'nz': 'New Zealand',
    
    # Turkey variants
    'turkey': 'Turkey',
    'turkey"': 'Turkey',
    
    # Portugal variants
    'portugal': 'Portugal',
    'portugal"': 'Portugal',
    
    # UAE variants
    'u.a.e': 'United Arab Emirates',
    'united arab emirates': 'United Arab Emirates',
    
    # Uruguay variants
    'uruguay': 'Uruguay',
    'urugua': 'Uruguay',
    
    # Philippines variants
    'philippines': 'Philippines',
    'phillipines': 'Philippines',
    
    # Trinidad and Tobago
    'tobago': 'Trinidad and Tobago',
    'trinidad and tobago': 'Trinidad and Tobago',
    
    # Spain regional variants
    'galiza': 'Spain',
    'catalonia': 'Spain',
    'euskal herria': 'Spain',
    
    # Myanmar
    'burma': 'Myanmar',
    
    # Invalid/unresolvable locations - assign to USA (most common country)
    '': 'United States',
    '"': 'United States',
    'n/a - on the road': 'United States',
    'far away...': 'United States',
    'universe': 'United States',
    'everywhere and anywhere': 'United States',
    'quit': 'United States',
    'x': 'United States',
    'alachua': 'United States',  # city in USA
    'burlington': 'United States',  # assume USA
}

# Add proper capitalization for countries already correctly named
proper_names = [
    'portugal', 'india', 'germany', 'finland', 'canada', 'romania', 'france',
    'australia', 'malaysia', 'taiwan', 'italy', 'spain', 'ireland', 'iran',
    'hong kong', 'japan', 'switzerland', 'netherlands', 'belgium', 'austria',
    'denmark', 'sweden', 'thailand', 'brazil', 'china', 'argentina',
    'singapore', 'qatar', 'mexico', 'albania', 'moldova', 'iceland', 'andorra',
    'luxembourg', 'south africa', 'slovenia', 'bulgaria', 'zimbabwe', 'iraq',
    'norway', 'lithuania', 'costa rica', 'israel', 'papua new guinea', 'grenada',
    'south korea', 'nepal', 'chile', 'belize', 'poland', 'kenya', 'solomon islands',
    'malta', 'zambia', 'lebanon', 'ecuador', 'czech republic', 'kuwait',
    'cayman islands', 'indonesia', 'laos', 'paraguay', 'russia', 'saudi arabia',
    'cyprus', 'pakistan', 'bermuda', 'dominican republic', 'panama', 'cuba',
    'egypt', 'bahrain', 'slovakia', 'afghanistan', 'haiti', 'hungary', 'peru',
    'guatemala', 'sri lanka', 'honduras', 'madagascar', 'tanzania', 'croatia',
    'puerto rico', 'venezuela', 'barbados', 'angola', 'bahamas'
]

for name in proper_names:
    if name not in country_mapping:
        country_mapping[name] = name.title()

# Normalize country names
users["country"] = users["country"].str.lower().str.strip().map(country_mapping).fillna('United States')

# Display distribution of users per country
print("\n=== User Distribution by Country ===")
country_counts = users["country"].value_counts()
print(f"\nTotal unique countries: {users['country'].nunique()}")
print(f"Total users: {len(users)}\n")
print("Top 20 countries by user count:")
print(country_counts.head(20))
print(f"\n... and {len(country_counts) - 20} more countries" if len(country_counts) > 20 else "")

# Show percentage distribution for top 10
print("\n=== Top 10 Countries (with percentages) ===")
top10 = country_counts.head(10)
for country, count in top10.items():
    percentage = (count / len(users)) * 100
    print(f"{country:25s}: {count:5d} users ({percentage:5.2f}%)")

# Add geolocation data using geopy
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError
import time

print("\n=== Adding Geolocation Data ===")
geolocator = Nominatim(user_agent="book_recommendation_app")

# Create a dictionary to cache country coordinates
country_coords = {}

# Get unique countries
unique_countries = users["country"].unique()

print(f"Fetching coordinates for {len(unique_countries)} unique countries...")

for country in unique_countries:
    try:
        # Add delay to respect API rate limits
        time.sleep(1)
        location = geolocator.geocode(country, exactly_one=True, timeout=10)
        if location and hasattr(location, 'latitude') and hasattr(location, 'longitude'):
            country_coords[country] = (location.latitude, location.longitude)
            print(f"✓ {country}: ({location.latitude:.4f}, {location.longitude:.4f})")
        else:
            print(f"✗ {country}: Could not geocode")
            country_coords[country] = (None, None)
    except (GeocoderTimedOut, GeocoderServiceError) as e:
        print(f"✗ {country}: Error - {e}")
        country_coords[country] = (None, None)
    except Exception as e:
        print(f"✗ {country}: Unexpected error - {e}")
        country_coords[country] = (None, None)

# Apply coordinates to users dataframe using vectorized operations
def get_lat(country):
    coords = country_coords.get(country, (None, None))
    return coords[0]

def get_lon(country):
    coords = country_coords.get(country, (None, None))
    return coords[1]

users["latitude"] = users["country"].apply(get_lat)
users["longitude"] = users["country"].apply(get_lon)

# Display summary
print("\n=== Geolocation Summary ===")
print(f"Total users with coordinates: {users['latitude'].notna().sum()}")
print(f"Users missing coordinates: {users['latitude'].isna().sum()}")


In [None]:
import ast
import pandas as pd

# Hierarchical category system
category_hierarchy = {
    'Fiction': {
        'General Fiction': ['Fiction', 'Fiction in English', 'American fiction', 'English fiction', 
                           'Canadian fiction', 'Australian fiction', 'Irish fiction', 'Scottish fiction',
                           'New Zealand fiction', 'Domestic fiction', 'Domestic fiction, American',
                           'Literary fiction', 'Popular literature', 'Experimental fiction',
                           'Dutch fiction', 'Spanish fiction', 'Argentine fiction', 'Czech fiction',
                           'Indic fiction (English)', 'Italian fiction', 'Japanese fiction',
                           'Botswanan fiction (English)', 'Russian fiction', 'Samoan fiction',
                           'Fiction in English, 1900-1945 - Texts', 'Classical fiction',
                           'Musical fiction', 'Film novelizations', 'Gothic novels',
                           'Amerikaanse fiksie', 'French fiction'],
        
        'Mystery & Crime': ['Detective and mystery stories', 'Detective and mystery stories, American',
                           'Detective and mystery stories, English', 'Detective and mystery stories, New Zealand',
                           'Crime', 'Crime and criminals', 'Mystery & Detective Stories', 'Noir fiction',
                           'Code and cipher stories', 'Detectives', 'Private investigators',
                           'Private investigation', 'Murder', 'Homicide', 'Serial killers',
                           'Criminal psychology', 'Criminal investigation', 'True Crime',
                           'Forensic psychology', 'Criminals', 'Brigands and robbers',
                           'Mafia', 'Organized crime', 'Commercial crimes',
                           # Fictional detectives
                           'Poirot, Hercule (Fictitious character)', 'Marple, Jane (Fictitious character)',
                           'Holmes, Sherlock (Fictitious character)', 'Beck, Martin (Fictitious character)',
                           'Bosch, Harry (Fictitious character)', 'Alleyn, Roderick (Fictitious character)',
                           'Appleby, John, Sir (Fictitious character)', 'Battle, Superintendent (Fictitious character)',
                           'Cadfael, Brother (Fictitious character)', 'Campion, Albert (Fictitious character)',
                           'Balzic, Mario (Fictitious character)', 'Chimera, Hertzan (Fictitious character)',
                           'Ashton, Carol (Fictitious character)'],
        
        'Science Fiction & Fantasy': ['Science fiction', 'Science fiction, American', 'Science fiction, British',
                                     'Science fiction, Australian', 'Science fiction, French', 'Fantasy fiction',
                                     'Fantasy fiction, American', 'Fantasy', 'Dystopian fiction',
                                     'Time travel', 'Interplanetary voyages', 'Life on other planets',
                                     'Extraterrestial beings', 'Human-alien encounters', 'Alien abduction',
                                     'Science fiction.', 'Science fiction, American.', 'Fantasy games',
                                     'Dungeons and Dragons (Game)', 'Dragons', 'Dragons in literature',
                                     'Elves', 'Dwarfs (Persons)', 'Wizards', 'Magic realism (Literature)',
                                     'Unicorns', 'Fairies', 'Fantastic fiction, Cuban', 'Fourth dimension',
                                     'Jupiter (Planet)', 'Moon', 'Rama (Imaginary space vehicle)',
                                     'Discworld (Imaginary place)', 'Amber (Imaginary place)',
                                     'Genetic engineering', 'Robots', 'Future life',
                                     # Fantasy elements
                                     'Goddesses', 'Gods, Chinese', 'Grail', 'Knights and knighthood',
                                     'Swordsmen', 'Castles', 'Chivalry',
                                     # Fictional characters
                                     'Tarzan (Fictitious character)', 'DiGriz, James Bolivar (Fictitious character)',
                                     'Mulder, Fox (Fictitious character)', 'Worthing, Jason (Fictitious character)',
                                     'Ayla (Fictitious character)', 'Crusoe, Robinson (Fictitious character)',
                                     'Bolan, Mack (Fictitious character)', 'Saint (Fictitious character)',
                                     'Falstaff, John, Sir (Fictitious character)', 'Finch, Septimus (Fictitious character)',
                                     # Imaginary places
                                     'Bellehaven (Wash. : Imaginary place)', 'Belleview (Wash. : Imaginary place)',
                                     'Cato, Quintus Licinius (Fictitious character)'],
        
        'Horror': ['Horror tales', 'Horror tales, American', 'Horror tales, English', 'Horror',
                  'Ghost stories', 'Ghost stories, American', 'Ghosts', 'Vampires',
                  'Horror tales, American.', 'Horror stories.', 'Devil', 'Demonology',
                  'Hell', 'Superstition'],
        
        'Romance': ['Love stories', 'Romance fiction', 'Romances', 'Historical romance',
                   'Romantic suspense fiction', 'Erotic stories', 'Erotic stories, American',
                   'Erotic stories, Australian', 'Erotic literature', 'Love', 'Courtship',
                   'Marriage proposals', 'First loves', 'Dating (Social customs)',
                   'Chick lit', 'juvenile romance', 'Love, Maternal.',
                   'Adultery', 'Married people', 'Bridegrooms',
                   # Character types
                   'Governesses', 'Nobility', 'Aristocracy (Social class)',
                   'Favorites, Royal'],
        
        'Adventure': ['Adventure stories', 'Adventure stories, American', 'Adventure fiction',
                     'Adventure and adventurers', 'Adventure', 'Escapes',
                     'Voyages around the world', 'Travelers',
                     'Lewis and Clark Expedition', 'Beagle Expedition',
                     # Action elements
                     'Duelling', 'Cannibalism', 'Kidnapping', 'Abduction',
                     'Missing persons', 'Derelicts'],
        
        'Historical Fiction': ['Historical fiction', 'Historical fiction, American',
                              'Alternative histories (Fiction), American', 'Historical films',
                              'Historical romance', 'Renaissance', 'Medieval fiction',
                              # Historical periods
                              'Nineteen twenties', 'Nineteen thirties',
                              'Civilization, Medieval'],
        
        'War & Military': ['War stories', 'Military fiction', 'World War, 1914-1918',
                          'World War, 1939-1945', 'Civil War', 'Vietnam War, 1961-1975',
                          'Holocaust, Jewish (1939-1945)', 'Naval battles', 'Military history',
                          'Boer War, 1880-1881', 'Iwo Jima, Battle of, Japan, 1945',
                          'New Market, Battle of, New Market, Va., 1864',
                          'Appomattox Campaign, 1865', 'Pearl Harbor (Hawaii), Attack on, 1941',
                          'Military funerals', 'Soldiers', 'African-American soldiers',
                          'Fighter pilots', 'Air pilots', 'Cavalry pioneer troops',
                          'Holocaust survivors', 'Prisoners of war', 'Nuclear warfare',
                          'Generals', 'Admirals', 'Armed Forces'],
        
        'Western': ['Western stories', 'Frontier and pioneer life', 'Dakota Territory',
                   'Cowboys', 'Pioneers', 'Great Plains'],
        
        'Thriller & Suspense': ['Suspense fiction', 'Espionage', 'Conspiracies',
                               'Espionage, Soviet', 'Intelligence service', 'Intelligence officers',
                               'Assassination', 'Assassins', 'Extortion', 'Attempted murder',
                               'Deception'],
        
        'Literary Forms': ['Short stories', 'Short stories, American', 'Short stories, English',
                          'Short stories, Canadian', 'Novellas', 'Epistolary fiction',
                          'Diary fiction', 'Autobiographical fiction', 'Bildungsromans',
                          'Epic literature', 'Fables', 'Fairy tales', 'Fairy tales.',
                          'Tales', 'Legends', 'Penny dreadfuls', 'Diaries',
                          'Short Stories-Science Fiction'],
        
        'Humor': ['Humorous stories', 'Black humor', 'Comic fiction', 'Satire',
                 'Religious satire'],
        
        'Religious Fiction': ['Christian fiction', 'Christian ficiton', 'Missionary stories',
                             'Jewish fiction'],
        
        'Paranormal & Occult Fiction': ['Paranormal fiction', 'Occult fiction',
                                       'Channeling (Spiritualism)', 'Prophecies',
                                       'Reincarnation', 'Evocation']
    },
    
    'Children & Young Adult': {
        'Juvenile Fiction': ['Juvenile Fiction', 'JUVENILE FICTION', "Children's literature",
                            "Children's literature, English", "Children's stories",
                            "Children's stories, American.", "Children's stories, English",
                            "Children's stories, New Zealand", "Children's stories, Swiss (German)",
                            'Junior fiction', 'Christmas stories', 'Christmas stories, American',
                            'Christmas stories, English',
                            # Children's characters
                            'Clifford (Fictitious character : Bridwell)', 'Baby Bop (Fictitious character)',
                            'Baby Roo (Fictitious character)', 'Malone, Beany (Fictitious character)',
                            'Singenpoo (Fictitious character)', 'Alfie (Fictitious character : Hughes)',
                            'Hannay, Richard (Fictitious character)', 'Browne, Agnes (Fictitious character)',
                            'Cassidy, Hopalong (Fictitious character)'],
        
        'Young Adult': ['Young Adult Fiction', 'Young adult fiction', 'Young adult fiction, English',
                       'juvenile romance'],
        
        'Picture Books': ['Picture books for children', 'Board books', 'Stories in rhyme',
                         'Picture puzzles'],
        
        'Juvenile Nonfiction': ['Juvenile Nonfiction', 'JUVENILE NONFICTION', 'Young Adult Nonfiction',
                               "Children's encyclopedias and dictionaries",
                               # Educational content for children
                               'Alphabet.', 'Colors', 'Counting', 'Bedtime'],
        
        'Poetry for Children': ["Children's poetry", "Children's poetry, Australian",
                               "Children's poetry, English"]
    },
    
    'Poetry': {
        'American Poetry': ['American poetry', 'American poetry (Collections)', 'Poetry, American',
                           'Sonnets, American'],
        'English Poetry': ['English poetry', 'Poetry, English', 'Fantasy poetry, English'],
        'Other Poetry': ['Poetry', 'POETRY', 'Epic poetry, Greek', 'Bawdy poetry']
    },
    
    'Drama': {
        'Plays': ['Drama', 'American drama', 'English drama', 'Canadian drama',
                 'Australian drama', 'Hungarian drama', 'Swiss drama (German)'],
        'Theater': ['Theater', 'Acting', 'Promptbooks', 'Actors & Acting',
                   'Burlesque (Theater)', 'Performing Arts', 'Actors', 'Actresses',
                   'Entertainers', 'Comedians', 'Clowns', 'Acrobats',
                   'Juggling', 'Stripteasers', 'Television scripts',
                   'Situation comedies (Television programs)']
    },
    
    'Biography & Memoir': {
        'Biography': ['Biography', 'Biography & Autobiography', 'BIOGRAPHY & AUTOBIOGRAPHY',
                     'Autobiographies', 'Memoirs', 'Biographers', 'Last words',
                     'Bereavement', 'Death', 'Death, Apparent'],
        
        'By Profession': ['Authors', 'Authors, American', 'Authors, English', 'Authors, Australian',
                         'Authors, Irish', 'Authors, Japanese', 'Authors, Russian',
                         'Musicians', 'Actors', 'Actresses', 'Artists', 'Painters',
                         'Politicians', 'Presidents', 'Generals', 'Scientists',
                         'Dramatists', 'Dramatists, English', 'Novelists, English',
                         'Women authors, American', 'Women authors, Irish',
                         'African American dramatists', 'African American authors',
                         'Poets', 'Sculptors', 'Architects', 'Photographers',
                         'Singers', 'Folk musicians', 'Rock musicians', 'Composers',
                         'Conductors (Music)', 'Entertainers', 'Comedians',
                         'Women comedians', 'Women painters', 'Artists, Dutch',
                         'Baseball players', 'Hockey players', 'Atheletes', 'Wrestlers',
                         'Ice skaters.', 'Journalists', 'Reporters and reporting',
                         'Librarians', 'Physicians', 'Dentists', 'Veterinarians',
                         'Chiropractors', 'Gynecologists', 'Nurses',
                         'Fighter pilots', 'Air pilots', 'Admirals',
                         'Businessmen', 'Businesspeople', 'Businesswomen', 'Executives',
                         'Abolitionists', 'Bakers', 'Accountants', 'Jewelers',
                         'Dog breeders', 'Antique dealers', 'Artist couples',
                         'Blind musicians', 'Baseball coaches', 'Flight attendants',
                         'Construction workers', 'Crossword puzzle makers',
                         'Caterers and catering', 'Cabinet officers',
                         'Criminal defense lawyers', 'Attorneys general', 'Judges',
                         'Motion picture actors and actresses',
                         'Motion picture producers and directors'],
        
        'Religious Biography': ['Christian biography', 'Christian saints', 'Apostles', 'Nuns',
                               'Hasidim', 'Lepers New Brunswick Tracadie', 'Clergy',
                               'Children of clergy', 'Atheists', 'Missionaries']
    },
    
    'History': {
        'General History': ['History', 'HISTORY', 'History, Modern', 'Civilization',
                           'Civilization, Medieval', 'Civilization, Modern',
                           'Chronology, Historical', 'Antiquities.',
                           'Climate and civilization', 'Collective memory',
                           'Historical reenactments', 'Historic ships',
                           'Archaeology', 'Archaeological surveying',
                           'Charters', 'Session laws'],
        
        'Military History': ['World War, 1914-1918', 'World War, 1939-1945', 'Civil War',
                            'Vietnam War, 1961-1975', 'Holocaust, Jewish (1939-1945)',
                            'Naval battles', 'Military history', 'Boer War, 1880-1881',
                            'Holocaust survivors', 'Prisoners of war', 'Nuclear warfare',
                            'Military funerals', 'Iwo Jima, Battle of, Japan, 1945',
                            'New Market, Battle of, New Market, Va., 1864',
                            'Appomattox Campaign, 1865', 'Pearl Harbor (Hawaii), Attack on, 1941',
                            'Cold War', 'Berlin Wall, Berlin, Germany, 1961-1989'],
        
        'United States': ['United States', 'American history', 'Colonial America',
                        'Southern States', 'New England', 'Colonial Williamsburg (Williamsburg, Va.)',
                        'Alabama', 'Alaska', 'Arizona', 'California', 'Colorado', 'Connecticut',
                        'Idaho', 'Maine', 'Maryland', 'Montana', 'New York (N.Y.)', 'Pennsylvania',
                        'Texas', 'Vermont', 'Virginia', 'North Carolina', 'Southern States',
                        # US regions and cities
                        'Atlanta (Ga.)', 'Brooklyn (New York, N.Y.)', 'Chicago (Ill.)',
                        'Detroit (Mich.)', 'Fort Lauderdale (Fla.)', 'Houston Region (Tex.)',
                        'Key West (Fla.)', 'Hamptons (N.Y.)', 'Hollywood (Los Angeles, Calif.)',
                        'Saint Paul (Minn.)', 'Castro (San Francisco, Calif.)',
                        'Chinatown (New York, N.Y.)', 'Back Bay (Boston, Mass.)',
                        'Cape Cod (Mass.)', 'Sun Valley (Idaho)', 'Alta (Utah)',
                        'Hole-in-the-Rock (Utah)', 'Trumbull County (Ohio)',
                        'Pacific Coast (Or.)', 'Klondike River Valley (Yukon)',
                        'Great Plains', 'Dakota Territory', 'Knickajack Dam',
                        'Universal Studios Florida (Orlando, Fla. : Amusement park)'],
        
        'Europe': ['Europe', 'Great Britain', 'England', 'France', 'Germany', 'Italy', 'Spain',
                  'Ireland', 'Scotland', 'Wales', 'Belgium', 'Netherlands', 'Switzerland',
                  'Austria', 'Sweden', 'Norway', 'Denmark', 'Finland', 'Iceland',
                  'Poland', 'Russia', 'Czech Republic', 'Slovakia', 'Croatia',
                  'Hungary', 'Romania', 'Bulgaria', 'Slovenia', 'Albania', 'Moldova',
                  'Greece', 'Portugal', 'Czechoslovakia',
                  # European cities and regions
                  'London (England)', 'Paris (France)', '4e Arrondissement (Paris, France)',
                  'Berlin (Germany)', 'Cologne (Germany)', 'Athens (Greece)',
                  'Florence (Italy)', 'Sicily (Italy)', 'Rome (Italy)',
                  'Belfast (Northern Ireland)', 'Belgrade (Serbia)',
                  'Cannes (France)', 'Aberystwyth (Wales)', 'Caerphilly (Wales)',
                  'Cornwall (England : County)', 'Cotswold Hills (England)',
                  'Hebrides (Scotland)', 'Aran Islands (Ireland)',
                  'Alps, Swiss (Switzerland)', 'Canadian Rockies (B.C. and Alta.)',
                  # Historical entities
                  'Anglo-Saxons', 'Celts', 'Belgians', 'Czech Americans',
                  'German Americans', 'Italian Americans', 'Italian American families'],
        
        'Asia': ['Asia', 'China', 'Japan', 'India', 'Korea', 'South Korea', 'Taiwan',
                'Thailand', 'Malaysia', 'Indonesia', 'Philippines', 'Singapore',
                'Burma', 'Myanmar', 'Nepal', 'Sri Lanka', 'Pakistan', 'Iran',
                'Afghanistan', 'Iraq', 'Turkey', 'Mongolia', 'Laos', 'Hong Kong',
                # Asian cities
                'Cairo (Egypt)', 'New Delhi (India)', 'Colombo (Sri Lanka)',
                'Japanese Americans', 'Asian Americans'],
        
        'Africa': ['Africa', 'Africa, West', 'Africa, East', 'Africa, Southern',
                  'Africa, Central', 'Africa, North', 'South Africa', 'Egypt',
                  'Kenya', 'Zimbabwe', 'Zambia', 'Namibia', 'Ghana', 'Nigeria',
                  'Algeria', 'Morocco', 'Tanzania', 'Madagascar', 'Angola',
                  'Burkina Faso', 'Botswana',
                  'Africans', 'Chaga (African people)'],
        
        'Middle East': ['Middle East', 'Israel', 'Lebanon', 'Qatar', 'Bahrain',
                       'Kuwait', 'Saudi Arabia', 'United Arab Emirates',
                       'Arab-Israeli conflict', 'Gaza Strip', 'Arabian knights'],
        
        'Latin America': ['Latin America', 'Mexico', 'Brazil', 'Argentina', 'Chile',
                         'Peru', 'Ecuador', 'Colombia', 'Cuba', 'Haiti', 'Venezuela',
                         'Uruguay', 'Paraguay', 'Costa Rica', 'Guatemala', 'Honduras',
                         'Panama', 'Dominican Republic', 'Puerto Rico', 'Barbados',
                         'Trinidad and Tobago', 'Bahamas', 'Grenada', 'Belize',
                         'Jamaica', 'Cayman Islands', 'Bermuda',
                         'Baja California (Mexico : Peninsula)', 'Caribbean Area',
                         'Creoles'],
        
        'Oceania': ['Australia', 'New Zealand', 'Fiji', 'Papua New Guinea',
                   'Solomon Islands'],
        
        'Historical Movements': ['Renaissance', 'Reformation', 'Crusades',
                                'French Revolution', 'Industrial Revolution',
                                'Cold War', 'Berlin Wall, Berlin, Germany, 1961-1989',
                                'Antinuclear movement', 'Anti-clericalism',
                                'Feminism', 'Cargo cults']
    },
    
    'Science & Technology': {
        'Life Sciences': ['Biology', 'Botany', 'Zoology', 'Genetics', 'Evolution',
                         'Natural history', 'Ecology', 'Biodiversity conservation',
                         'Anatomy', 'Physiology', 'Genetic engineering',
                         'Animal behavior', 'Predation (Biology)',
                         'Bible and evolution'],
        
        'Physical Sciences': ['Physics', 'Chemistry', 'Astronomy', 'Geology', 'Meteorology',
                             'Earth sciences', 'Photochemistry', 'Analytical chemistry',
                             'Geophysics', 'Relativity (Physics)', 'Weather', 'Clouds',
                             'Earthquakes', 'Rocks', 'Astronomy, Ancient', 'Alchemy',
                             'Force and energy', 'Light', 'Photochemistry',
                             'Stream measurements'],
        
        'Computer Science': ['Computers', 'Computer science', 'Programming', 'Internet',
                            'Computer networks', 'Database management', 'HTML (Document markup language)',
                            'Java (Computer program language)', 'Linux', 'FreeBSD.',
                            'Microsoft Windows (Computer file)', 'Object-oriented programming (Computer science)',
                            'ArcView', 'Computer graphics', 'Human-computer interaction',
                            'Electronic commerce', 'World Wide Web (Information retrieval system)',
                            'Microcomputers', 'Minicomputers', 'Computer games', 'Video games'],
        
        'Engineering': ['Technology & Engineering', 'Engineering design', 'Aeronautics',
                       'Technology', 'Chemical process control', 'Metallurgy',
                       'Construction', 'Machine sewing', 'Highway planning',
                       'Inventions'],
        
        'Mathematics': ['Mathematics', 'Algebra', 'Calculus', 'Plane trigonometry',
                       'Logic', 'Geometry', 'Common fallacies']
    },
    
    'Social Sciences': {
        'Psychology': ['Psychology', 'Psychoanalysis', 'Cognitive psychology', 'Social psychology',
                      'Child psychology', 'Developmental psychology', 'Adolescent psychotherapy',
                      'Existential psychotherapy', 'Gestalt psychology', 'Cognitive neuroscience.',
                      'Identity (Psychology)', 'Intimacy (Psychology)', 'Competition (Psychology)',
                      'Control (Psychology)', 'Memory', 'Emotions', 'Fear', 'Guilt',
                      'Courage', 'Happiness', 'Adjustment disorders', 'Mental health',
                      'Mental illness', 'Autism', 'Cerebral dominance', 'Brain',
                      'Discernment of spirits', 'Dream interpretation', 'Dreams',
                      'Egoism', 'Forensic psychology', 'Mindfulness (Psychology)',
                      'Behavior', 'Alienation (Social psychology)', 'Codependency',
                      'Dependencia (Psicología)', 'Sex (Psychology)',
                      'Depression', 'Anxiety', 'Insanity, Religious',
                      'Dangerously mentally ill', 'Eating disorders',
                      'Senile dementia', "Alzheimer's disease",
                      'Compulsive eaters.', 'Hypnotism',
                      'Creative ability', 'Chance'],
        
        'Sociology': ['Sociology', 'Social Science', 'Social problems', 'Communities',
                     'Class structure', 'Classes sociales - États-Unis',
                     'Social case work', 'Conflict of generations',
                     'Interpersonal relations', 'Interpersonal relations.',
                     'Interpersonal communication', 'Interpersonal conflict',
                     'Communication in humanities', 'Conversation',
                     'Family', 'Families', 'Families in art', 'Family secrets',
                     'Marriage', 'Divorce', 'Married people', 'Divorced people',
                     'Adultery', 'Courtship', 'Dating (Social customs)',
                     'Friendship', 'Female friendship', 'Man-woman relationships',
                     'Fathers', 'Mothers', 'Mothers-Quotations, Maxims, etc',
                     'Motherhood', 'Fathers and sons', 'Fathers and daughters',
                     'Brothers', 'Sisters', 'Brothers and sisters', 'Cousins',
                     'Grandparents', 'Grandfathers', 'Aunts',
                     'African Americans', 'African American families',
                     'African American men', 'African American women',
                     'African American teenage girls', 'African American police',
                     'African American judges', 'African American entertainers',
                     'Women', 'Men', 'Gender', 'Sex role', 'Feminism', 'Feminists',
                     'Femmes - Europe - Conditions sociales',
                     'Housewives', 'Middle-aged women', 'Christian women',
                     'Abused wives', 'Abused women', 'Lesbian mothers',
                     'Gay male couples', 'Gay men', 'Gays', 'Lesbians',
                     'Homosexuality', 'Cross-dressers',
                     'Eccentrics and eccentricities', 'Bohemianism',
                     'Alternative lifestyles', 'Derelicts',
                     'Manners and customs', 'Etiquette', 'Hospitality',
                     'Age groups', 'Aging', 'Life cycle, Human',
                     'Menopause', 'Adolescence',
                     'Clubs', 'Congresses and conventions',
                     'Amusements', 'Contests',
                     'Bullies', 'Bullying',
                     'Curiosities and wonders', 'Superstition',
                     'Common interest ownership communities',
                     'Villages', 'City and town life', 'Country life',
                     'Plantation life', 'Plantations',
                     'Americans', 'Americanisms',
                     'National characteristics, American',
                     'Amish'],
        
        'Anthropology': ['Anthropology.', 'Ethnology', 'Folklore',
                        'Mythology', 'Mythology, African', 'Mythology, Greek.',
                        'Indian mythology', 'Hindu art', 'Aboriginal Australians',
                        'Maori (New Zealand people)', 'Native Americans',
                        'Indians of North America', 'Dakota Indians', 'Crow Indians',
                        'Cheyenne Indians', 'Chinook Indians', 'Arapaho Indians',
                        'Chaga (African people)', 'Cargo cults',
                        'Celts', 'Anglo-Saxons', 'Beowulf'],
        
        'Economics': ['Economics', 'Business & Economics', 'Finance', 'International finance',
                     'Capitalism', 'Socialism', 'Communism', 'Fascism',
                     'Economic forecasting', 'Economic confusion',
                     'Economía internacional', 'Income distribution',
                     'Financial futures', 'Agriculture', 'Agricultural credit',
                     'Agricultural price supports', 'Agricultural colonies',
                     'Crops and climate', 'Sugar growing', 'Timber',
                     'Diamond industry and trade', 'Cosmetics industry',
                     'Fashion', 'Brand name products',
                     'Consumer goods', 'Money-making projects for children',
                     'Finance, Personal', 'Investment advisors'],
        
        'Political Science': ['Political Science', 'POLITICAL SCIENCE', 'Politics', 'Democracy',
                             'Totalitarianism', 'Fascism', 'Communism', 'Conservatism',
                             'Radicalism', 'Libertarianism', 'Anarchism',
                             'Political participation', 'Political prisoners',
                             'Press and politics', 'Church and state',
                             'Diplomacy', 'Diplomatic and consular service',
                             'International organization', 'Territorial waters',
                             'Antinuclear movement', 'Anti-clericalism',
                             'Bureaucracy', 'Corruption',
                             'Technical assistance, American'],
        
        'Education': ['Education', 'Teaching', 'Learning', 'Study Aids',
                     'Test preparation', 'Study skills', 'Education, Preschool',
                     'Educational exchanges', 'Creative writing', 'Academic writing',
                     'Technical writing', 'Composition (Language arts)',
                     'Reading', 'Reading (Elementary).', 'College readers',
                     'Group reading', 'Grammar', 'Grammar, Comparative and general',
                     'Vocabulary', 'Spelling', 'Handwriting', 'Authorship',
                     'Schools', 'Boarding schools', 'High school students',
                     'College students', 'College dropouts', 'Medical students',
                     'Sinclair Symposium'],
        
        'Communication': ['Communication in marketing.', 'Speech',
                         'Public speaking', 'Rhetoric', 'Negotiation',
                         'Internet in public relations']
    },
    
    'Religion & Spirituality': {
        'Christianity': ['Christianity', 'Christian life', 'Christian ethics', 'Bible',
                        'Theology, Doctrinal', 'Church history', 'Prayer', 'Devotional literature',
                        'Devotional calendars', 'Devotional calendars.', 'Evangelistic work',
                        'Evangelistic sermons', 'Christian education',
                        'Christianity and literature', 'Christian converts from Judaism',
                        'Anglican converts', 'Baptists', 'Seventh-Day Adventists',
                        'Society of Friends', 'Catholics', 'Catholic Worker Movement',
                        'Church and social problems', 'Apologetics', "Apostles' Creed",
                        'Atonement', 'Immaculate Conception', 'Canon law',
                        'Bibles', 'Book of Mormon', 'Christian literature, Early',
                        'Hymns', 'Choirs (Music)', 'Christmas', 'Christmas crafts',
                        'Christmas decorations', 'Easter', 'Lent', 'Advent',
                        'Midsummer (Festival)', 'Labor Day',
                        'Church and state', 'Clergy', 'Children of clergy',
                        'Arminianism', 'Good and evil'],
        
        'Other Religions': ['Judaism', 'Islam', 'Buddhism', 'Hinduism', 'Confucianism',
                           'Jewish converts', 'Buddhist sermons, English',
                           'Hindu sects', 'Koan', 'Kunḍạlinī', 'Kuṇḍalinī',
                           'Zen Buddhism', 'Hasidim', 'Hindu art',
                           'Indian mythology', 'Gods, Chinese', 'Goddesses'],
        
        'Spirituality': ['Religion', 'RELIGION', 'Spiritual life', 'Meditation', 'Mysticism',
                        'New Age movement', 'New Thought', 'Spiritual healing',
                        'Spiritual retreats', 'Meditations', 'Affirmations',
                        'Channeling (Spiritualism)', 'Ascended masters',
                        'Reincarnation', 'Faith', 'Forgiveness', 'Discernment of spirits',
                        'Evocation', 'Hatha yoga', 'Yoga', 'Mindfulness (Psychology)',
                        'Contemplation', 'Religions', 'Mediums'],
        
        'Occult': ['Occultism', 'Magic', 'Astrology', 'Tarot', 'Prophecies', 'Occult fiction',
                  'Prophecies (Occultism)', 'Fortune-telling by runes', 'Numerology',
                  'Cabala', 'Demonology', 'Devil', 'Talismans', 'Witchcraft',
                  'Wicca', 'Paganism', 'Astrology, Chinese']
    },
    
    'Philosophy': {
        'General Philosophy': ['Philosophy', 'Ethics', 'Logic', 'Metaphysics',
                              'Epistemology', 'Knowledge, Theory of', 'Aesthetics',
                              'Conduct of life', 'Good and evil'],
        
        'Schools of Thought': ['Existentialism', 'Idealism', 'Rationalism',
                              'Irrationalism (Philosophy)']
    },
    
    'Self-Help & Personal Development': {
        'Personal Growth': ['Self-Help', 'Self-actualization (Psychology)', 'Self-realization',
                           'Success', 'Happiness', 'Conduct of life',
                           'Self-realization in women', 'Self-acceptance',
                           'Identity (Psychology)', 'Decision making', 'Problem solving',
                           'Time management', 'Honesty', 'Courage',
                           'Self-esteem', 'Motivation'],
        
        'Relationships': ['Interpersonal relations', 'Love', 'Marriage', 'Friendship',
                         'Dating (Social customs)', 'Divorce', 'Intimacy (Psychology)',
                         'Communication', 'Conflict resolution', 'Man-woman relationships',
                         'Female friendship'],
        
        'Health & Wellness': ['Health & Fitness', 'Exercise', 'Diet', 'Meditation', 'Yoga',
                             'Mental health', 'Alternative medicine', 'Fitness walking',
                             'Bodybuilding', 'Hygiene', 'Sleep', 'Relaxation',
                             'Hatha yoga']
    },
    
    'Family & Parenting': {
        'Parenting': ['Parenting', 'Child rearing', 'FAMILY & RELATIONSHIPS',
                     'Family & Relationships', 'Discipline of children',
                     'Child development', 'Infants', 'Infants (Newborn)',
                     'Babies', 'Adolescence', 'Teenagers',
                     'Children', 'Adult children', 'Adult children of alcoholics',
                     'Children of celebrities', 'Children of clergy',
                     'Adopted children', 'Abandoned children',
                     'Children with disabilities', 'Cerebral palsied children',
                     'Brain-damaged children', 'Dyslexic children',
                     'Child abuse', 'Abused children',
                     'Babysitters', 'Chores', 'Bedtime',
                     'Money-making projects for children'],
        
        'Pregnancy & Childbirth': ['Pregnancy', 'Childbirth', 'Infants',
                                   'Breast', 'Birthmothers', 'Birth control',
                                   'Breast feeding'],
        
        'Family Life': ['Families', 'Marriage', 'Divorce', 'Adoption',
                       'Intercountry adoption', 'Stepfamilies', 'Single parents',
                       'Lesbian mothers', 'Gay male couples',
                       'Administration of estates', 'Inheritance and succession',
                       'Family secrets', 'Families in art',
                       'Girls\' clothing', 'Birthdays', "Children's parties",
                       'Gifts']
    },
    
    'Health & Medicine': {
        'Medicine': ['Medicine', 'MEDICAL', 'Medical', 'Health',
                    'Public health', 'Health promotion', 'Nursing',
                    'Medical students', 'Gynecology'],
        
        'Specific Conditions': ['Cancer', 'Heart disease', 'Diabetes', 'Mental illness',
                               "Alzheimer's disease", 'AIDS (Disease)', 'Eating disorders',
                               'Senile dementia', 'Arthritis', 'Hay fever',
                               'Intractable pain', 'Cerebral palsy', 'Autism',
                               'Depression', 'Anxiety', 'Insanity, Religious',
                               'Dangerously mentally ill', 'Death row inmates',
                               'Compulsive eaters.', 'Adjustment disorders',
                               'Eye', 'Generative organs, Female', 'Breast',
                               'Blood substitutes', 'Diazepam',
                               'Abnormalities, Human', 'Blind', 'Blind-deaf',
                               'Deaf', 'Drug abuse', 'Drugs', 'Alcoholics',
                               'Menopause', 'Masturbation',
                               'Sex instruction for women', 'Jet lag'],
        
        'Medical Professions': ['Physicians', 'Nurses', 'Nursing', 'Medical students',
                               'Gynecologists', 'Dentists', 'Veterinarians',
                               'Chiropractors'],
        
        'Alternative Health': ['Alternative medicine', 'Aromatherapy', 'Reflexotherapy',
                              'Herbs', 'Herb gardening', 'Essences and essential oils',
                              'Folk medicine', 'Spiritual healing', 'Meditation',
                              'Yoga', 'Hatha yoga']
    },
    
    'Business & Career': {
        'Business': ['Business', 'Business & Economics', 'Management', 'Marketing',
                    'Entrepreneurship', 'Finance, Personal', 'Business ethics',
                    'Business failures', 'Businessmen', 'Businesspeople',
                    'Businesswomen', 'Executives', 'Sales', 'Selling',
                    'Customer services', 'Customer privacy',
                    'Communication in marketing.', 'Advertising',
                    'Advertising Social aspects', 'Advertising agencies',
                    'Industrial management', 'Industries', 'Management science',
                    'Organizational change', 'Employee morale', 'Employees',
                    'Corporations', 'Corporate image', 'Franchises (Retail trade)',
                    'Home-based businesses', 'International finance',
                    'Investment advisors', 'Brand name products',
                    'Diamond industry and trade', 'Cosmetics industry',
                    'Fashion', 'Consumer goods',
                    'Booksellers and bookselling', 'Publishing',
                    'Escort services'],
        
        'Career': ['Vocational guidance', 'Job hunting', 'Success in business',
                  'Occupations', 'Leadership', 'Success']
    },
    
    'Arts & Entertainment': {
        'Visual Arts': ['Art', 'Painting', 'Sculpture', 'Photography', 'Drawing', 'Design',
                       'Painting, American', 'Painting, Modern', 'Photography, Artistic',
                       'Photography, Erotic', 'Photography of wolves.',
                       'Human figure in art', 'Justice in art', 'Angels in art',
                       'Families in art', 'Cats in art', 'Animals in art',
                       'Illumination of books and manuscripts', 'Manuscripts, Medieval',
                       'Art thefts', 'Art, Celtic', 'Knotwork, Celtic', 'Hindu art',
                       'Arthurian romances', 'Body marking'],
        
        'Music': ['Music', 'Musicians', 'Opera', 'Songs', 'Choirs (Music)',
                 'Popular music', 'Rock musicians', 'Folk musicians',
                 'Singers', 'Conductors (Music)', 'Bands (Music)',
                 'Piano music', 'World music', 'Bones (Musical instrument)',
                 'Musical fiction', 'Composers', 'Hymns',
                 'Blind musicians'],
        
        'Film & Television': ['Motion pictures', 'Television', 'Cinematography',
                             'Motion picture actors and actresses',
                             'Motion picture industry',
                             'Motion picture producers and directors',
                             'Films and filming', 'Historical films',
                             'Film novelizations', 'Television programs.',
                             'Television scripts',
                             'Situation comedies (Television programs)',
                             'Star Trek films', 'Batman (Comic strip)',
                             'Bewitched (Television program)',
                             "Dawson's Creek (Television program)",
                             'Northern exposure (Television program)',
                             'Donahue (Television program)',
                             'Blair Witch Project (Motion picture)',
                             'In the bleak midwinter (Motion picture)',
                             'Beauty and the Beast (Television program)',
                             'Dogs in motion pictures'],
        
        'Performing Arts': ['Performing Arts', 'Theater', 'Dance', 'Ballet',
                           'Ballet dancing', 'Acting', 'Actors', 'Actresses',
                           'Entertainers', 'Comedians', 'Clowns', 'Acrobats',
                           'Juggling', 'Stripteasers']
    },
    
    'Home & Garden': {
        'Cooking': ['Cooking', 'COOKING', 'Cookery', 'Recipes', 'Baking',
                   'Cooking, American', 'Cooking, Italian', 'Cookery, Italian',
                   'Cooking, Chinese', 'Cooking, Mexican', 'Cooking, Greek',
                   'Cooking, Caribbean', 'Cookery, Asian.', 'Cookery (Stevia)',
                   'Cookery for diabetics', 'Cookery for one', 'Cooking for one',
                   'Quick and easy cooking', 'Passover cooking',
                   'Vegetarian cooking', 'Food', 'Food habits', 'Food supply',
                   'Food service', 'Food, Junk', 'Wine and wine making',
                   'Brewing', 'Low-carbohydrate diet', 'Reducing diets',
                   'Diet therapy', 'Dietary supplements',
                   'Carbohydrates, Refined',
                   'Cookery, American.', 'Chocolate', 'Basil.',
                   'Diners (Restaurants)', 'Bars (Drinking establishments)'],
        
        'Gardening': ['Gardening', 'Horticulture', 'Organic gardening', 'Herbs',
                     'Herb gardening', 'Flowers', 'Flower language',
                     'Edible landscaping', 'Landscape', 'Compost',
                     'Permaculture', 'Espaliers', 'Violets'],
        
        'Home Improvement': ['Interior decoration', 'Furniture', 'Home repair',
                            'House & Home', 'Home Economics', 'Built-in furniture',
                            'Blue in interior decoration',
                            'Americana in interior decoration',
                            'Buildings', 'Abandoned buildings',
                            'Dwellings', 'Architecture', 'Tree houses',
                            'Weather vanes', 'Household appliances.',
                            'Housing', 'Construction'],
        
        'Crafts': ['Handicraft', 'Crafts & Hobbies', 'Knitting', 'Sewing',
                  'Machine sewing', 'Patchwork quilts', 'Glass craft',
                  'Horn carving', 'Balloon decorations',
                  'Christmas crafts', 'Christmas decorations',
                  'Knotwork, Celtic']
    },
    
    'Sports & Recreation': {
        'Sports': ['Sports & Recreation', 'Baseball', 'Football', 'Basketball', 'Hockey',
                  'Baseball stories', 'Hockey stories', 'Basketball stories',
                  'College sports', 'Harness racing', 'Horsemanship', 'Horses',
                  'Archery', 'Wrestling', 'Wrestlers', 'Ice skating',
                  'Skis and skiing', 'Mountain flying', 'Mountaineering',
                  'Prosto plezanje - Alpinizem - Priročniki',
                  'Spotted seatrout fishing', 'Fly fishing',
                  'Baseball players', 'Hockey players', 'Atheletes',
                  'Ice skaters.', 'Baseball coaches',
                  'Baseball.'],
        
        'Outdoor Activities': ['Hiking', 'Camping', 'Fishing', 'Hunting', 'Mountaineering',
                              'Recreational vehicle living', 'Sailing',
                              'Coastwise navigation', 'Ocean travel',
                              'Boats and boating', 'Yachts', 'Barges',
                              'Porsche 911 automobile', 'Spotted seatrout fishing',
                              'Fly fishing', 'Cutthroat trout',
                              'Insect pests'],
        
        'Games': ['Games', 'Games & Activities', 'Chess', 'Card games',
                 'Fantasy games', 'Dungeons and Dragons (Game)',
                 'Video games', 'Computer games']
    },
    
    'Travel': {
        'Travel Guides': ['Travel', 'Guidebooks'],
        
        'Travel Narratives': ['Voyages around the world', 'Travelers', 'Ocean travel',
                             'Air travel', 'Airplane crash survival',
                             'Lewis and Clark Expedition', 'Beagle Expedition']
    },
    
    'Language & Reference': {
        'Language': ['Language Arts & Disciplines', 'English language', 'Spanish language',
                    'French language', 'Dutch language', 'Japanese',
                    'Grammar', 'Vocabulary', 'Grammar, Comparative and general',
                    'Language acquisition', 'Language and languages',
                    'Anglais (Langue) - Histoire', 'Egyptian language',
                    'American Sign Language', 'Foreign Language Study',
                    'Spanish language materials', 'Antonyms',
                    'Names, Geographical', 'Names, Personal',
                    'Dictionaries, Reverse', 'Spelling',
                    'English literature', 'American literature',
                    'Canadian literature', 'Literature'],
        
        'Reference': ['Reference', 'Encyclopedias and dictionaries', 'Dictionaries',
                     'Almanacs', 'Atlases', 'Atlases, British',
                     'Encyclopedias and dictionaries, Japanese',
                     'Best books', 'Handbooks, vade-mecums, etc',
                     'Handbooks, vade-mecums, etc.',
                     'Questions and answers.', 'Quotations',
                     'Union catalogs', 'Bibliography, National',
                     'Readers', 'College readers',
                     'Literary recreations.',
                     'Books and reading', 'Book of the dead']
    },
    
    'Comics & Graphic Novels': {
        'Graphic Novels': ['Graphic novels', 'Comics & Graphic Novels', 'COMICS & GRAPHIC NOVELS'],
        
        'Comic Strips': ['Comic books, strips, etc', 'Cartoons and comics',
                        'Caricatures and cartoons', 'Batman (Comic strip)',
                        'Astérix (Fictitious character)']
    },
    
    'Nature & Environment': {
        'Animals': ['Animals', 'Dogs', 'Cats', 'Birds', 'Wildlife',
                   'Horses', 'Rabbits', 'Squirrels', 'Chipmunks', 'Raccoon',
                   'Foxes', 'Bears', 'Wolves', 'Alligators', 'Frogs',
                   'Guinea pigs', 'Parrots', 'African gray parrot', 'Pigeons',
                   'Owls', 'Ants', 'Bees', 'Insects', 'Butterflies',
                   'Spiders', 'Snakes', 'Dinosaurs', 'Moles', 'Moles (Animals)',
                   'Iguana (Genus)', 'Elsa (Lion)', 'Beagle (Dog breed)',
                   'Captive wild animals', 'Animal behavior', 'Animal welfare',
                   'Animal rights activists', 'Animal sounds',
                   'Animals, Mythical', 'Dogs in motion pictures',
                   'Photography of wolves.', 'Cutthroat trout',
                   'Spotted seatrout fishing', 'Pets', 'Dog breeders'],
        
        'Environment': ['Ecology', 'Conservation', 'Climate change', 'Environmental issues',
                       'Conservation of natural resources',
                       'Biodiversity conservation', 'Recycling',
                       'Contaminated sediments', 'Climatic changes',
                       'Forest reserves', 'Endangered species',
                       'Climate and civilization'],
        
        'Natural History': ['Natural history', 'Geology', 'Weather', 'Meteorology',
                           'Earth sciences', 'Rocks', 'Earthquakes', 'Clouds',
                           'Stream measurements', 'Nature', 'Seashore',
                           'Arctic Regions', 'Arctic regions',
                           'Amazon River Valley', 'Andes', 'Alps, Swiss (Switzerland)',
                           'Hebrides (Scotland)', 'Galapagos Islands',
                           'Karakoram Range', 'Canadian Rockies (B.C. and Alta.)',
                           'North Atlantic Ocean', 'Blizzards', 'Hurricanes',
                           'Ice crossings']
    },
    
    'Law & Legal': {
        'Law': ['Law', 'Legal system', 'Constitutional law', 'Canon law',
               'Antitrust law', 'Actions and defenses', 'Patents',
               'Citation of legal authorities', 'Session laws',
               'Charters', 'Contempt of legislative bodies',
               'Examination of witnesses', 'Capacity and disability',
               'Breach of contract', 'Promissory notes',
               'Administration of estates', 'Inheritance and succession',
               'Eminent domain', 'Class actions (Civil procedure)',
               "Drivers' licenses"],
        
        'Criminal Justice': ['Criminal law', 'Prisons', 'Police',
                            'Penal colonies', 'Capital punishment',
                            'Death row inmates', 'Pardon', 'Corrections',
                            'Crimes without victims', 'Criminal defense lawyers',
                            'Attorneys general', 'Judges',
                            'African American police', 'African American judges']
    },
    
    'Education': {
        'General Education': ['Education', 'Teaching', 'Learning',
                             'Education, Preschool', 'Boarding schools',
                             'High school students', 'College students',
                             'College dropouts', 'Schools',
                             'Educational exchanges'],
        
        'Study Guides': ['Study Aids', 'Test preparation', 'Study skills']
    },
    
    'True Crime': {
        'True Crime': ['True Crime', 'Murder', 'Serial killers', 'Criminal investigation',
                      'Homicide', 'Assassination', 'Attempted murder',
                      'Crime and criminals', 'Criminals', 'Brigands and robbers',
                      'Mafia', 'Organized crime', 'Corruption',
                      'Crime', 'Commercial crimes']
    },
    
    'Literary Criticism': {
        'Literary Criticism': ['Literary Criticism', 'Criticism', 'Literary Collections',
                              'Characters and characteristics in literature',
                              'Dragons in literature',
                              'Literary recreations.']
    },
    
    'Humor & Satire': {
        'Humor': ['Humor', 'American wit and humor', 'English wit and humor',
                 'Wit and humor', 'Jokes', 'American wit and humor, Pictorial',
                 'American wit and humor.', 'Nigerian wit and humor (English)',
                 'Caricatures and cartoons', 'Cartoons and comics']
    },
    
    'Transportation': {
        'Aviation': ['Aeronautics', 'Air travel', 'Airplane crash survival',
                    'Air pilots', 'Fighter pilots', 'Flight attendants',
                    'Mountain flying', 'Helicopters', 'Spitfire (Fighter plane)'],
        
        'Automotive': ['Automobiles', 'Automobile mechanics',
                      'Porsche 911 automobile', 'Drivers\' licenses',
                      'Bicycles'],
        
        'Maritime': ['Ships', 'Sailing', 'Boats and boating', 'Yachts',
                    'Barges', 'Historic ships', 'Ocean travel',
                    'Coastwise navigation', 'Merchant marine'],
        
        'Railways': ['Railroads', 'Transportation']
    },
    
    'Other': {
        'Miscellaneous': ['Large print books', 'Large type books', 'Talking books',
                         'Audiobooks', 'Electronic books', 'Paperbacks',
                         'Electronic journals', 'Free material',
                         'Documents, Printing of', 'Copy editing',
                         'Imprints', 'Canada Imprints', 'Publishing',
                         'Books and reading', 'Booksellers and bookselling',
                         'Libraries', 'Accidents', 'Kissing',
                         'Yawning', 'Finger sucking', 'Clumsiness',
                         'Laundry', 'Baths', 'Chores',
                         'Gifts', 'Bedtime', 'Birthdays',
                         "Children's parties", 'Halloween',
                         'Bereavement', 'Death', 'Death, Apparent',
                         'Last words', 'Funerals', 'Cemeteries',
                         'Burial', 'Cremation',
                         'Beauty, Personal', 'Cosmetics industry',
                         'Bullets', 'Bondage (Sexual behavior)',
                         'Deception', 'Escort services'],
        
        'Americana': ['Americanisms', 'Americana in interior decoration',
                     'National characteristics, American'],
        
        'Uncategorized': ['Literatura contemporánea']
    }
}


# Regional identifiers to detect country-specific content
regional_identifiers = {
    'American': ['American', 'United States'],
    'English': ['English', 'England'],
    'British': ['British', 'Britain'],
    'Canadian': ['Canadian', 'Canada'],
    'Australian': ['Australian', 'Australia'],
    'Irish': ['Irish', 'Ireland'],
    'Scottish': ['Scottish', 'Scotland'],
    'New Zealand': ['New Zealand'],
    'French': ['French', 'France'],
    'German': ['German', 'Germany'],
    'Italian': ['Italian', 'Italy'],
    'Spanish': ['Spanish', 'Spain'],
    'Japanese': ['Japanese', 'Japan'],
    'Russian': ['Russian', 'Russia'],
    'Swiss': ['Swiss', 'Switzerland'],
    'Hungarian': ['Hungarian', 'Hungary']
}

def build_category_mapping(hierarchy):
    """
    Build mapping from raw categories to their root and subcategories.
    Returns: dict with structure {raw_category: {'root_categories': [str], 'subcategories': [str], 'regions': [str]}}
    """
    category_mapping = {}
    
    for root_cat, subcats in hierarchy.items():
        if isinstance(subcats, dict):
            for subcat_name, items in subcats.items():
                if isinstance(items, list):
                    for item in items:
                        if item not in category_mapping:
                            category_mapping[item] = {
                                'root_categories': [],
                                'subcategories': [],
                                'regions': []
                            }
                        if root_cat not in category_mapping[item]['root_categories']:
                            category_mapping[item]['root_categories'].append(root_cat)
                        if subcat_name not in category_mapping[item]['subcategories']:
                            category_mapping[item]['subcategories'].append(subcat_name)
    
    return category_mapping

def extract_regional_tags(category_name):
    """Extract regional identifiers from category name"""
    regions = []
    for region, identifiers in regional_identifiers.items():
        for identifier in identifiers:
            if identifier.lower() in category_name.lower():
                regions.append(region)
                break
    return list(set(regions))

# Build the mapping
category_to_hierarchy = build_category_mapping(category_hierarchy)

# Add regional tags to each category
for category, info in category_to_hierarchy.items():
    info['regions'] = extract_regional_tags(category)

def map_category_with_metadata(category_str):
    """
    Map a book's categories to root categories, subcategories, and regional tags.
    
    Returns:
    - root_categories: list of str (all unique root categories)
    - subcategories: list of str (all unique subcategories/primary categories)
    - regional_tags: list of str (all unique regional identifiers)
    """
    try:
        categories = ast.literal_eval(category_str)
        if not isinstance(categories, list) or len(categories) == 0:
            return {
                'root_categories': ['Other'],
                'subcategories': ['Uncategorized'],
                'regional_tags': []
            }
        
        # Collect all data from all matched categories
        all_root_cats = []
        all_subcats = []
        all_regions = []
        
        for cat in categories:
            if cat in category_to_hierarchy:
                info = category_to_hierarchy[cat]
                
                # Add root categories
                for root_cat in info['root_categories']:
                    if root_cat not in all_root_cats:
                        all_root_cats.append(root_cat)
                
                # Add subcategories
                for subcat in info['subcategories']:
                    if subcat not in all_subcats:
                        all_subcats.append(subcat)
                
                # Add regional tags
                for region in info['regions']:
                    if region not in all_regions:
                        all_regions.append(region)
        
        # If we found mappings, use them
        if all_root_cats:
            return {
                'root_categories': all_root_cats,         # All roots as list
                'subcategories':   all_subcats,           # All subcategories as list
                'regional_tags':   all_regions            # All regions as list
            }
    except:
        pass
    
    # Default fallback
    return {
        'root_categories': ['Other'],
        'subcategories': ['Uncategorized'],
        'regional_tags': []
    }

# Load books data
df_books = pd.read_csv("../data/interim/Books_clean.csv", sep=",")
df_books.columns = [c.strip().lower() for c in df_books.columns]
df_books = df_books.rename(columns={"book-title": "book_title"})

# Print full list of unique categories before mapping
unique_categories = []
for cat_str in df_books['categories']:
    try:
        cats = ast.literal_eval(cat_str)
        if isinstance(cats, list):
            unique_categories.extend(cats)
    except:
        continue
unique_categories = list(set(unique_categories))
print(f"\nTotal unique raw categories before mapping: {len(unique_categories)}")
print("=== Sample of Unique Raw Categories (first 50) ===")
for cat in sorted(unique_categories):
    print(cat)

# Apply enhanced mapping
category_metadata = df_books['categories'].apply(map_category_with_metadata)

# Create new columns with consistent data types
df_books['root_categories'] = category_metadata.apply(lambda x: x['root_categories'])
df_books['subcategories'] = category_metadata.apply(lambda x: x['subcategories'])
df_books['regional_tags'] = category_metadata.apply(lambda x: x['regional_tags'])

# Show distribution
print("\n=== Root Category Distribution ===")
root_counts = df_books['root_categories'].explode().value_counts()
print(f"\nTotal unique root categories: {df_books['root_categories'].explode().nunique()}")
print("\nAll root categories:")
print(root_counts)

# Show books belonging to multiple root categories
print("\n=== Books Belonging to Multiple Root Categories ===")
multi_root_books = df_books[df_books['root_categories'].apply(len) > 1]
print(f"Number of books with multiple root categories: {len(multi_root_books)}")
if len(multi_root_books) > 0:
    print("\nSample of books with multiple root categories:")
    sample_multi = multi_root_books[['book_title', 'root_categories', 'subcategories']].head(15)
    for idx, row in sample_multi.iterrows():
        print(f"\n{row['book_title']}")
        print(f"  Root categories: {row['root_categories']}")
        print(f"  Subcategories: {row['subcategories']}")

# Show subcategory distribution
print("\n=== Subcategory Distribution (Top 30) ===")
all_subcats = [subcat for subcats in df_books['subcategories'] for subcat in subcats]
subcat_counts = pd.Series(all_subcats).value_counts()
print(subcat_counts.head(30))

# Show regional distribution
print("\n=== Regional Tags Distribution ===")
all_regions = [region for regions in df_books['regional_tags'] for region in regions]
if all_regions:
    regional_counts = pd.Series(all_regions).value_counts()
    print(f"\nTotal books with regional tags: {len([r for r in df_books['regional_tags'] if r])}")
    print("\nAll regional tags:")
    print(regional_counts)

# Show sample with enhanced metadata
print("\n=== Sample Books with Enhanced Category Metadata ===")
print("\nData types of new columns:")
print("root_categories: list of strings")
print("subcategories: list of strings")
print("regional_tags: list of strings")

print("\nSample rows:")
display_cols = ['book_title', 'categories','root_categories', 'subcategories', 'regional_tags']
display(df_books[display_cols])

# Save the enhanced dataset
df_books.to_csv("../data/interim/Books_enchanced_categories.csv", index=False)
print("\n✓ Enhanced dataset saved to: ../data/interim/Books_enchanced_categories.csv")


Total unique raw categories before mapping: 1337
=== Sample of Unique Raw Categories (first 50) ===
1. Short Stories-Science Fiction
4e Arrondissement (Paris, France)
AIDS (Disease)
Abandoned buildings
Abandoned children
Abduction
Aberystwyth (Wales)
Abnormalities, Human
Abolitionists
Aboriginal Australians
Abused wives
Abused women
Academic writing
Accidents
Accountants
Acrobats
Acting
Actions and defenses
Actors
Actors & Acting
Actresses
Adjustment disorders
Administration of estates
Admirals
Adolescence
Adolescent psychotherapy
Adoption
Adult children
Adult children of alcoholics
Adultery
Adventure
Adventure and adventurers
Adventure fiction
Adventure stories
Adventure stories, American
Advertising
Advertising Social aspects
Advertising agencies
Aeronautics
Aesthetics
Affirmations
Afghanistan
Africa
Africa, Central
Africa, East
Africa, North
Africa, Southern
Africa, West
African American authors
African American dramatists
African American entertainers
African American families
Afr

Unnamed: 0,book_title,categories,root_categories,subcategories,regional_tags
0,Brave New World,['Fiction'],[Fiction],[General Fiction],[]
1,Monk's-hood,['Fiction'],[Fiction],[General Fiction],[]
2,Made in America,['Americanisms'],"[Social Sciences, Other]","[Sociology, Americana]",[American]
3,Farley: The Life of Farley Mowat,['Biography & Autobiography'],[Biography & Memoir],[Biography],[]
4,Mother Teresa: Her people and her work,['Nuns'],[Biography & Memoir],[Religious Biography],[]
...,...,...,...,...,...
24746,"Love, etc.",['Fiction'],[Fiction],[General Fiction],[]
24747,Fahrenheit 451,['Book burning'],[Other],[Uncategorized],[]
24748,Fraud,['Business & Economics'],"[Social Sciences, Business & Career]","[Economics, Business]",[]
24749,Falling Angels,['Fiction'],[Fiction],[General Fiction],[]



✓ Enhanced dataset saved to: ../data/interim/Books_enchanced_categories.csv


In [32]:
from ydata_profiling import ProfileReport

profile = ProfileReport(df_books, title="Profiling Report", explorative=True)

profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s][A

100%|██████████| 12/12 [00:02<00:00,  4.12it/s][A



Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [22]:
# year of publication inputation logic

# first let's display the books with year of publication = 0
# display full dataframe
display(df_books[df_books['year-of-publication'] == 0], df_books[df_books['year-of-publication'] == 0].shape)
# analyse the impact of removing the books with no year of publication from the ratings dataset
# get list of isbn id:
books_to_remove = list(df_books[df_books['year-of-publication'] == 0]['isbn'])
print(books_to_remove)
df_ratings = pd.read_csv("../data/interim/Ratings_clean.csv", sep=",")
num_rat_before = df_ratings.shape[0]
print(f"\nNumber of ratings before: {num_rat_before}")

df_ratings_cleaned = df_ratings[~df_ratings['ISBN'].isin(books_to_remove)]
num_rat_after = df_ratings_cleaned.shape[0]
print(f"\nNumber of ratings after removing Books with year of pub =0: {num_rat_after}")
print(f"\nNumber of ratings removed: {num_rat_before - num_rat_after} ({((num_rat_before - num_rat_after)/num_rat_before)*100:.2f}%)")


Unnamed: 0,isbn,book_title,book-author,year-of-publication,publisher,image-url-s,image-url-m,image-url-l,categories,root_category,all_root_categories,subcategories,regional_tags
7,0002210436,Agent In Place,Helen Macinnes,0,Collins,http://images.amazon.com/images/P/0002210436.0...,http://images.amazon.com/images/P/0002210436.0...,http://images.amazon.com/images/P/0002210436.0...,['Fiction'],Fiction,[Fiction],[General Fiction],[]
13,0002221160,Solo,Jack Higgns,0,Collins,http://images.amazon.com/images/P/0002221160.0...,http://images.amazon.com/images/P/0002221160.0...,http://images.amazon.com/images/P/0002221160.0...,['Fiction'],Fiction,[Fiction],[General Fiction],[]
16,0002227150,White Gold Wielder,Stephen R Donaldson,0,Collins,http://images.amazon.com/images/P/0002227150.0...,http://images.amazon.com/images/P/0002227150.0...,http://images.amazon.com/images/P/0002227150.0...,['Fiction'],Fiction,[Fiction],[General Fiction],[]
19,0002243962,Girlfriend In a Coma,Douglas Coupland,0,Harper Collins Publishers,http://images.amazon.com/images/P/0002243962.0...,http://images.amazon.com/images/P/0002243962.0...,http://images.amazon.com/images/P/0002243962.0...,['Fiction'],Fiction,[Fiction],[General Fiction],[]
22,0002254697,Mourning Doves,Helen Forrester,0,Harpercollins,http://images.amazon.com/images/P/0002254697.0...,http://images.amazon.com/images/P/0002254697.0...,http://images.amazon.com/images/P/0002254697.0...,['Technology & Engineering'],Science & Technology,[Science & Technology],[Engineering],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
24733,B00009APKU,Moby Dick,Herman Melville,0,"Outrigger Publishing, LLC",http://images.amazon.com/images/P/B00009APKU.0...,http://images.amazon.com/images/P/B00009APKU.0...,http://images.amazon.com/images/P/B00009APKU.0...,['Young Adult Fiction'],Children & Young Adult,[Children & Young Adult],[Young Adult],[]
24737,B0000A2U93,Carmilla,Joseph Sheridan Le Fanu,0,Soft Editions Ltd,http://images.amazon.com/images/P/B0000A2U93.0...,http://images.amazon.com/images/P/B0000A2U93.0...,http://images.amazon.com/images/P/B0000A2U93.0...,['Fiction'],Fiction,[Fiction],[General Fiction],[]
24743,B000199D26,The Wonderful Wizard of Oz,L. Frank Baum,0,NuVision Publications,http://images.amazon.com/images/P/B000199D26.0...,http://images.amazon.com/images/P/B000199D26.0...,http://images.amazon.com/images/P/B000199D26.0...,['Juvenile Fiction'],Children & Young Adult,[Children & Young Adult],[Juvenile Fiction],[]
24745,B0001I1JII,Kim,Rudyard Kipling,0,Modern Library,http://images.amazon.com/images/P/B0001I1JII.0...,http://images.amazon.com/images/P/B0001I1JII.0...,http://images.amazon.com/images/P/B0001I1JII.0...,['Fiction'],Fiction,[Fiction],[General Fiction],[]


(397, 13)

['0002210436', '0002221160', '0002227150', '0002243962', '0002254697', '0002257378', '0002711567', '0006161529', '0006170056', '0006172504', '0006323154', '0006386725', '0006475043', '0006479561', '0006482236', '0006546684', '000654679X', '0006547613', '0006548040', '0006548539', '0006550061', '0006550576', '0006551076', '0060805129', '0060830565', '0061094781', '0061317780', '0091736579', '0099479419', '0099744414', '009975391X', '0099760118', '0099769913', '0099778017', '0099825201', '0099914700', '0131767690', '0140007709', '0140008640', '0140019200', '014003773X', '0140037896', '0140048332', '0140051317', '014005829X', '0140062394', '0140062556', '0140089128', '0140096612', '0140102973', '0140109617', '0140118365', '0140132465', '0140147446', '0140147470', '0140153942', '0140181350', '0140235280', '0140258175', '0140264698', '0140269886', '0140277439', '014028351X', '0140288864', '0140296417', '0140385355', '0140620206', '0141182032', '014130751X', '0141309296', '0141311703', '0143