In [4]:
import pandas as pd
import numpy as np
import sqlite3
from contextlib import closing

# HOMEWORK 1.

# Creating a function that connects to the database and accesses tables dynamically
def load_db_tables(db_path):
    """Load every table of a SQLite database into a dictionary of DataFrames.

    Args:
        db_path: Path of the SQLite database file, passed unchanged to
            ``sqlite3.connect``.

    Returns:
        dict: Maps each table name listed in ``sqlite_master`` to a
        ``pandas.DataFrame`` holding all rows of that table.

    Raises:
        sqlite3.Error: Re-raised after the message is printed.
        Exception: Any other failure is likewise printed and re-raised.
    """
    tables_dict = {}

    try:
        with closing(sqlite3.connect(db_path)) as conn:
            # Step 1: list all tables recorded in the schema table.
            tables = pd.read_sql(
                "SELECT name FROM sqlite_master WHERE type='table';",
                conn
            )['name'].tolist()

            # Step 2: load each table into its own DataFrame.
            for table in tables:
                # Identifiers cannot be bound as SQL parameters, so quote the
                # name instead: wrap it in double quotes and double any
                # embedded double quote. Without this, names containing
                # spaces or matching reserved words break the query.
                quoted_name = '"' + table.replace('"', '""') + '"'
                query = f"SELECT * FROM {quoted_name};"
                tables_dict[table] = pd.read_sql(query, conn)

            return tables_dict

    except sqlite3.Error as e:
        print(f"SQLite error: {e}")
        raise
    except Exception as e:
        print(f"Unexpected error: {e}")
        raise

# Path of the SQLite database file
db_path = "chinook.db"

# Every table of the database as a DataFrame, keyed by table name
tables_dict = load_db_tables(db_path)

# Task: find what share of customers bought at least one complete album
# versus only individual tracks (based on invoices and invoice items).

# Invoice headers (one row per invoice; links InvoiceId to CustomerId)
invoices_df = tables_dict['invoices']

# Tracks (links TrackId to AlbumId)
tracks_df = tables_dict['tracks']

# Invoice line items (links InvoiceId to the purchased TrackId)
invoices_cont = tables_dict['invoice_items']


# Step 1: map each album to the full set of its tracks
# Resulting dictionary: {AlbumId: set_of_all_TrackIds}
album_track_ids = (
    tracks_df
    .groupby('AlbumId')['TrackId']
    .apply(set)
    .to_dict()
)

# Step 2: collect the set of TrackIds each customer bought from each album
customer_purchases = (
    invoices_cont
    .merge(invoices_df[['InvoiceId', 'CustomerId']], on='InvoiceId')
    .merge(tracks_df[['TrackId', 'AlbumId']], on='TrackId')
    .groupby(['CustomerId', 'AlbumId'])['TrackId']
    .apply(set)
    .reset_index(name='PurchasedTrackIds')
)

# Step 3: a purchase is a full album when the purchased set equals the
# album's complete track set. Note that an album consisting of a single
# track therefore counts as a full album as soon as that track is bought.
# Building the flags explicitly (instead of a row-wise DataFrame.apply)
# always yields a boolean Series, even when there are no purchases.
customer_purchases['IsFullAlbum'] = pd.Series(
    [
        purchased == album_track_ids[album_id]
        for album_id, purchased in zip(
            customer_purchases['AlbumId'],
            customer_purchases['PurchasedTrackIds'],
        )
    ],
    index=customer_purchases.index,
    dtype=bool,
)

# Customers who bought ANY full album
full_album_customers = (
    customer_purchases
    .loc[customer_purchases['IsFullAlbum'], 'CustomerId']
    .unique()
)

# Step 4: percentages relative to all customers that have an invoice
total_customers = invoices_df['CustomerId'].nunique()
if total_customers:
    pct_full_album = (len(full_album_customers) / total_customers * 100)
    pct_individual = 100 - pct_full_album
else:
    # No invoices at all: avoid dividing by zero and report 0% for both.
    pct_full_album = pct_individual = 0.0

print(f"Full Album Buyers: {pct_full_album:.2f}%")
print(f"Individual Track Buyers: {pct_individual:.2f}%")

# Bare expression so the notebook displays the per-customer/album table
customer_purchases

Full Album Buyers: 35.59%
Individual Track Buyers: 64.41%


Unnamed: 0,CustomerId,AlbumId,PurchasedTrackIds,IsFullAlbum
0,1,24,{262},False
1,1,25,"{280, 271}",False
2,1,26,"{289, 298}",False
3,1,27,{307},False
4,1,28,{316},False
...,...,...,...,...
1296,59,190,"{2328, 2322}",False
1297,59,191,"{2340, 2334}",False
1298,59,192,"{2352, 2346}",False
1299,59,193,"{2364, 2358}",False
