In [40]:
import pandas as pd
import yaml
import matplotlib.pyplot as plt
import numpy as np
import re
import warnings
from datetime import datetime

In [41]:
def clean_price(price_str, eur_to_usd=1.2):
    """Parse a raw price string into a float, converting euro amounts.

    Currency markers (``$``, the euro sign, ``USD``, ``EUR``) are stripped,
    a cent sign is treated as the decimal separator, and the remainder is
    parsed with ``float``. Amounts flagged as euro are multiplied by
    ``eur_to_usd``.

    Args:
        price_str: Raw price value (any type; it is passed through ``str``).
        eur_to_usd: Multiplier applied to euro amounts. Defaults to 1.2, the
            rate this notebook has always used.

    Returns:
        The price as a float, or ``None`` when the input is missing or the
        cleaned text is not a valid number.
    """
    if pd.isna(price_str):
        return None

    text = str(price_str).strip()

    # Each symbol is accepted both correctly decoded and in the form it takes
    # when UTF-8 bytes are mis-read as cp1252, so the cleaner works whichever
    # way the input was decoded.
    euro_marks = ('\u20ac', '\u00e2\u201a\u00ac')
    # Mojibake form first: it ends with the real cent sign, so handling it
    # first also consumes its leading byte-artefact character.
    cent_marks = ('\u00c2\u00a2', '\u00a2')

    is_euro = any(mark in text for mark in euro_marks) or 'EUR' in text.upper()

    text = text.replace('$', '')
    for mark in euro_marks:
        text = text.replace(mark, '')
    # Case-insensitive on purpose: detection above is case-insensitive, so a
    # lowercase "eur" must be removed too or float() would reject the value.
    text = re.sub(r'usd|eur', '', text, flags=re.IGNORECASE)
    for mark in cent_marks:
        # NOTE(review): assumes the cent sign separates whole units from cents
        # (e.g. "12<cent>50" -> 12.50) - confirm against the data.
        text = text.replace(mark, '.')
    text = text.strip()

    try:
        price = float(text)
    except ValueError:
        return None

    if is_euro:
        price *= eur_to_usd

    return price


def clean_timestamp(timestamp_str):
    """Normalise a raw timestamp string and parse it with pandas.

    Missing input yields ``None``. Otherwise ``;`` and ``,`` become spaces and
    ``A.M.``/``P.M.`` lose their dots before parsing. Text containing a
    ``YYYY-MM-DD`` date is parsed month-first; anything else is parsed
    day-first. Unparseable text is coerced by pandas rather than raising.
    """
    if pd.isna(timestamp_str):
        return None

    text = str(timestamp_str).strip()

    # Applied in this exact order, one literal substitution at a time.
    substitutions = (
        (';', ' '),
        (',', ' '),
        ('A.M.', 'AM'),
        ('P.M.', 'PM'),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    # An ISO-style date anywhere in the text switches off day-first parsing.
    has_iso_date = re.search(r'\d{4}-\d{2}-\d{2}', text) is not None
    return pd.to_datetime(text, errors='coerce', dayfirst=not has_iso_date)

class UnionFind:
    """Disjoint-set (union-find) structure over the integers ``0 .. n-1``.

    Every element starts in its own group; ``union`` merges groups and
    ``find`` returns the representative (root) of an element's group.
    """

    def __init__(self, n):
        """Create ``n`` singleton groups, one per index."""
        # parent[i] == i marks i as the root of its group.
        self.parent = list(range(n))

    def find(self, x):
        """Return the root of ``x``'s group, compressing the path to it.

        Implemented iteratively so that long parent chains (for example after
        ``union(0, 1), union(1, 2), ...``) cannot exceed the interpreter's
        recursion limit.
        """
        # Pass 1: walk up to the root.
        root = x
        while self.parent[root] != root:
            root = self.parent[root]

        # Pass 2: point every node on the walked path straight at the root.
        node = x
        while node != root:
            next_node = self.parent[node]
            self.parent[node] = root
            node = next_node

        return root

    def union(self, x, y):
        """Merge the groups of ``x`` and ``y``; ``y``'s root becomes the root."""
        root_x = self.find(x)
        root_y = self.find(y)
        if root_x != root_y:
            self.parent[root_x] = root_y

    def count_groups(self):
        """Return the number of distinct groups."""
        return len({self.find(i) for i in range(len(self.parent))})

    def get_group_members(self, x):
        """Return, in ascending order, every index in the same group as ``x``."""
        root = self.find(x)
        return [i for i in range(len(self.parent)) if self.find(i) == root]


In [42]:
def _author_key(author_str):
    """Turn a comma-separated author string into a sorted tuple of names."""
    if pd.isna(author_str):
        return ()
    return tuple(sorted(name.strip() for name in str(author_str).split(',')))


def analyze_dataset(dataset_name):
    """Run the six bookstore analysis tasks for one dataset directory.

    Reads ``orders.parquet``, ``users.csv`` and ``books.yaml`` from the
    ``dataset_name`` directory, cleans the orders, groups user records that
    share an email, phone, name or address, prints a report and saves a
    daily-revenue chart as ``daily_revenue_{dataset_name}.png``.

    Columns read: orders ``id``, ``unit_price``, ``timestamp``, ``quantity``,
    ``book_id``, ``user_id``; users ``id``, ``email``, ``phone``, ``name``,
    ``address``; books ``id``, ``author`` (after ``:`` is stripped from the
    column names).

    Args:
        dataset_name: Path of the directory holding the three input files;
            also used in the printed headings and the chart file name.

    Returns:
        dict with keys:
            ``task1``: Series of the five highest-revenue dates.
            ``task2``: number of distinct user groups.
            ``task3``: number of distinct author sets.
            ``task4``: ``{'authors': tuple, 'books_sold': int}``.
            ``task5``: ``{'user_ids': list, 'total_spending': number}``
                (empty list and 0.0 when no order maps to a known user).
            ``task6``: file name of the saved chart.
    """
    print(f"ANALYZING: {dataset_name}")

    # ===== 1. LOAD DATA =====
    orders_df = pd.read_parquet(f'{dataset_name}/orders.parquet', engine='fastparquet')
    users_df = pd.read_csv(f'{dataset_name}/users.csv')

    with open(f'{dataset_name}/books.yaml', 'r', encoding='utf-8') as file:
        books_data = yaml.safe_load(file)
    books_df = pd.DataFrame(books_data)
    books_df.columns = books_df.columns.str.replace(':', '')

    print(f"   Loaded {len(orders_df):,} orders")
    print(f"   Loaded {len(users_df):,} users")
    print(f"   Loaded {len(books_df):,} books\n")

    # ===== 2. CLEAN ORDERS =====
    print("ðŸ§¹ Cleaning orders data...")
    orders_df['clean_price'] = orders_df['unit_price'].apply(clean_price)
    orders_df['clean_timestamp'] = orders_df['timestamp'].apply(clean_timestamp)

    before_clean = len(orders_df)
    # .copy() so the column assignments below write to an independent frame
    # rather than to a possible view of the unfiltered data.
    orders_df = orders_df.dropna(subset=['clean_price', 'clean_timestamp']).copy()
    after_clean = len(orders_df)

    orders_df['paid_price'] = orders_df['quantity'] * orders_df['clean_price']
    orders_df['date'] = orders_df['clean_timestamp'].dt.date
    orders_df = orders_df.drop_duplicates(subset=['id'])

    print(f"   Removed {before_clean - after_clean:,} invalid rows")
    print(f"   Final orders count: {len(orders_df):,}\n")

    # ===== 3. BUILD UNION-FIND FOR USERS =====
    print("Building user deduplication structure...")
    n_users = len(users_df)
    uf = UnionFind(n_users)
    # The union-find is indexed by row position, and groupby(...).groups
    # yields index labels, so force labels to be exactly 0..n-1.
    users_clean = users_df.reset_index(drop=True).copy()

    # Give every missing address a unique placeholder so two users without an
    # address are never treated as sharing one.
    na_mask = users_clean['address'].isna()
    na_count = na_mask.sum()
    users_clean.loc[na_mask, 'address'] = [f'MISSING_ADDRESS_{i}' for i in range(na_count)]

    # Any two records sharing a value in one of these fields join one group.
    for field in ['email', 'phone', 'name', 'address']:
        groups = users_clean.groupby(field).groups
        for indices in groups.values():
            if len(indices) > 1:
                first = indices[0]
                for idx in indices[1:]:
                    uf.union(first, idx)

    print(f"  User deduplication complete\n")

    # ===== EXECUTE 6 TASKS =====
    results = {}

    # Revenue per calendar day, sorted by date; shared by Task 1 and Task 6.
    daily_revenue = orders_df.groupby('date')['paid_price'].sum().sort_index()

    # --- TASK 1: Top 5 Revenue Days ---
    print("Task 1: Top 5 Revenue Days")
    top5_days = daily_revenue.sort_values(ascending=False).head(5)
    results['task1'] = top5_days
    for i, (date, revenue) in enumerate(top5_days.items(), 1):
        print(f"   {i}. {date}: ${revenue:,.2f}")
    print()

    # --- TASK 2: Unique Users ---
    print("Task 2: Real Unique Users")
    unique_users = uf.count_groups()
    results['task2'] = unique_users
    print(f"   Total records: {len(users_df):,}")
    print(f"   Real unique users: {unique_users:,}")
    print(f"   Duplicates found: {len(users_df) - unique_users:,}\n")

    # --- TASK 3: Author Sets ---
    print("Task 3: Unique Author Sets")
    # Books without an author are not counted as an author set.
    author_sets = {
        _author_key(author_str)
        for author_str in books_df['author']
        if not pd.isna(author_str)
    }
    results['task3'] = len(author_sets)
    print(f"   Unique author sets: {len(author_sets):,}\n")

    # --- TASK 4: Most Popular Author ---
    print("Task 4: Most Popular Author")
    book_sales = orders_df.groupby('book_id')['quantity'].sum()
    books_with_sales = books_df.merge(book_sales, left_on='id', right_index=True, how='left')
    # Books that never appear in an order contribute zero sales.
    books_with_sales['quantity'] = books_with_sales['quantity'].fillna(0)

    books_with_sales['author_set'] = books_with_sales['author'].apply(_author_key)
    author_sales = books_with_sales.groupby('author_set')['quantity'].sum().sort_values(ascending=False)

    most_popular = author_sales.idxmax()
    max_sales = author_sales.max()
    results['task4'] = {'authors': most_popular, 'books_sold': int(max_sales)}

    author_name = most_popular[0] if len(most_popular) == 1 else ' & '.join(most_popular)
    print(f"   Most popular: {author_name}")
    print(f"   Books sold: {int(max_sales):,}\n")

    # --- TASK 5: Top Customer ---
    print("Task 5: Top Customer")
    user_spending = orders_df.groupby('user_id')['paid_price'].sum()

    # One pass over users builds id -> row position (first occurrence wins),
    # replacing a full-table scan for every spending user.
    id_to_index = {}
    for row_idx, uid in zip(users_clean.index, users_clean['id']):
        id_to_index.setdefault(uid, row_idx)

    real_user_spending = {}
    for user_id, spending in user_spending.items():
        user_idx = id_to_index.get(user_id)
        if user_idx is None:
            # Order references a user id that is not in users.csv.
            continue
        root = uf.find(user_idx)
        real_user_spending[root] = real_user_spending.get(root, 0) + spending

    if real_user_spending:
        top_real_user = max(real_user_spending, key=real_user_spending.get)
        max_spending = real_user_spending[top_real_user]
        all_user_indices = uf.get_group_members(top_real_user)
        all_user_ids = users_clean.loc[all_user_indices, 'id'].tolist()
    else:
        # No order could be matched to a known user; report an empty result
        # instead of failing on max() of an empty mapping.
        max_spending = 0.0
        all_user_ids = []

    results['task5'] = {'user_ids': all_user_ids, 'total_spending': max_spending}
    print(f"   Total spending: ${max_spending:,.2f}")
    print(f"   User IDs: {all_user_ids}")
    print(f"   Number of aliases: {len(all_user_ids)}\n")

    # --- TASK 6: Daily Revenue Chart ---
    print("Task 6: Daily Revenue Chart")
    chart_filename = f'daily_revenue_{dataset_name}.png'

    fig = plt.figure(figsize=(14, 7))
    try:
        plt.plot(daily_revenue.index, daily_revenue.values,
                 marker='o', linestyle='-', linewidth=2, markersize=4, color='#2E86AB')
        plt.fill_between(daily_revenue.index, daily_revenue.values, alpha=0.2, color='#2E86AB')

        plt.title(f'Daily Revenue Over Time - {dataset_name}', fontsize=16, fontweight='bold')
        plt.xlabel('Date', fontsize=12)
        plt.ylabel('Revenue ($)', fontsize=12)
        plt.grid(True, alpha=0.3, linestyle='--')
        plt.xticks(rotation=45)
        plt.tight_layout()

        plt.savefig(chart_filename, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)

    results['task6'] = chart_filename
    print(f"   âœ“ Chart saved: {chart_filename}\n")

    print(f"{'='*70}")
    print(f"{dataset_name} ANALYSIS COMPLETE")
    print(f"{'='*70}\n")

    return results

# Cell-level confirmation, printed once this cell has finished executing.
print("analyze_dataset function defined")

analyze_dataset function defined
