## Part 1: Imports and Configuration

In [1]:
import pandas as pd
import yaml
import matplotlib.pyplot as plt
import numpy as np
import re
import warnings
from datetime import datetime

# Suppress UserWarning noise for cleaner cell output.
# NOTE(review): this is a blanket filter for every UserWarning raised after
# this cell runs; narrow it with `module=` if a specific warning matters.
warnings.filterwarnings('ignore', category=UserWarning)

print("✓ Libraries imported successfully")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\hp\AppData\Local\Programs\Python\Python312\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\hp\AppData\Local\Programs\Python\Python312\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\hp\AppData\Local\Programs\Python\Python312\Lib\site-packages\ipykernel\kernelapp.py", line 739,

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\hp\AppData\Local\Programs\Python\Python312\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\hp\AppData\Local\Programs\Python\Python312\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\hp\AppData\Local\Programs\Python\Python312\Lib\site-packages\ipykernel\kernelapp.py", line 739,

AttributeError: _ARRAY_API not found

✓ Libraries imported successfully


## Part 2: Utility Functions and Classes

In [2]:
def clean_price(price_str, eur_to_usd=1.2):
    """
    Clean and standardize a price string to a USD float.

    Handles formats such as '$27.00', '€50¢50', 'USD 45.99' and 'EUR 71.00'.
    The '¢' sign is treated as the decimal separator, and the currency codes
    are matched case-insensitively, so 'eur 71.00' converts like 'EUR 71.00'.

    Parameters
    ----------
    price_str : str or scalar
        Raw price value. Non-string values are converted with ``str()``.
    eur_to_usd : float, default 1.2
        Conversion rate applied to prices recognised as euros (€1 = $1.2).

    Returns
    -------
    float or None
        Price in USD, or ``None`` when the value is missing or cannot be
        parsed as a number.
    """
    if pd.isna(price_str):
        return None

    text = str(price_str).strip()
    is_euro = ('€' in text) or ('EUR' in text.upper())

    # Strip the currency codes with the same case-insensitivity used for
    # detection; otherwise 'eur 10' is flagged as euro but fails to parse.
    text = re.sub(r'USD|EUR', '', text, flags=re.IGNORECASE)
    text = text.replace('$', '').replace('€', '')
    text = text.replace('¢', '.').strip()

    try:
        price = float(text)
    except ValueError:
        # Not a number once the currency markers are removed.
        return None

    if is_euro:
        price *= eur_to_usd

    return price

print("✓ clean_price function defined")

✓ clean_price function defined


In [3]:
def clean_timestamp(timestamp_str):
    """
    Convert a raw timestamp value into a pandas datetime.

    Strings containing a YYYY-MM-DD date are parsed month-first (ISO 8601);
    everything else is parsed day-first (e.g. DD.MM.YYYY). Missing input
    yields ``None``; unparseable input yields ``NaT`` (errors are coerced).
    """
    if pd.isna(timestamp_str):
        return None

    normalized = str(timestamp_str).strip()
    # Turn ';' and ',' separators into spaces and collapse the dotted
    # meridiem markers, in this fixed order.
    for old, new in ((';', ' '), (',', ' '), ('A.M.', 'AM'), ('P.M.', 'PM')):
        normalized = normalized.replace(old, new)

    # An ISO date anywhere in the string switches off day-first parsing.
    looks_iso = re.search(r'\d{4}-\d{2}-\d{2}', normalized) is not None
    return pd.to_datetime(normalized, errors='coerce', dayfirst=not looks_iso)

print("✓ clean_timestamp function defined")

✓ clean_timestamp function defined


In [4]:
class UnionFind:
    """
    Union-Find (Disjoint Set Union) data structure with path compression.
    Used for efficiently grouping duplicate users.

    Elements are the integers ``0 .. n-1``; ``parent[i]`` is the parent of
    element ``i`` and a root is its own parent.
    """
    def __init__(self, n):
        """Create ``n`` singleton sets."""
        self.parent = list(range(n))

    def find(self, x):
        """
        Return the root of the set containing ``x``, compressing the path.

        Implemented iteratively: repeated ``union(first, other)`` calls can
        build a parent chain as long as the group itself, which would exceed
        Python's recursion limit in a recursive implementation.
        """
        # First pass: walk up to the root.
        root = self.parent[x]
        while self.parent[root] != root:
            root = self.parent[root]

        # Second pass: point every node on the path directly at the root.
        while self.parent[x] != root:
            next_node = self.parent[x]
            self.parent[x] = root
            x = next_node

        return root

    def union(self, x, y):
        """Merge the sets containing ``x`` and ``y`` (x's root joins y's root)."""
        root_x = self.find(x)
        root_y = self.find(y)
        if root_x != root_y:
            self.parent[root_x] = root_y

    def count_groups(self):
        """Count number of disjoint sets."""
        return len({self.find(i) for i in range(len(self.parent))})

    def get_group_members(self, x):
        """Return, in ascending order, all elements in the same set as ``x``."""
        root = self.find(x)
        return [i for i in range(len(self.parent)) if self.find(i) == root]

print("✓ UnionFind class defined")

✓ UnionFind class defined


## Part 3: Main Analysis Function

In [5]:
def _author_set(author_str):
    """Normalise a comma-separated author string into a sorted tuple of names.

    Missing values map to the empty tuple so callers can filter them out.
    """
    if pd.isna(author_str):
        return ()
    return tuple(sorted(part.strip() for part in str(author_str).split(',')))


def _link_duplicate_users(users_df):
    """Group user rows that share an email, phone, name or address.

    Returns ``(users_clean, uf)`` where ``users_clean`` is a copy of
    ``users_df`` with a 0..n-1 index and ``uf`` is a UnionFind over those
    row positions.
    """
    # UnionFind addresses rows by position, while groupby().groups yields
    # index labels; resetting the index makes the two coincide.
    users_clean = users_df.reset_index(drop=True)

    # Give each missing address its own placeholder value.
    na_mask = users_clean['address'].isna()
    users_clean.loc[na_mask, 'address'] = [
        f'MISSING_ADDRESS_{i}' for i in range(int(na_mask.sum()))
    ]

    uf = UnionFind(len(users_clean))
    for field in ('email', 'phone', 'name', 'address'):
        for members in users_clean.groupby(field).groups.values():
            first = members[0]
            for other in members[1:]:
                uf.union(first, other)

    return users_clean, uf


def _save_daily_revenue_chart(daily_revenue, dataset_name):
    """Plot revenue per day, save it as a PNG and return the file name."""
    chart_filename = f'daily_revenue_{dataset_name}.png'
    fig, ax = plt.subplots(figsize=(14, 7))
    try:
        ax.plot(daily_revenue.index, daily_revenue.values,
                marker='o', linestyle='-', linewidth=2, markersize=4, color='#2E86AB')
        ax.fill_between(daily_revenue.index, daily_revenue.values, alpha=0.2, color='#2E86AB')

        ax.set_title(f'Daily Revenue Over Time - {dataset_name}', fontsize=16, fontweight='bold')
        ax.set_xlabel('Date', fontsize=12)
        ax.set_ylabel('Revenue ($)', fontsize=12)
        ax.grid(True, alpha=0.3, linestyle='--')
        ax.tick_params(axis='x', labelrotation=45)
        fig.tight_layout()

        fig.savefig(chart_filename, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even if saving fails.
        plt.close(fig)
    return chart_filename


def analyze_dataset(dataset_name):
    """
    Perform complete analysis on a dataset.

    Reads ``orders.parquet``, ``users.csv`` and ``books.yaml`` from the
    directory ``dataset_name``, prints a progress report and writes
    ``daily_revenue_<dataset_name>.png`` to the working directory.

    Returns a dictionary with all 6 task results:
        task1 : Series of the 5 highest daily revenues, indexed by date
        task2 : number of distinct users after deduplication
        task3 : number of distinct author sets among the books
        task4 : {'authors': tuple of names, 'books_sold': int}
        task5 : {'user_ids': list of ids, 'total_spending': float}
        task6 : file name of the saved chart
    """

    print(f"\n{'='*70}")
    print(f"📊 ANALYZING: {dataset_name}")
    print(f"{'='*70}\n")

    # ===== 1. LOAD DATA =====
    print("📂 Loading data...")
    orders_df = pd.read_parquet(f'{dataset_name}/orders.parquet', engine='fastparquet')
    users_df = pd.read_csv(f'{dataset_name}/users.csv')

    with open(f'{dataset_name}/books.yaml', 'r', encoding='utf-8') as file:
        books_data = yaml.safe_load(file)
    books_df = pd.DataFrame(books_data)
    # Strip ':' characters from the column names.
    books_df.columns = books_df.columns.str.replace(':', '', regex=False)

    print(f"   ✓ Loaded {len(orders_df):,} orders")
    print(f"   ✓ Loaded {len(users_df):,} users")
    print(f"   ✓ Loaded {len(books_df):,} books\n")

    # ===== 2. CLEAN ORDERS =====
    print("🧹 Cleaning orders data...")
    orders_df['clean_price'] = orders_df['unit_price'].apply(clean_price)
    orders_df['clean_timestamp'] = orders_df['timestamp'].apply(clean_timestamp)

    before_clean = len(orders_df)
    # .copy() so the column assignments below act on an independent frame.
    orders_df = orders_df.dropna(subset=['clean_price', 'clean_timestamp']).copy()
    after_clean = len(orders_df)

    orders_df['paid_price'] = orders_df['quantity'] * orders_df['clean_price']
    orders_df['date'] = orders_df['clean_timestamp'].dt.date
    orders_df = orders_df.drop_duplicates(subset=['id'])

    print(f"   ✓ Removed {before_clean - after_clean:,} invalid rows")
    print(f"   ✓ Final orders count: {len(orders_df):,}\n")

    # ===== 3. BUILD UNION-FIND FOR USERS =====
    print("🔗 Building user deduplication structure...")
    users_clean, uf = _link_duplicate_users(users_df)
    print(f"   ✓ User deduplication complete\n")

    # ===== EXECUTE 6 TASKS =====
    results = {}

    # --- TASK 1: Top 5 Revenue Days ---
    print("📅 Task 1: Top 5 Revenue Days")
    top5_days = orders_df.groupby('date')['paid_price'].sum().sort_values(ascending=False).head(5)
    results['task1'] = top5_days
    for i, (date, revenue) in enumerate(top5_days.items(), 1):
        print(f"   {i}. {date}: ${revenue:,.2f}")
    print()

    # --- TASK 2: Unique Users ---
    print("👥 Task 2: Real Unique Users")
    unique_users = uf.count_groups()
    results['task2'] = unique_users
    print(f"   Total records: {len(users_df):,}")
    print(f"   Real unique users: {unique_users:,}")
    print(f"   Duplicates found: {len(users_df) - unique_users:,}\n")

    # --- TASK 3: Author Sets ---
    print("✍️  Task 3: Unique Author Sets")
    author_sets = {
        _author_set(author_str)
        for author_str in books_df['author']
        if not pd.isna(author_str)
    }
    results['task3'] = len(author_sets)
    print(f"   Unique author sets: {len(author_sets):,}\n")

    # --- TASK 4: Most Popular Author ---
    print("⭐ Task 4: Most Popular Author")
    book_sales = orders_df.groupby('book_id')['quantity'].sum()
    books_with_sales = books_df.merge(book_sales, left_on='id', right_index=True, how='left')
    books_with_sales['quantity'] = books_with_sales['quantity'].fillna(0)
    books_with_sales['author_set'] = books_with_sales['author'].apply(_author_set)

    # Books with no recorded author must not compete for "most popular author".
    credited = books_with_sales[books_with_sales['author_set'].map(len) > 0]
    author_sales = credited.groupby('author_set')['quantity'].sum().sort_values(ascending=False)

    if author_sales.empty:
        most_popular, max_sales = (), 0
    else:
        most_popular = author_sales.idxmax()
        max_sales = author_sales.max()
    results['task4'] = {'authors': most_popular, 'books_sold': int(max_sales)}

    author_name = most_popular[0] if len(most_popular) == 1 else ' & '.join(most_popular)
    print(f"   Most popular: {author_name}")
    print(f"   Books sold: {int(max_sales):,}\n")

    # --- TASK 5: Top Customer ---
    print("💰 Task 5: Top Customer")
    user_spending = orders_df.groupby('user_id')['paid_price'].sum()

    # Map each user id to its first row position once, instead of scanning
    # the whole users table for every ordering user.
    first_row_by_id = {}
    for row_pos, uid in enumerate(users_clean['id']):
        first_row_by_id.setdefault(uid, row_pos)

    real_user_spending = {}
    for user_id, spending in user_spending.items():
        row_pos = first_row_by_id.get(user_id)
        if row_pos is None:
            # Order references a user id that is not in users.csv.
            continue
        root = uf.find(row_pos)
        real_user_spending[root] = real_user_spending.get(root, 0) + spending

    if real_user_spending:
        top_real_user = max(real_user_spending, key=real_user_spending.get)
        max_spending = real_user_spending[top_real_user]
        all_user_indices = uf.get_group_members(top_real_user)
        all_user_ids = users_clean['id'].iloc[all_user_indices].tolist()
    else:
        # No order could be matched to a known user.
        max_spending = 0.0
        all_user_ids = []

    results['task5'] = {'user_ids': all_user_ids, 'total_spending': max_spending}
    print(f"   Total spending: ${max_spending:,.2f}")
    print(f"   User IDs: {all_user_ids}")
    print(f"   Number of aliases: {len(all_user_ids)}\n")

    # --- TASK 6: Daily Revenue Chart ---
    print("📈 Task 6: Daily Revenue Chart")
    daily_revenue = orders_df.groupby('date')['paid_price'].sum().sort_index()
    chart_filename = _save_daily_revenue_chart(daily_revenue, dataset_name)

    results['task6'] = chart_filename
    print(f"   ✓ Chart saved: {chart_filename}\n")

    print(f"{'='*70}")
    print(f"✅ {dataset_name} ANALYSIS COMPLETE")
    print(f"{'='*70}\n")

    return results

print("✓ analyze_dataset function defined")

✓ analyze_dataset function defined


## Part 4: Execute Analysis on All Datasets

In [6]:
# Run analysis on all three datasets. A failure in one dataset is reported
# and recorded as None so the remaining datasets are still analyzed.
all_results = {}

for dataset in ['DATA1', 'DATA2', 'DATA3']:
    try:
        all_results[dataset] = analyze_dataset(dataset)
    except Exception as e:
        # Include the exception type: str(KeyError('x')) alone is just "'x'".
        print(f"\n❌ Error analyzing {dataset}: {type(e).__name__}: {e}\n")
        all_results[dataset] = None


📊 ANALYZING: DATA1

📂 Loading data...
   ✓ Loaded 11,237 orders
   ✓ Loaded 3,293 users
   ✓ Loaded 753 books

🧹 Cleaning orders data...
   ✓ Removed 0 invalid rows
   ✓ Final orders count: 11,237

🔗 Building user deduplication structure...
   ✓ User deduplication complete

📅 Task 1: Top 5 Revenue Days
   1. 2024-12-17: $57,011.46
   2. 2024-11-03: $46,258.65
   3. 2025-03-23: $39,120.97
   4. 2024-09-06: $32,795.31
   5. 2025-01-25: $31,732.46

👥 Task 2: Real Unique Users
   Total records: 3,293
   Real unique users: 3,066
   Duplicates found: 227

✍️  Task 3: Unique Author Sets
   Unique author sets: 325

⭐ Task 4: Most Popular Author
   Most popular: Arlinda Huel
   Books sold: 201

💰 Task 5: Top Customer
   Total spending: $37,609.70
   User IDs: [45800]
   Number of aliases: 1

📈 Task 6: Daily Revenue Chart
   ✓ Chart saved: daily_revenue_DATA1.png

✅ DATA1 ANALYSIS COMPLETE


📊 ANALYZING: DATA2

📂 Loading data...
   ✓ Loaded 

## Part 5: Results Summary

In [7]:
# Summarize the per-dataset results collected in `all_results`.
print("\n" + "="*70)
print("📊 FINAL RESULTS SUMMARY")
print("="*70 + "\n")

generated_charts = []   # chart files actually produced by successful runs
failed_datasets = []    # datasets whose analysis raised an error

for dataset in ['DATA1', 'DATA2', 'DATA3']:
    results = all_results.get(dataset)
    if results is None:
        print(f"{dataset}: Analysis failed\n")
        failed_datasets.append(dataset)
        continue

    print(f"📁 {dataset}:")
    print(f"   Task 1: Top day revenue: ${results['task1'].iloc[0]:,.2f}")
    print(f"   Task 2: Unique users: {results['task2']:,}")
    print(f"   Task 3: Author sets: {results['task3']:,}")

    authors = results['task4']['authors']
    author_name = authors[0] if len(authors) == 1 else ' & '.join(authors)
    print(f"   Task 4: Most popular: {author_name} ({results['task4']['books_sold']:,} books)")

    print(f"   Task 5: Top customer: {results['task5']['user_ids']} (${results['task5']['total_spending']:,.2f})")
    print(f"   Task 6: Chart: {results['task6']}")
    print()

    generated_charts.append(results['task6'])

print("="*70)
# Only claim success when every dataset actually produced results.
if failed_datasets:
    print(f"❌ ANALYSES FAILED FOR: {', '.join(failed_datasets)}")
else:
    print("✅ ALL ANALYSES COMPLETE")
print("="*70)
print("\n📁 Generated files:")
for chart_file in generated_charts:
    print(f"   - {chart_file}")
print("\n🎯 Next step: Create dashboard with these results")


📊 FINAL RESULTS SUMMARY

📁 DATA1:
   Task 1: Top day revenue: $57,011.46
   Task 2: Unique users: 3,066
   Task 3: Author sets: 325
   Task 4: Most popular: Arlinda Huel (201 books)
   Task 5: Top customer: [45800] ($37,609.70)
   Task 6: Chart: daily_revenue_DATA1.png

📁 DATA2:
   Task 1: Top day revenue: $42,137.01
   Task 2: Unique users: 2,633
   Task 3: Author sets: 293
   Task 4: Most popular: Hershel Treutel & Miss Modesto Denesik & Sen. Trula Bosco (163 books)
   Task 5: Top customer: [53256] ($37,051.25)
   Task 6: Chart: daily_revenue_DATA2.png

📁 DATA3:
   Task 1: Top day revenue: $63,761.34
   Task 2: Unique users: 3,232
   Task 3: Author sets: 268
   Task 4: Most popular: Coy Streich & Keeley Hand & Lela Emard (159 books)
   Task 5: Top customer: [49002, 49414] ($44,582.89)
   Task 6: Chart: daily_revenue_DATA3.png

✅ ALL ANALYSES COMPLETE

📁 Generated files:
   - daily_revenue_DATA1.png
   - daily_revenue_DATA2.png
   - daily_revenue_DATA3.png

🎯 Next

---

## Notes

- All three datasets analyzed independently
- Results stored in `all_results` dictionary
- Charts saved as PNG files
- Ready for dashboard creation

**Next Steps:**
1. Create dashboard (Streamlit recommended)
2. Deploy to Streamlit Cloud or similar
3. Submit code repository + dashboard link