In [5]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
os.chdir('/workspaces/b2b-customer-profitability-analysis')

# Load ONLY the raw datasets
customers = pd.read_csv('data/processed/01_customer_master.csv')
transactions = pd.read_csv('data/generated/02_transactions_generated.csv')
products = pd.read_csv('data/generated/03_products_generated.csv')

print("INITIAL EDA - RAW DATA ONLY ")
# PART 1: DATA QUALITY
print("\n1. DATA QUALITY CHECK")
print(f"Customers: {len(customers):,} rows, {customers.isnull().sum().sum()} nulls")
print(f"Transactions: {len(transactions):,} rows, {transactions.isnull().sum().sum()} nulls")
print(f"Products: {len(products):,} rows, {products.isnull().sum().sum()} nulls")

# PART 2: CUSTOMER OVERVIEW
print("\n2. CUSTOMER OVERVIEW")
print(f"By Channel: {customers['ChannelName'].value_counts().to_dict()}")
print(f"By Segment: {customers['CustomerSegment'].value_counts().to_dict()}")
print(f"By Region: {customers['RegionName'].value_counts().to_dict()}")

# PART 3: TRANSACTION PATTERNS
print("\n3. TRANSACTION PATTERNS")
print(f"Total Transactions: {len(transactions):,}")
print(f"Avg Orders per Customer: {len(transactions) / len(customers):.1f}")
print(f"Date Range: {transactions['TransactionDate'].min()} to {transactions['TransactionDate'].max()}")

# PART 4: DISTRIBUTION
print("\n4. TRANSACTION AMOUNT DISTRIBUTION")
print(f"Min: {transactions['TransactionAmount'].min():.2f}")
print(f"Max: {transactions['TransactionAmount'].max():.2f}")
print(f"Mean: {transactions['TransactionAmount'].mean():.2f}")
print(f"Median: {transactions['TransactionAmount'].median():.2f}")

# PART 5: PRODUCT MIX
print("\n5. PRODUCT CATEGORIES")
print(transactions['ProductCategory'].value_counts())

# PART 6: ORDER PATTERNS
print("\n6. ORDER PATTERNS")
print(f"Standard Orders: {(~transactions['IsUrgent']).sum():,} ({(~transactions['IsUrgent']).sum() / len(transactions) * 100:.1f}%)")
print(f"Urgent Orders: {transactions['IsUrgent'].sum():,} ({transactions['IsUrgent'].sum() / len(transactions) * 100:.1f}%)")

# PART 7: CHANNEL TRANSACTION VOLUME
transactions_with_channel = transactions.merge(
    customers[['CustomerID', 'ChannelName', 'CustomerSegment']], 
    on='CustomerID'
)

print("\n7. TRANSACTION VOLUME BY CHANNEL")
channel_trans = transactions_with_channel.groupby('ChannelName').agg({
    'TransactionAmount': ['count', 'sum', 'mean']
}).round(2)
channel_trans.columns = ['Order_Count', 'Total_Revenue', 'Avg_Order_Value']
print(channel_trans)

# PART 8: SEGMENT TRANSACTION VOLUME
print("\n8. TRANSACTION VOLUME BY SEGMENT")
segment_trans = transactions_with_channel.groupby('CustomerSegment').agg({
    'TransactionAmount': ['count', 'sum', 'mean']
}).round(2)
segment_trans.columns = ['Order_Count', 'Total_Revenue', 'Avg_Order_Value']
print(segment_trans)


INITIAL EDA - RAW DATA ONLY 

1. DATA QUALITY CHECK
Customers: 440 rows, 0 nulls
Transactions: 14,488 rows, 0 nulls
Products: 275 rows, 0 nulls

2. CUSTOMER OVERVIEW
By Channel: {'HORECA': 298, 'Retail': 142}
By Segment: {'Mid-Market': 228, 'SMB': 146, 'Enterprise': 66}
By Region: {'Other_Regions': 316, 'Lisbon': 77, 'Porto': 47}

3. TRANSACTION PATTERNS
Total Transactions: 14,488
Avg Orders per Customer: 32.9
Date Range: 2023-01-01 to 2023-12-28

4. TRANSACTION AMOUNT DISTRIBUTION
Min: 16.15
Max: 31214.25
Mean: 1009.08
Median: 534.89

5. PRODUCT CATEGORIES
ProductCategory
Fresh              6214
Grocery            2862
Milk               2197
Frozen             1786
DetergentsPaper     722
Delicatessen        707
Name: count, dtype: int64

6. ORDER PATTERNS
Standard Orders: 12,876 (88.9%)
Urgent Orders: 1,612 (11.1%)

7. TRANSACTION VOLUME BY CHANNEL
             Order_Count  Total_Revenue  Avg_Order_Value
ChannelName                                             
HORECA             124

In [6]:
git add python/04_analysis/eda_initial.ipynb
git commit -m "Initial EDA - Raw data only (Datasets 1-3, no costs)"
git push


SyntaxError: invalid decimal literal (1782816924.py, line 1)