In [None]:
# Dataset overview: row/column counts, in-memory size, and key cardinalities
record_count = len(df)
column_count = len(df.columns)
memory_mb = df.memory_usage(deep=True).sum() / 1024**2  # bytes -> MiB

print("Dataset Overview")
print("="*60)
print(f"Total Records: {record_count:,}")
print(f"Columns: {column_count}")
print(f"Memory Usage: {memory_mb:.2f} MB")

print("\nUnique Values:")
# Entity identifiers are printed with thousands separators ...
for label, column in [('Customers', 'customer_id'),
                      ('Products', 'product_id'),
                      ('Transactions', 'transaction_id')]:
    print(f"  - {label}: {df[column].nunique():,}")
# ... while the descriptive dimensions are printed as plain integers.
for label, column in [('Categories', 'category'),
                      ('Countries', 'country')]:
    print(f"  - {label}: {df[column].nunique()}")


In [None]:
# Statistical summary of the quantity and monetary columns
print("Statistical Summary")

summary_columns = [
    'quantity', 'unit_price', 'subtotal', 'discount_amount',
    'tax_amount', 'shipping_cost', 'total_amount', 'profit',
]
# Left as the cell's last expression so the notebook renders the table richly.
df[summary_columns].describe()


## 2. Data Quality Assessment


In [None]:
# Missing values analysis: null count per column and its share of all rows
print("Missing Values Analysis")
print("="*60)

missing = df.isnull().sum()
missing_pct = (missing / len(df) * 100).round(2)

# Report only the columns that actually contain nulls, worst offenders first.
missing_df = (
    pd.DataFrame({'Missing_Count': missing, 'Percentage': missing_pct})
      .loc[lambda report: report['Missing_Count'] > 0]
      .sort_values('Missing_Count', ascending=False)
)

if missing_df.empty:
    print("No missing values found in completed transactions view")
else:
    print(missing_df)


In [None]:
# Create analysis features
# Calendar parts are read through the .dt accessor, so 'transaction_date'
# must already hold datetime values when this cell runs.
df['year'] = df['transaction_date'].dt.year
df['month'] = df['transaction_date'].dt.month
df['day_of_week'] = df['transaction_date'].dt.dayofweek  # pandas: Monday=0 ... Sunday=6

# Profit margin as a percentage of the transaction total.
# Zero totals are masked to NaN before dividing: x / 0 would otherwise produce
# +/-inf, and a single infinite row makes every downstream mean of
# 'profit_margin' infinite. NaN rows are skipped by pandas' mean().
nonzero_total = df['total_amount'].where(df['total_amount'] != 0)
df['profit_margin'] = df['profit'] / nonzero_total * 100

print("Features created successfully")


## 3. Sales Performance Analysis


In [None]:
# Revenue trends over time: total revenue and row count per transaction_date value
# NOTE(review): grouping uses the raw 'transaction_date' values; if they carry a
# time-of-day component the groups are per-timestamp, not per-day — confirm.
daily_sales = (
    df.groupby('transaction_date')
      .agg(revenue=('total_amount', 'sum'),
           transactions=('transaction_id', 'count'))
      .reset_index()
      .rename(columns={'transaction_date': 'date'})
)

# Plot: two stacked line charts sharing the same styling
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))

trend_panels = [
    (ax1, 'revenue', '#2E86AB', 'Daily Revenue Trend', 'Revenue ($)'),
    (ax2, 'transactions', '#F18F01', 'Daily Transaction Count', 'Number of Transactions'),
]
for axis, metric, line_color, title, y_label in trend_panels:
    axis.plot(daily_sales['date'], daily_sales[metric], linewidth=1.5, color=line_color)
    axis.set_title(title, fontsize=14, fontweight='bold')
    axis.set_xlabel('Date')
    axis.set_ylabel(y_label)
    axis.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(PATHS['reports'] / 'plots' / 'notebook_daily_trends.png', dpi=300, bbox_inches='tight')
plt.show()

mean_daily_revenue = daily_sales['revenue'].mean()
mean_daily_transactions = daily_sales['transactions'].mean()
print(f"Average daily revenue: ${mean_daily_revenue:,.2f}")
print(f"Average daily transactions: {mean_daily_transactions:.0f}")

## 4. Customer Segment Analysis


In [None]:
# Customer segment performance: revenue, volume, customer count and profit per segment
segment_analysis = (
    df.groupby('customer_segment')
      .agg(
          total_revenue=('total_amount', 'sum'),
          avg_transaction=('total_amount', 'mean'),
          transactions=('total_amount', 'count'),
          unique_customers=('customer_id', 'nunique'),
          total_profit=('profit', 'sum'),
      )
      .reset_index()
      .rename(columns={'customer_segment': 'segment'})
)

# Derived ratios: spend per distinct customer and each segment's share of revenue
overall_revenue = segment_analysis['total_revenue'].sum()
segment_analysis['revenue_per_customer'] = (
    segment_analysis['total_revenue'] / segment_analysis['unique_customers']
)
segment_analysis['revenue_pct'] = segment_analysis['total_revenue'] / overall_revenue * 100

print("Customer Segment Analysis")
print("="*80)
print(segment_analysis.to_string(index=False))

# Visualization: one pie (revenue share) plus three bar charts on a 2x2 grid
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))

colors = ['#FF6B6B', '#4ECDC4', '#95E1D3']
ax1.pie(
    segment_analysis['total_revenue'],
    labels=segment_analysis['segment'],
    autopct='%1.1f%%',
    colors=colors,
    startangle=90,
)
ax1.set_title('Revenue Distribution by Customer Segment', fontsize=12, fontweight='bold')

# The three bar panels differ only in metric, title and y-axis label.
bar_panels = [
    (ax2, 'avg_transaction', 'Average Transaction Value by Segment', 'Avg Transaction ($)'),
    (ax3, 'revenue_per_customer', 'Revenue per Customer by Segment', 'Revenue per Customer ($)'),
    (ax4, 'unique_customers', 'Number of Customers by Segment', 'Customer Count'),
]
for axis, metric, title, y_label in bar_panels:
    axis.bar(segment_analysis['segment'], segment_analysis[metric], color=colors, alpha=0.8)
    axis.set_title(title, fontsize=12, fontweight='bold')
    axis.set_ylabel(y_label)
    axis.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig(PATHS['reports'] / 'plots' / 'notebook_segment_analysis.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Category performance analysis: revenue, volume, profit and margin per category,
# ordered from highest to lowest total revenue
category_performance = (
    df.groupby('category')
      .agg(
          total_revenue=('total_amount', 'sum'),
          avg_order_value=('total_amount', 'mean'),
          orders=('transaction_id', 'count'),
          units_sold=('quantity', 'sum'),
          total_profit=('profit', 'sum'),
          avg_margin=('profit_margin', 'mean'),
      )
      .reset_index()
      .sort_values('total_revenue', ascending=False)
)

# Two independent top-10 rankings of the same table
top_categories = category_performance.head(10)
top_margin = category_performance.nlargest(10, 'avg_margin')

print("Top 10 Product Categories by Revenue")
print("="*80)
print(top_categories.to_string(index=False))

# Visualization: side-by-side horizontal bars, best category at the top
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

ranking_panels = [
    (ax1, top_categories, 'total_revenue', '#2E86AB',
     'Top 10 Categories by Revenue', 'Total Revenue ($)'),
    (ax2, top_margin, 'avg_margin', '#F18F01',
     'Top 10 Categories by Profit Margin', 'Average Profit Margin (%)'),
]
for axis, ranking, metric, bar_color, title, x_label in ranking_panels:
    axis.barh(ranking['category'], ranking[metric], color=bar_color, alpha=0.8)
    axis.set_title(title, fontsize=12, fontweight='bold')
    axis.set_xlabel(x_label)
    axis.invert_yaxis()  # barh draws the first row at the bottom; flip so rank 1 is on top
    axis.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.savefig(PATHS['reports'] / 'plots' / 'notebook_category_analysis.png', dpi=300, bbox_inches='tight')
plt.show()


## 6. Geographic Distribution


In [None]:
# Geographic analysis: revenue, row count and distinct customers per country,
# ordered from highest to lowest revenue
geo_sales = df.groupby('country').agg({
    'total_amount': 'sum',
    'transaction_id': 'count',
    'customer_id': 'nunique'
}).reset_index()

geo_sales.columns = ['country', 'total_revenue', 'transactions', 'customers']
geo_sales = geo_sales.sort_values('total_revenue', ascending=False)
geo_sales['revenue_pct'] = geo_sales['total_revenue'] / geo_sales['total_revenue'].sum() * 100

print("Sales by Country")
print("="*70)
print(geo_sales.to_string(index=False))

# Visualization: one bar per country, annotated with its share of total revenue
fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.bar(geo_sales['country'], geo_sales['total_revenue'],
              color=sns.color_palette('viridis', len(geo_sales)), alpha=0.8)
ax.set_title('Revenue by Country', fontsize=14, fontweight='bold')
ax.set_xlabel('Country')
ax.set_ylabel('Total Revenue ($)')
ax.tick_params(axis='x', rotation=45)
ax.grid(True, alpha=0.3, axis='y')

# Bars are drawn in geo_sales row order, so zipping pairs each bar with its own
# percentage. No positional index is needed, so the loop does not use enumerate.
for bar, pct in zip(bars, geo_sales['revenue_pct']):
    ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height(),
            f'{pct:.1f}%', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.savefig(PATHS['reports'] / 'plots' / 'notebook_geographic.png', dpi=300, bbox_inches='tight')
plt.show()


## 7. Key Findings Summary


In [None]:
# Summary insights
# Relies on segment_analysis, category_performance and geo_sales built in the
# earlier analysis cells, and on the 'profit_margin' column added to df.
total_transactions = df['transaction_id'].nunique()
unique_customers = df['customer_id'].nunique()

print("="*80)
print("KEY INSIGHTS AND FINDINGS")
print("="*80)

print("\n1. OVERALL BUSINESS METRICS")
print(f"   Total Revenue: ${df['total_amount'].sum():,.2f}")
print(f"   Total Profit: ${df['profit'].sum():,.2f}")
print(f"   Avg Profit Margin: {df['profit_margin'].mean():.2f}%")
print(f"   Total Transactions: {total_transactions:,}")
print(f"   Unique Customers: {unique_customers:,}")
print(f"   Unique Products Sold: {df['product_id'].nunique():,}")

print("\n2. CUSTOMER INSIGHTS")
# Report what the data shows instead of a fixed claim: the segment with the
# largest revenue and, when a 'VIP' segment exists, its share of revenue.
top_segment = segment_analysis.sort_values('total_revenue', ascending=False).iloc[0]
has_vip = 'VIP' in segment_analysis['segment'].values
vip_revenue_pct = (
    segment_analysis.loc[segment_analysis['segment'] == 'VIP', 'revenue_pct'].iloc[0]
    if has_vip else 0
)
print(f"   Top segment by revenue: {top_segment['segment']} ({top_segment['revenue_pct']:.1f}% of revenue)")
if has_vip:
    print(f"   VIP segment share of revenue: {vip_revenue_pct:.1f}%")
print(f"   Average customer lifetime value: ${df.groupby('customer_id')['total_amount'].sum().mean():,.2f}")
# Count distinct transaction ids, matching "Total Transactions" above, rather
# than raw rows (the two differ whenever a transaction spans several rows).
print(f"   Average transactions per customer: {total_transactions / unique_customers:.1f}")

print("\n3. PRODUCT INSIGHTS")
top_category = category_performance.iloc[0]  # category_performance is sorted by revenue, descending
print(f"   Top category: {top_category['category']} (${top_category['total_revenue']:,.2f})")
# NOTE(review): the two means below are per row of df; they equal per-transaction
# values only if df holds one row per transaction — confirm.
print(f"   Average order value: ${df['total_amount'].mean():.2f}")
print(f"   Average items per transaction: {df['quantity'].mean():.2f}")

print("\n4. GEOGRAPHIC INSIGHTS")
top_country = geo_sales.iloc[0]  # geo_sales is sorted by revenue, descending
print(f"   Top country: {top_country['country']} ({top_country['revenue_pct']:.1f}% of revenue)")
print(f"   Markets served: {geo_sales['country'].nunique()} countries")

print("\n5. RECOMMENDATIONS")
print("   - Focus retention efforts on high-value customer segments")
print("   - Optimize inventory for top-performing categories")
print("   - Implement targeted marketing for inactive customers")
print("   - Geographic expansion in underserved markets")

print("\n" + "="*80)


In [None]:
# Save cleaned data
# Writes df (including the feature columns added earlier in the notebook) as CSV
# and reports the row count and on-disk size of the written file.
output_path = PATHS['data_processed'] / 'cleaned_sales_data.csv'
# Create the target directory if needed so a fresh checkout can Run All
# without a manual setup step; a no-op when it already exists.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
print(f"Cleaned data saved to: {output_path}")
print(f"Records: {len(df):,}")
print(f"Size: {output_path.stat().st_size / 1024**2:.2f} MB")


## Analysis Complete

**Next Steps:**
1. Proceed to `02_statistical_analysis.ipynb` for cohort analysis and RFM segmentation
2. Review `03_machine_learning_models.ipynb` for predictive modeling
3. Explore interactive dashboards at http://localhost:8501
