# PNG Export for Visualizations

This notebook exports all 12 visualization charts to high-resolution PNG files (300 DPI).

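**Prerequisites:**
- Run `data_cleaning_analysis.ipynb` first to create `cleaned_amazon_walmart_data.csv` and `figures/outliers.csv`
- Run `visualization.ipynb` to display charts in Jupyter
- Run this notebook's cells in order from a fresh kernel (Restart Kernel → Run All): section 3 uses `df` and `out_dir` from section 1 and the `rating_category` column from section 2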

**Output:**
- 12 PNG files saved to `figures/` folder at 300 DPI
- Ready for presentations, reports, and analytics

## 1. Import Libraries and Load Data

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch

# Read the cleaned product table (created by data_cleaning_analysis.ipynb).
df = pd.read_csv('cleaned_amazon_walmart_data.csv')

# Reset matplotlib to its stock style, then apply seaborn's default theme on top.
plt.style.use('default')
sns.set_theme()

# Folder that receives the PNG files; creating it is a no-op if it already exists.
out_dir = 'figures'
os.makedirs(out_dir, exist_ok=True)

# Status report: row count of the loaded table and the export location.
print(
    f'✅ Loaded {df.shape[0]} products from cleaned_amazon_walmart_data.csv',
    f'✅ Output directory ready: {out_dir}/',
    sep='\n',
)

✅ Loaded 1921 products from cleaned_amazon_walmart_data.csv
✅ Output directory ready: figures/


## 2. Add Rating Category Column

In [None]:
def rating_category(rating):
    """Map a numeric product rating to a labelled quality bucket.

    Parameters
    ----------
    rating : float or None
        Product rating. Missing values (None / NaN) are accepted.

    Returns
    -------
    str
        'Excellent (4.5-5)' for rating >= 4.5, 'Very Good (4-4.5)' for >= 4,
        'Good (3-4)' for >= 3, 'Fair (2-3)' for >= 2, 'Poor (<2)' below 2,
        and 'Unrated' when the rating is missing.
    """
    # NaN fails every >= comparison, so without this guard a missing rating
    # would fall through to the last branch and be reported as 'Poor (<2)'
    # (and None would raise TypeError on the first comparison).
    if pd.isna(rating):
        return 'Unrated'
    if rating >= 4.5:
        return 'Excellent (4.5-5)'
    elif rating >= 4:
        return 'Very Good (4-4.5)'
    elif rating >= 3:
        return 'Good (3-4)'
    elif rating >= 2:
        return 'Fair (2-3)'
    else:
        return 'Poor (<2)'

# Only derive the column when it is absent, so re-running this cell
# (or loading a CSV that already carries it) leaves `df` untouched.
if 'rating_category' in df.columns:
    print('✅ rating_category column already exists')
else:
    df['rating_category'] = df['rating'].apply(rating_category)
    print('✅ Added rating_category column')

## 3. Export All 12 Charts to PNG (300 DPI)

In [1]:
# Export all charts as PNG files.
# This cell needs `df` and `out_dir` from section 1 and the `rating_category`
# column from section 2, so run the notebook top to bottom.
DPI = 300

# One colour per platform, reused by every chart that colours by platform so
# Amazon and Walmart look the same everywhere.
palette = {'Walmart':'#66B2FF', 'Amazon':'#FF9999'}

# File names written by this run; reset on every execution of the cell.
exported_files = []


def save_chart(fig, filename):
    """Write `fig` to `out_dir/filename` at `DPI`, close it and log the name.

    Parameters
    ----------
    fig : matplotlib.figure.Figure
        Figure to export.
    filename : str
        Name of the PNG file inside `out_dir` (not a full path).
    """
    fig.savefig(os.path.join(out_dir, filename), dpi=DPI, bbox_inches='tight')
    # Close the specific figure so the exported figures do not accumulate in memory.
    plt.close(fig)
    exported_files.append(filename)
    print(f'✓ {filename}')


print(f'Exporting all charts at {DPI} DPI to {out_dir}/ folder...\n')

# Chart 1: Rating distribution
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['rating'], bins=10, kde=True, ax=ax)
ax.set_title('Distribution of Product Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
save_chart(fig, '01_rating_distribution.png')

# Chart 2: Price comparison (boxplot)
fig, ax = plt.subplots(figsize=(7, 5))
sns.boxplot(x='platform', y='final_price', data=df, ax=ax)
ax.set_title('Price Comparison by Platform')
ax.set_xlabel('Platform')
ax.set_ylabel('Final Price')
save_chart(fig, '02_price_comparison_boxplot.png')

# Chart 3: Rating vs Final Price (scatter)
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(x='rating', y='final_price', data=df, hue='platform', ax=ax)
ax.set_title('Rating vs Final Price')
ax.set_xlabel('Rating')
ax.set_ylabel('Final Price')
# Re-creating the legend discards seaborn's title, so set it explicitly.
ax.legend(title='Platform')
save_chart(fig, '03_rating_vs_price_scatter.png')

# Chart 4: Correlation heatmap
fig, ax = plt.subplots(figsize=(6, 5))
corr = df[['final_price','initial_price','rating','discount']].corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
save_chart(fig, '04_correlation_heatmap.png')

# Chart 5: Average price by platform (bar)
fig, ax = plt.subplots(figsize=(6, 4))
avg_price = df.groupby('platform')['final_price'].mean()
avg_price.plot(kind='bar', ax=ax)
ax.set_title('Average Price by Platform')
ax.set_ylabel('Price')
save_chart(fig, '05_avg_price_by_platform.png')

# Chart 6: Discount distribution panels (Amazon vs Walmart)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
amazon_discount = df.loc[df['platform'] == 'Amazon', 'discount']
axes[0].hist(amazon_discount, bins=20, color=palette['Amazon'], edgecolor='black')
axes[0].set_title('Amazon - Discount Distribution')
axes[0].set_xlabel('Discount Amount ($)')
axes[0].set_ylabel('Count')

walmart_discount = df.loc[df['platform'] == 'Walmart', 'discount']
axes[1].hist(walmart_discount, bins=30, color=palette['Walmart'], edgecolor='black')
axes[1].set_title('Walmart - Discount Distribution')
axes[1].set_xlabel('Discount Amount ($)')
axes[1].set_ylabel('Count')

fig.tight_layout()
save_chart(fig, '06_discount_distribution_panels.png')

# Chart 7: Initial vs Final Price scatter
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(df['initial_price'], df['final_price'], alpha=0.6, s=50)
# Diagonal y = x over the initial-price range: points on it have final == initial.
price_range = [df['initial_price'].min(), df['initial_price'].max()]
ax.plot(price_range, price_range, 'r--', lw=2, label='No Discount Line')
ax.set_title('Initial Price vs Final Price')
ax.set_xlabel('Initial Price ($)')
ax.set_ylabel('Final Price ($)')
ax.legend()
ax.grid(True, alpha=0.3)
save_chart(fig, '07_initial_vs_final_scatter.png')

# Chart 8: Product distribution pie
product_counts = df['platform'].value_counts()
fig, ax = plt.subplots(figsize=(7, 5))
# value_counts() orders by frequency, so look colours up by platform name
# rather than by position; unknown platforms fall back to a neutral grey.
pie_colors = [palette.get(name, '#CCCCCC') for name in product_counts.index]
ax.pie(product_counts, labels=product_counts.index, autopct='%1.1f%%',
       colors=pie_colors, startangle=90)
ax.set_title('Product Distribution by Platform')
ax.axis('equal')
save_chart(fig, '08_product_distribution_pie.png')

# Chart 9: Rating category bar
fig, ax = plt.subplots(figsize=(8, 5))
category_counts = df['rating_category'].value_counts()
colors = sns.color_palette('viridis', len(category_counts))
positions = list(range(len(category_counts)))
ax.bar(positions, category_counts.values, color=colors)
ax.set_title('Product Distribution by Rating Category')
ax.set_xlabel('Rating Category')
ax.set_ylabel('Count')
ax.set_xticks(positions)
ax.set_xticklabels(category_counts.index, rotation=45, ha='right')
fig.tight_layout()
save_chart(fig, '09_rating_category_distribution.png')

# Chart 10: Rating by platform (violin)
fig, ax = plt.subplots(figsize=(8, 5))
sns.violinplot(x='platform', y='rating', data=df, ax=ax)
ax.set_title('Rating Distribution by Platform')
ax.set_xlabel('Platform')
ax.set_ylabel('Rating')
save_chart(fig, '10_rating_by_platform_violin.png')

# Chart 11: Average discount by platform (bar)
fig, ax = plt.subplots(figsize=(8, 5))
discount_by_platform = df.groupby('platform')['discount'].mean()
discount_by_platform.plot(kind='bar', color='coral', ax=ax)
ax.set_title('Average Discount by Platform')
ax.set_xlabel('Platform')
ax.set_ylabel('Average Discount ($)')
# Keep the platform names horizontal (pandas rotates bar labels by default).
ax.tick_params(axis='x', labelrotation=0)
fig.tight_layout()
save_chart(fig, '11_avg_discount_by_platform.png')

# Chart 12: Final price improved visualization (density + boxplot combined)
# Drop the top 1% of prices so the bulk of the distribution is readable.
p99 = df['final_price'].quantile(0.99)
data_clipped = df[df['final_price'] <= p99]
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

sns.histplot(data=data_clipped, x='final_price', hue='platform', bins=40,
             element='step', stat='density', common_norm=False, palette=palette,
             alpha=0.35, ax=axes[0])
sns.kdeplot(data=data_clipped, x='final_price', hue='platform', bw_adjust=1,
            common_norm=False, palette=palette, ax=axes[0], legend=False)

# Dashed vertical line at each platform's median, in that platform's colour.
for platform, color in palette.items():
    med = data_clipped.loc[data_clipped['platform'] == platform, 'final_price'].median()
    axes[0].axvline(med, color=color, linestyle='--', linewidth=2)

axes[0].set_title('Final Price Density by Platform (<= 99th percentile)')
axes[0].set_xlabel('Final Price ($)')
axes[0].set_ylabel('Density')
axes[0].set_xlim(0, p99)

sns.boxplot(x='final_price', y='platform', data=data_clipped, showfliers=False,
            orient='h', ax=axes[1])
axes[1].set_title('Final Price by Platform (boxplot, no extreme outliers)')
axes[1].set_xlabel('Final Price ($)')
axes[1].set_xscale('log')

fig.tight_layout()
save_chart(fig, '12_final_price_combined.png')

# Report the number of files actually written instead of a hard-coded count.
print(f'\n✅ All {len(exported_files)} charts exported to {out_dir}/ at {DPI} DPI')

NameError: name 'out_dir' is not defined

## Summary

Once every cell above has run without errors (top to bottom, from a fresh kernel), the following 12 charts are in the `figures/` folder:

1. `01_rating_distribution.png` - Product rating distribution
2. `02_price_comparison_boxplot.png` - Price comparison by platform
3. `03_rating_vs_price_scatter.png` - Relationship between rating and price
4. `04_correlation_heatmap.png` - Correlation matrix of key metrics
5. `05_avg_price_by_platform.png` - Average price by platform
6. `06_discount_distribution_panels.png` - Discount comparison (Amazon vs Walmart)
7. `07_initial_vs_final_scatter.png` - Price impact of discounts
8. `08_product_distribution_pie.png` - Product count by platform
9. `09_rating_category_distribution.png` - Rating category breakdown
10. `10_rating_by_platform_violin.png` - Rating distribution by platform
11. `11_avg_discount_by_platform.png` - Average discount by platform
12. `12_final_price_combined.png` - Final price distribution analysis

These PNG files are ready for presentations, reports, and analytics dashboards.