# 01 — Data Exploration

Exploratory analysis of the Kaggle Amazon Products dataset after canonicalization (`data/raw/amazon_products_clean.csv`). This notebook validates required columns (`product_title`, `product_description`, `category`), summarises their distributions, and exports mandated plots plus summary statistics to `results/plots/` and `results/eda_summary.txt`.

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Input table and the directory that receives every exported figure.
RAW_PATH = Path('data/raw/amazon_products_clean.csv')
RESULTS_DIR = Path('results/plots')

# parents=True also creates the enclosing results/ directory; exist_ok makes
# re-running this cell a no-op.
RESULTS_DIR.mkdir(exist_ok=True, parents=True)

# One global seaborn theme so all figures in the notebook share a look.
sns.set(font_scale=0.9, style='whitegrid')

RAW_PATH  # echoed as the cell output

In [None]:
# Columns that the summary and plotting cells below index by name. Checking
# them right after the load makes a malformed export fail here, with a message
# naming what is missing, rather than as a bare KeyError in a later cell.
REQUIRED_COLUMNS = ('product_title', 'product_description', 'category')

df = pd.read_csv(RAW_PATH)

missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise ValueError(
        f"{RAW_PATH} is missing required column(s): {missing_columns}"
    )

print(f"Shape: {df.shape}")
df.head()

In [None]:
# Plain-text EDA report. Every section after the first starts with '\n', so
# joining on '\n' leaves one blank line between sections in the written file.
summary_lines = [
    'Head:\n' + df.head().to_string(),
    f"\nShape: {df.shape}",
    '\nDtypes:\n' + df.dtypes.to_string(),
    '\nMissing values:\n' + df.isna().sum().to_string(),
    f"\nUnique categories: {df['category'].nunique()}",
    '\nTop 20 categories:\n' + df['category'].value_counts().head(20).to_string(),
]
summary_text = '\n'.join(summary_lines)

summary_path = Path('results/eda_summary.txt')
summary_path.write_text(summary_text, encoding='utf-8')

# Preview only the first dozen lines of the report as the cell output.
summary_text.split('\n')[:12]

In [None]:
# Horizontal bar chart of the 20 most frequent categories, saved as a PNG.
top20 = df['category'].value_counts().head(20)
top20_path = RESULTS_DIR / 'top20_categories.png'

# Explicit figure/axes handles instead of the implicit pyplot "current figure".
top20_fig, top20_ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top20.values, y=top20.index, palette='viridis', ax=top20_ax)
top20_ax.set_title('Top 20 Categories by Frequency')
top20_ax.set_xlabel('Count')
top20_ax.set_ylabel('Category')
top20_fig.tight_layout()
top20_fig.savefig(top20_path, dpi=200)
plt.show()
top20_path

In [None]:
def length_stats(text_series):
    """Return per-row character and whitespace-token counts for a text column.

    Parameters
    ----------
    text_series : pandas.Series
        Column of text. Missing values are counted as empty strings; any other
        non-string value is measured by its ``str()`` representation.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        ``(char_len, token_len)``, both sharing the index of ``text_series``.
    """
    # Fill missing values *before* casting: casting first would turn NaN into
    # the three-character string 'nan'. The cast keeps the `.str` accessor
    # usable when the column was parsed as numeric or holds mixed types.
    text = text_series.fillna('').astype(str)
    char_len = text.str.len()
    # split() with no separator splits on runs of whitespace, so '' -> 0 tokens.
    token_len = text.str.split().str.len()
    return char_len, token_len

title_chars, title_tokens = length_stats(df['product_title'])
# desc_chars is not plotted in this cell; the name is still assigned as before.
desc_chars, desc_tokens = length_stats(df['product_description'])

# One entry per histogram panel: (lengths, bar colour, panel title, x-axis unit).
hist_panels = [
    (title_tokens, 'steelblue', 'Product Title Length (tokens)', 'Tokens'),
    (title_chars, 'coral', 'Product Title Length (characters)', 'Characters'),
    (desc_tokens, 'darkgreen', 'Product Description Length (tokens)', 'Tokens'),
]

fig, axes = plt.subplots(1, len(hist_panels), figsize=(18, 5))
for ax, (lengths, colour, panel_title, unit) in zip(axes, hist_panels):
    sns.histplot(lengths, bins=50, color=colour, ax=ax)
    ax.set_title(panel_title)
    # The x-axis is the length being binned and the bar height is the number of
    # rows in each bin, so 'Count' belongs on the y-axis, not the x-axis.
    ax.set_xlabel(unit)
    ax.set_ylabel('Count')
fig.tight_layout()
fig.savefig(RESULTS_DIR / 'title_description_length_hist.png', dpi=200)
plt.show()

# Frequency of every category, plotted against its rank on a log-scaled y-axis.
cat_counts = df['category'].value_counts()
imbalance_path = RESULTS_DIR / 'category_distribution_log.png'
category_rank = np.arange(len(cat_counts))

# Explicit figure/axes handles instead of the implicit pyplot "current figure".
imbalance_fig, imbalance_ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=category_rank, y=cat_counts.values, color='slateblue', ax=imbalance_ax)
imbalance_ax.set_yscale('log')
imbalance_ax.set_title('Category Distribution (log scale)')
imbalance_ax.set_xlabel('Category index (sorted by freq)')
imbalance_ax.set_ylabel('Count (log scale)')
imbalance_fig.tight_layout()
imbalance_fig.savefig(imbalance_path, dpi=200)
plt.show()
imbalance_path