In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import re

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's "whitegrid" style to plots drawn after this cell.
sns.set_style("whitegrid")

In [None]:
# Set the path to the sample data
# This path is relative to the `notebooks/` directory
DATA_PATH = "../backend/app/data_ingestion/sample_data.csv"

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
    # Fail fast with an actionable message. Merely printing here would leave
    # `df` undefined and surface later as an unrelated NameError.
    raise FileNotFoundError(
        f"File not found at {DATA_PATH}. "
        "Please ensure the sample_data.csv file exists in backend/app/data_ingestion/"
    ) from err

# Only reached when the CSV was read successfully.
print(f"Successfully loaded data from {DATA_PATH}")
print(f"Dataset shape: {df.shape}")

# Preview the first rows as the cell's rich output.
df.head()

In [None]:
# Print a concise summary of the loaded frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Replace every missing cell with an empty string before cleaning the columns
df = df.fillna(value="")

# 1. Clean 'price' column
def clean_price(price_str):
    """Convert a raw price value to a float.

    Args:
        price_str: A price as a string (optionally containing ``$`` and
            thousands separators ``,``) or as an already-numeric value.

    Returns:
        The price as a ``float``, or ``np.nan`` when the value is missing,
        of an unsupported type, or cannot be parsed as a number.
    """
    # bool is a subclass of int; a True/False cell is not a price.
    if isinstance(price_str, (bool, np.bool_)):
        return np.nan
    # Values pandas already parsed as numbers are passed through unchanged
    # instead of being discarded as "not a string".
    if isinstance(price_str, (int, float, np.integer, np.floating)):
        return float(price_str)
    if not isinstance(price_str, str):
        return np.nan
    try:
        # Drop currency symbol and thousands separators before parsing.
        return float(re.sub(r"[$,]", "", price_str))
    except (ValueError, TypeError):
        return np.nan

# Derive a numeric price column, one parsed value per row
df['price_clean'] = df['price'].map(clean_price)

# 2. Clean 'categories' and 'images' columns
def safe_literal_eval(val):
    """Parse a stringified Python list, returning ``[]`` on any failure.

    Args:
        val: The raw cell value; expected to be a string such as
            ``"['a', 'b']"``. Surrounding whitespace is ignored.

    Returns:
        The parsed ``list``, or an empty list when ``val`` is not a string,
        does not start with ``[``, cannot be parsed, or parses to something
        other than a list.
    """
    if not isinstance(val, str):
        return []
    text = val.strip()
    if not text.startswith('['):
        return []
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # literal_eval can raise any of these on malformed input; one bad
        # cell must not abort the whole column transformation.
        return []
    # Input such as "[1], 2" parses to a tuple; callers expect a list.
    return parsed if isinstance(parsed, list) else []

# Turn the stringified list columns into real Python lists
for raw_col in ('categories', 'images'):
    df[f'{raw_col}_clean'] = df[raw_col].map(safe_literal_eval)

print("Cleaned DataFrame:")
# Show only the title plus the derived columns
preview_cols = ['title', 'price_clean', 'categories_clean', 'images_clean']
df[preview_cols].head()

In [None]:
# Histogram (with KDE overlay) of all parseable prices
fig, ax = plt.subplots(figsize=(10, 6))
valid_prices = df['price_clean'].dropna()
sns.histplot(valid_prices, bins=30, kde=True, ax=ax)
ax.set_title('Price Distribution')
ax.set_xlabel('Price ($)')
ax.set_ylabel('Count')
plt.show()

print("This chart shows the distribution of product prices, helping to understand the price range of the dataset.")

In [None]:
# Flatten to one row per (product, category) pair and trim whitespace
exploded_categories = df['categories_clean'].explode()
all_categories = exploded_categories.str.strip()

# Frequency of each category, keeping the 15 most common
category_counts = all_categories.value_counts()
top_categories = category_counts.head(15)

# Horizontal bar chart: counts on x, category names on y
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=top_categories.values, y=top_categories.index, palette="viridis", ax=ax)
ax.set_title('Top 15 Most Common Product Categories')
ax.set_xlabel('Count')
ax.set_ylabel('Category')
plt.show()

print("This chart identifies the most common product categories, which is key for analytics and classification tasks.")

In [None]:
# Missing brands were replaced with "" earlier in the notebook; exclude blank
# values so an empty label is not ranked as if it were a real brand.
brand_names = df['brand']
has_brand = brand_names.astype(str).str.strip().ne("")
top_brands = brand_names[has_brand].value_counts().head(15)

# Horizontal bar chart: counts on x, brand names on y.
# NOTE(review): recent seaborn releases reportedly warn when `palette` is
# passed without `hue` — confirm against the pinned seaborn version.
plt.figure(figsize=(12, 8))
sns.barplot(x=top_brands.values, y=top_brands.index, palette="rocket")
plt.title('Top 15 Most Common Brands')
plt.xlabel('Count')
plt.ylabel('Brand')
plt.show()

print("This chart shows the top brands in the dataset, which feeds directly into the 'Top Brands' chart on the analytics page.")

In [None]:
# Flag rows with at least one parsed image, and rows whose description
# is longer than 10 characters after trimming whitespace
df['has_image'] = df['images_clean'].map(lambda imgs: len(imgs) > 0)
df['has_description'] = df['description'].map(lambda text: len(str(text).strip()) > 10)

# Share of rows (in percent) for which each flag is set
n_rows = len(df)
image_coverage = (df['has_image'].sum() / n_rows) * 100
description_coverage = (df['has_description'].sum() / n_rows) * 100

print(f"Image Coverage: {image_coverage:.2f}%")
print(f"Description Coverage: {description_coverage:.2f}%")

# Small two-row frame used only to feed the bar chart
coverage_data = pd.DataFrame({
    'Feature': ['Has Image', 'Has Description'],
    'Percentage': [image_coverage, description_coverage],
})

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x='Feature', y='Percentage', data=coverage_data, ax=ax)
ax.set_title('Data Coverage')
ax.set_ylabel('Percentage (%)')
# Fix the axis to the full percentage range so bars are comparable
ax.set_ylim(0, 100)
plt.show()

print("This plot checks data quality. High coverage is essential for both the frontend (images) and the NLP models (descriptions).")