# Retail Price Optimization - Exploratory Data Analysis

This notebook performs comprehensive Exploratory Data Analysis (EDA) on the retail price optimization dataset. 
The goal is to understand data distributions, feature relationships, and correlations to inform the pricing strategy.

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Force the non-interactive Agg backend only when this code runs outside a
# Jupyter kernel (e.g. exported to a plain script and executed in CI). Inside a
# kernel the inline backend already renders without a display server, whereas
# switching to Agg there would stop plt.show() from displaying any figure.
if 'ipykernel' not in sys.modules:
    matplotlib.use('Agg')

# Set plot style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)


def _find_project_root(start_dir, marker=os.path.join('data', 'managament')):
    """Return the nearest ancestor of ``start_dir`` (inclusive) containing ``marker``.

    Args:
        start_dir: Directory at which the upward search begins.
        marker: Relative directory path whose presence identifies the project root.

    Returns:
        Absolute path of the first matching directory, or ``None`` when the
        filesystem root is reached without a match.
    """
    current = os.path.abspath(start_dir)
    while True:
        if os.path.isdir(os.path.join(current, marker)):
            return current
        parent = os.path.dirname(current)
        if parent == current:  # reached the filesystem root
            return None
        current = parent


# Make the project package importable regardless of how deep the notebook's
# working directory is. If no ancestor contains the package directory, fall back
# to the parent directory, which is what the notebook used previously.
PROJECT_ROOT = _find_project_root(os.getcwd()) or os.path.abspath(
    os.path.join(os.getcwd(), '..')
)
if PROJECT_ROOT not in sys.path:  # keep re-runs of this cell idempotent
    sys.path.append(PROJECT_ROOT)

# Import data retriever
try:
    from data.managament.retreiver import get_latest_data
except ImportError as e:
    print(f"Import Error: {e}")
    # Retrying with the same directory cannot succeed, so fail once with context.
    raise ImportError(
        f"Could not import data.managament.retreiver using project root {PROJECT_ROOT!r}. "
        "Run the notebook from inside the project tree and make sure the "
        "retriever's own dependencies are installed."
    ) from e
else:
    print("Successfully imported get_latest_data")

ModuleNotFoundError: No module named 'pandas'

## 1. Data Loading and Basic Inspection

In [2]:
# Fetch the dataset through the project's retriever helper
df = get_latest_data()

# Report (rows, columns), then preview the first five rows as the cell output
print(f"Dataset Shape: {df.shape}")
df.head(n=5)

NameError: name 'get_latest_data' is not defined

In [3]:
# Print pandas' concise frame summary (columns, dtypes, non-null counts)
df.info()

NameError: name 'df' is not defined

In [None]:
# Show pandas' descriptive statistics for the frame as the cell output
df.describe()

In [None]:
# Count null entries per column, then display only the columns that have any
missing = df.isna().sum()
missing.loc[missing > 0]

## 2. Univariate Analysis
Analyzing the distribution of key numerical variables.

In [None]:
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print(f"Numeric columns: {numeric_cols}")

# Plot distributions for key metrics; metrics absent from the frame are skipped
key_metrics = ['total_price', 'unit_price', 'freight_price', 'qty', 'product_score']
available_metrics = [m for m in key_metrics if m in df.columns]

for metric in available_metrics:
    # Explicit Figure/Axes handles so each figure can be released after display
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(df[metric], kde=True, bins=30, ax=ax)
    ax.set_title(f'Distribution of {metric}')
    plt.show()
    # Non-interactive backends do not close figures on show(); close explicitly
    # so figures created in this loop do not accumulate in memory.
    plt.close(fig)

In [None]:
# Boxplots to detect outliers (uses `available_metrics` built in the histogram cell)
for metric in available_metrics:
    # Explicit Figure/Axes handles so each figure can be released after display
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(x=df[metric], ax=ax)
    ax.set_title(f'Boxplot of {metric}')
    plt.show()
    # Non-interactive backends do not close figures on show(); close explicitly
    # so figures created in this loop do not accumulate in memory.
    plt.close(fig)

## 3. Categorical Analysis

In [None]:
# Collect the names of all object/category typed columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"Categorical columns: {categorical_cols}")

# Bar chart of the ten most frequent product categories, when the column exists
if 'product_category_name' in df.columns:
    top_cats = df['product_category_name'].value_counts().nlargest(10)

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(x=top_cats.values, y=top_cats.index, ax=ax)
    ax.set_title('Top 10 Product Categories by Count')
    ax.set_xlabel('Count')
    plt.show()

## 4. Bivariate Analysis

In [None]:
# Relationship between Unit Price and Quantity (Demand Curve proxy)
if 'unit_price' in df.columns and 'qty' in df.columns:
    plt.figure(figsize=(10, 6))
    # `data=df` is required: x/y are column names, and without a data source
    # seaborn cannot resolve the strings to columns and raises an error.
    sns.scatterplot(data=df, x='unit_price', y='qty', alpha=0.5)
    plt.title('Unit Price vs. Quantity')
    plt.show()

In [None]:
# Relationship between Product Score and Sales
if 'product_score' in df.columns and 'total_price' in df.columns:
    plt.figure(figsize=(10, 6))
    # `data=df` is required: x/y are column names, and without a data source
    # seaborn cannot resolve the strings to columns and raises an error.
    sns.scatterplot(data=df, x='product_score', y='total_price', alpha=0.5)
    plt.title('Product Score vs. Total Sales')
    plt.show()

## 5. Correlation Analysis

In [None]:
# Pairwise correlations (pandas' default method) across the numeric columns
numeric_df = df.select_dtypes(include=[np.number])
corr_matrix = numeric_df.corr()

# Annotated heatmap drawn on an explicit Axes with square cells
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    square=True,
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()

### Interpretation
- **Strong Positive Correlations**: Look for values close to 1. For example, `qty` and `total_price` are usually positively correlated.
- **Strong Negative Correlations**: Look for values close to -1.
- **Multicollinearity**: A high correlation between independent variables (e.g. `freight_price` and `weight`) may indicate redundancy for modeling.