In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found beneath the Kaggle input
# directory, then print one path per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/retail-store-inventory-and-demand-forecasting/sales_data.csv


## **Preliminary Data Cleaning Hypotheses**

### **1. Missing Data & Integrity Checks**
- **Null Values**:  
  - Columns like `Weather Condition`, `Competitor Pricing`, and `Discount` may contain missing data.  
  - **Action**: Impute using forward-fill or the median value within each `Category`/`Region` group.  
- **Inventory-Sales Mismatch**:  
  - If `Units Sold` > `Inventory Level`, data is invalid (sales cannot exceed stock).  
  - **Action**: Flag/remove such records.  

### **2. Temporal & Categorical Issues**
- **Date Gaps**:  
  - Check for missing dates (e.g., store closures during `Epidemic=1`).  
  - **Action**: Resample time series and forward-fill static features (e.g., `Store ID`).  
- **Categorical Consistency**:  
  - `Category` or `Region` may have typos (e.g., "Electronics" vs. "Eletronics").  
  - **Action**: Standardize labels using fuzzy matching.  

### **3. Outliers & Anomalies**
- **Demand Spikes**:  
  - Extreme `Units Sold` during `Promotion=1` or `Epidemic=1`.  
  - **Action**: Winsorize or cap values at the 99th percentile.  
- **Negative Inventory**:  
  - `Inventory Level` < 0 suggests data entry errors.  
  - **Action**: Set to zero or treat as missing.  

### **4. Feature Engineering Ideas**
- **Time Features**:  
  - Derive `Day_of_week`, `Is_holiday`, and `Lag_7_Demand` from `Date`.  
- **Competitor Impact**:  
  - Create `Price_ratio = Price / Competitor Pricing` to measure relative affordability.  
- **Stockout Risk**:  
  - `Stock_coverage = Inventory Level / (7-day avg Units Sold)`, i.e. roughly how many days the current stock would last at the recent sales rate.



In [2]:
# Imports and Configuration
from pathlib import Path
import sys

# CSV file that the import step reads.
DATA_PATH = Path('/kaggle/input/retail-store-inventory-and-demand-forecasting/sales_data.csv')
# Row count handed to pd.read_csv as `chunksize`.
CHUNKSIZE = 50_000
# Column parsed as dates and later used as the frame's index.
DATE_COL = 'Date'

# Compact per-column dtypes. Declared as ordered (column, dtype) pairs so the
# resulting dict keeps this exact key order.
DTYPES = dict([
    # identifiers and low-cardinality labels
    ('Store ID', 'category'),
    ('Product ID', 'category'),
    ('Category', 'category'),
    ('Region', 'category'),
    # unit counts
    ('Inventory Level', 'uint16'),
    ('Units Sold', 'uint16'),
    ('Units Ordered', 'uint16'),
    # pricing
    ('Price', 'float32'),
    ('Discount', 'float32'),
    # context flags and labels
    ('Weather Condition', 'category'),
    ('Promotion', 'bool'),
    ('Competitor Pricing', 'float32'),
    ('Seasonality', 'category'),
    ('Epidemic', 'bool'),
    # target
    ('Demand', 'uint16'),
])

In [3]:
# Data Import Function
def _clean_retail_chunk(chunk, date_col):
    """Apply the per-chunk cleaning rules to ``chunk`` in place and return it.

    Rules, in order:
      1. Re-parse ``date_col`` as datetimes; unparseable values become ``NaT``.
      2. Clip ``Inventory Level`` at a minimum of 0.
      3. Cap ``Units Sold`` at ``Inventory Level`` wherever it is larger.
      4. Downcast the three unit-count columns to the smallest unsigned
         integer dtype that can hold their values.

    Raises:
        KeyError: if any of the columns touched above is absent.
    """
    chunk[date_col] = pd.to_datetime(chunk[date_col], errors='coerce')

    # Cannot change values that were already read as an unsigned dtype, but
    # still guards the case where the column arrives with a signed dtype.
    chunk['Inventory Level'] = chunk['Inventory Level'].clip(lower=0)

    # Rows that report more sales than stock are capped at the stock level.
    chunk['Units Sold'] = np.where(
        chunk['Units Sold'] > chunk['Inventory Level'],
        chunk['Inventory Level'],
        chunk['Units Sold'],
    )

    # downcast='unsigned' only shrinks the dtype where the values allow it;
    # the final dtype is re-imposed by the caller after concatenation.
    for col in ('Inventory Level', 'Units Sold', 'Units Ordered'):
        chunk[col] = pd.to_numeric(chunk[col], downcast='unsigned')

    return chunk


def import_retail_data(data_path=None, chunksize=None, dtypes=None, date_col=None):
    """Read the retail sales CSV in chunks, clean it, and return a date-indexed frame.

    Every argument is optional; an argument left as ``None`` falls back to the
    corresponding module-level setting, so ``import_retail_data()`` behaves as
    it did before the parameters existed.

    Args:
        data_path: CSV location (str or Path). Defaults to ``DATA_PATH``.
        chunksize: Rows per chunk passed to ``pd.read_csv``. Defaults to
            ``CHUNKSIZE``.
        dtypes: Mapping of column name -> dtype. Used as the ``dtype`` argument
            of ``pd.read_csv``, as the set of required columns, and for the
            final dtype enforcement. Defaults to ``DTYPES``.
        date_col: Name of the date column; it is parsed as datetimes and
            becomes the index of the result. Defaults to ``DATE_COL``.

    Returns:
        tuple: ``(df, error_log)``.
            ``df`` is the combined frame indexed by ``date_col`` and sorted by
            it (rows sharing a date keep their file order), or ``None`` when
            the import failed as a whole.
            ``error_log`` is a list of human-readable messages: one per chunk
            that was skipped, one per column whose final dtype conversion
            failed, and one for a fatal failure.

    Notes:
        This function does not raise for read or processing errors; they are
        caught and reported through ``error_log``. A chunk that fails
        validation or cleaning is skipped, so the result may contain fewer
        rows than the file.
    """
    # Resolve the module-level defaults lazily so that explicit arguments
    # never require those globals to exist.
    data_path = DATA_PATH if data_path is None else data_path
    chunksize = CHUNKSIZE if chunksize is None else chunksize
    dtypes = DTYPES if dtypes is None else dtypes
    date_col = DATE_COL if date_col is None else date_col

    chunks = []
    error_log = []
    required_cols = set(dtypes)  # loop-invariant: build once, not per chunk

    try:
        with pd.read_csv(
            data_path,
            chunksize=chunksize,
            dtype=dtypes,
            parse_dates=[date_col],
            on_bad_lines='warn',
            encoding='utf-8'
        ) as reader:

            for chunk_idx, chunk in enumerate(reader):
                try:
                    # Validate required columns
                    missing_cols = required_cols - set(chunk.columns)
                    if missing_cols:
                        raise ValueError(f"Missing columns: {missing_cols}")

                    chunks.append(_clean_retail_chunk(chunk, date_col))

                except Exception as e:
                    # Best effort: record the failure and move on to the next chunk.
                    error_log.append(f"Chunk {chunk_idx} failed: {str(e)}")

        if not chunks:
            raise ValueError("No valid data chunks were processed")

        # Combine and finalize. A stable sort keeps rows that share a date in
        # their original file order (the default quicksort gives no such
        # guarantee).
        df = pd.concat(chunks)
        df = df.set_index(date_col).sort_index(kind='mergesort')

        # Final dtype enforcement: per-chunk dtypes can differ after
        # concatenation, so re-apply the requested dtype column by column and
        # log (rather than abort on) any column that cannot be converted.
        for col, dtype in dtypes.items():
            if col in df.columns:
                try:
                    df[col] = df[col].astype(dtype)
                except Exception as e:
                    error_log.append(f"Dtype conversion failed for {col}: {str(e)}")

        return df, error_log

    except Exception as e:
        error_log.append(f"Fatal import error: {str(e)}")
        return None, error_log

In [4]:
# Run the import and keep both the frame and the collected messages
df, import_errors = import_retail_data()

# Report the outcome: failure notice, or row count plus a preview of the frame
if df is None:
    print("Import failed")
else:
    print(f"Successfully imported {len(df):,} rows")
    print("Sample data:")
    display(df.head())

# Echo any messages gathered during the import as a numbered list
if import_errors:
    print("\nEncountered warnings/errors:")
    print("\n".join(f"{i}. {error}" for i, error in enumerate(import_errors, 1)))

Successfully imported 76,000 rows
Sample data:


Unnamed: 0_level_0,Store ID,Product ID,Category,Region,Inventory Level,Units Sold,Units Ordered,Price,Discount,Weather Condition,Promotion,Competitor Pricing,Seasonality,Epidemic,Demand
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2022-01-01,S001,P0001,Electronics,North,195,102,252,72.720001,5.0,Snowy,False,85.730003,Winter,False,115
2022-01-01,S001,P0002,Clothing,North,117,117,249,80.160004,15.0,Snowy,True,92.019997,Winter,False,229
2022-01-01,S001,P0003,Clothing,North,247,114,612,62.939999,10.0,Snowy,True,60.080002,Winter,False,157
2022-01-01,S001,P0004,Electronics,North,139,45,102,87.629997,10.0,Snowy,False,85.190002,Winter,False,52
2022-01-01,S001,P0005,Groceries,North,152,65,271,54.41,0.0,Snowy,False,51.630001,Winter,False,59
