# **Data Exploration**

#### Add Project Root to Path
Adds the project root to Python's `sys.path` so that project modules (e.g., `src.loaders.config_loader`) can be imported when the notebook is run from a subfolder such as `notebooks/`.

In [1]:
# Enable IPython's autoreload extension in mode 2: modules are re-imported before
# each cell executes, so edits to the project's .py files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

# The notebook runs from a subfolder, so the project root is taken to be the
# parent of the current working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Prepend the root only when it is missing, so re-running this cell never
# stacks duplicate entries on sys.path.
if project_root not in sys.path:
    sys.path[:0] = [project_root]

# Bare expression: echo the resolved root as the cell output.
project_root

'e:\\My Projects\\Saudi Retail Data ETL Pipeline'

In [3]:
from src.loaders.config_loader import get_config
# Load the project configuration through the project's config loader.
config = get_config()
# Bare expression: render the loaded configuration as the cell output so the
# data paths and Spark settings can be checked at a glance.
config

{'data': {'raw_dir': '../data/raw/saudi_store_sales_dataset.csv',
  'processed_dir': '../data/processed'},
 'spark': {'master': 'local[*]',
  'app_name': 'Retail ETL Pipeline',
  'configs': {'spark.sql.shuffle.partitions': 200}},
 'general': {'log_level': 'INFO',
  'feature_flags': {'use_new_preprocessor': False}}}

#### Load Data

In [4]:
from src.loaders.load_data import DataLoader

# Read the Spark application name and the raw dataset location from the config.
# NOTE(review): despite its name, 'raw_dir' holds the path of a single CSV file.
app_name = config['spark']['app_name']
path = config['data']['raw_dir']

# Build a loader for the 'spark' backend and read the CSV, treating the first
# row as the header and letting the reader infer column types.
loader = DataLoader('spark')
df_spark = loader.load_data(
    path=path,
    file_type='csv',
    header=True,
    inferSchema=True,
)

Loading CSV file from: ../data/raw/saudi_store_sales_dataset.csv


In [5]:
from src.exploration.explore_data import DataExplorer

# Wrap the loaded DataFrame in the project's DataExplorer; the cells below call
# its exploration method with one flag at a time.
explorer = DataExplorer(df_spark)
# Show the explorer's is_spark flag for the wrapped frame.
print("Is Spark DataFrame?:", explorer.is_spark)
# NOTE(review): printed unconditionally - it confirms this cell ran to the end,
# not that the data itself was validated.
print("DataFrame loaded successfully.")

Is Spark DataFrame?: True
DataFrame loaded successfully.


In [6]:
# Print the DataFrame schema: each column's name, inferred type, and nullability.
explorer.explore_data_with_spark(schema=True)

root
 |-- Invoice Date: string (nullable = true)
 |-- Invoice ID: string (nullable = true)
 |-- Customer Type: string (nullable = true)
 |-- Customer Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Customer Gender: string (nullable = true)
 |-- Employee Name: string (nullable = true)
 |-- Manager Name: string (nullable = true)
 |-- Product Name: string (nullable = true)
 |-- Product Category: string (nullable = true)
 |-- Channel: string (nullable = true)
 |-- Customer Satisfaction: string (nullable = true)
 |-- Total Sales: integer (nullable = true)



In [7]:
# Print the total row count and the list of column names.
explorer.explore_data_with_spark(overview=True)

Rows: 49998, Columns: ['Invoice Date', 'Invoice ID', 'Customer Type', 'Customer Name', 'City', 'Customer Gender', 'Employee Name', 'Manager Name', 'Product Name', 'Product Category', 'Channel', 'Customer Satisfaction', 'Total Sales']


In [8]:
# Print a summary-statistics table (count, mean, and further rows) covering every column.
explorer.explore_data_with_spark(summary=True)

+-------+------------+----------+---------------+----------------+---------------+---------------+--------------------+-------------+----------------+----------------+-------+---------------------+------------------+
|summary|Invoice Date|Invoice ID|  Customer Type|   Customer Name|           City|Customer Gender|       Employee Name| Manager Name|    Product Name|Product Category|Channel|Customer Satisfaction|       Total Sales|
+-------+------------+----------+---------------+----------------+---------------+---------------+--------------------+-------------+----------------+----------------+-------+---------------------+------------------+
|  count|       49998|     49998|          49998|           49998|          49998|          49998|               49998|        49998|           49998|           49998|  49998|                49998|             49998|
|   mean|        NULL|      NULL|           NULL|            NULL|           NULL|           NULL|                NULL|         NULL

In [9]:
# Print a single-row table with one count per column - presumably the number of
# null/missing values in each; confirm the exact definition in DataExplorer.
explorer.explore_data_with_spark(nulls=True)

+------------+----------+-------------+-------------+----+---------------+-------------+------------+------------+----------------+-------+---------------------+-----------+
|Invoice Date|Invoice ID|Customer Type|Customer Name|City|Customer Gender|Employee Name|Manager Name|Product Name|Product Category|Channel|Customer Satisfaction|Total Sales|
+------------+----------+-------------+-------------+----+---------------+-------------+------------+------------+----------------+-------+---------------------+-----------+
|           0|         0|            0|            0|   0|              0|            0|           0|           0|               0|      0|                    0|          0|
+------------+----------+-------------+-------------+----+---------------+-------------+------------+------------+----------------+-------+---------------------+-----------+



In [10]:
# Print the number of duplicate rows found in the DataFrame.
# NOTE(review): how a duplicate is defined (full-row match vs. key columns) is
# decided inside DataExplorer - confirm there.
explorer.explore_data_with_spark(duplicates=True)

Duplicate Rows: 0


In [11]:
# Print a value-count table column by column; each table shows only the top 20 rows.
# NOTE(review): this also runs on identifier-like columns such as 'Invoice ID',
# where every count is 1, so that part of the output carries little information.
explorer.explore_data_with_spark(value_counts=True)


--- Invoice Date ---
+------------+-----+
|Invoice Date|count|
+------------+-----+
|  11/12/2023|   59|
|    8/1/2020|   53|
|   6/25/2023|   53|
|   10/7/2020|   52|
|    7/3/2020|   52|
|  12/27/2020|   51|
|    8/2/2021|   51|
|   8/20/2020|   50|
|   2/27/2020|   50|
|    7/3/2023|   50|
|   5/15/2022|   49|
|  12/30/2021|   49|
|    3/4/2021|   49|
|   9/15/2022|   49|
|   7/30/2022|   49|
|   5/22/2020|   48|
|  10/22/2020|   48|
|    6/9/2021|   48|
|  11/15/2022|   48|
|    8/3/2021|   48|
+------------+-----+
only showing top 20 rows


--- Invoice ID ---
+----------+-----+
|Invoice ID|count|
+----------+-----+
| ID-242374|    1|
|  ID-88813|    1|
| ID-306624|    1|
|  ID-72919|    1|
|  ID-61245|    1|
|   ID-1038|    1|
| ID-198702|    1|
| ID-297519|    1|
| ID-458725|    1|
| ID-158345|    1|
| ID-101394|    1|
| ID-153240|    1|
| ID-346847|    1|
| ID-170627|    1|
| ID-340587|    1|
| ID-219963|    1|
| ID-141305|    1|
| ID-138569|    1|
| ID-447967|    1|
| ID-25503