# ETL Extraction Notebook


## Libraries

In [2]:
import pandas as pd
import numpy as np
import os
from datetime import datetime

## Loading Raw Data
- using the head and shape we can see that the data has 100 rows and 7 columns

In [3]:
# Read the raw orders extract from disk into a DataFrame.
rawData = pd.read_csv('data/raw_data.csv')
# Show the leading records, then the (rows, columns) dimensions on the next line.
print(rawData.head(), rawData.shape, sep='\n')

   order_id customer_name product  quantity  unit_price  order_date region
0         1         Diana  Tablet       NaN       500.0  2024-01-20  South
1         2           Eve  Laptop       NaN         NaN  2024-04-29  North
2         3       Charlie  Laptop       2.0       250.0  2024-01-08    NaN
3         4           Eve  Laptop       2.0       750.0  2024-01-07   West
4         5           Eve  Tablet       3.0         NaN  2024-03-07  South
(100, 7)


In [4]:
# DataFrame.info() writes its summary (dtypes, non-null counts, memory usage)
# straight to stdout and returns None, so it is called directly here;
# wrapping it in print() would append a stray "None" line to the output.
rawData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   order_id       100 non-null    int64  
 1   customer_name  99 non-null     object 
 2   product        100 non-null    object 
 3   quantity       74 non-null     float64
 4   unit_price     65 non-null     float64
 5   order_date     99 non-null     object 
 6   region         75 non-null     object 
dtypes: float64(2), int64(1), object(4)
memory usage: 5.6+ KB
None


## Extraction Time

In [None]:
# Capture the current local time, formatted as YYYY-MM-DD HH:MM:SS,
# and report it as the moment this analysis was performed.
extraction_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"Analysis performed on: {extraction_time}")

Analysis performed on: 2025-06-24 19:58:07


## Data Analysis
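- Using `info()`, `isnull().sum()` and `duplicated().sum()`, we can see that several columns contain null values (most notably `quantity`, `unit_price` and `region`) and that the raw data contains 1 duplicated row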

In [None]:
# DataFrame.info() prints its own report and returns None; calling it
# directly avoids the stray "None" line that print(rawData.info()) produces.
rawData.info()
print(rawData.isnull().sum())  # number of missing values in each column
print("duplicates:")
print(rawData.duplicated().sum())  # number of rows that repeat an earlier row in full

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   order_id       100 non-null    int64  
 1   customer_name  99 non-null     object 
 2   product        100 non-null    object 
 3   quantity       74 non-null     float64
 4   unit_price     65 non-null     float64
 5   order_date     99 non-null     object 
 6   region         75 non-null     object 
dtypes: float64(2), int64(1), object(4)
memory usage: 5.6+ KB
None
order_id          0
customer_name     1
product           0
quantity         26
unit_price       35
order_date        1
region           25
dtype: int64
duplicates:
1


## Loading Incremental Data
- we can see that the incremental data has 10 rows and 7 columns

In [7]:
# Read the incremental batch of orders from disk into a DataFrame.
incrementalData = pd.read_csv('data/incremental_data.csv')
# Preview the leading records, followed by the (rows, columns) dimensions.
print(incrementalData.head(), incrementalData.shape, sep='\n')

   order_id customer_name product  quantity  unit_price  order_date   region
0       101         Alice  Laptop       NaN       900.0  2024-05-09  Central
1       102           NaN  Laptop       1.0       300.0  2024-05-07  Central
2       103           NaN  Laptop       1.0       600.0  2024-05-04  Central
3       104           NaN  Tablet       NaN       300.0  2024-05-26  Central
4       105         Heidi  Tablet       2.0       600.0  2024-05-21    North
(10, 7)


## Incremental Data Analysis
- we can see that the incremental data still has some null values

In [8]:
# DataFrame.info() prints its own report and returns None; calling it
# directly avoids the stray "None" line that print(incrementalData.info()) produces.
incrementalData.info()
print(incrementalData.isnull().sum())  # number of missing values in each column
print("duplicates in incremental data:")
print(incrementalData.duplicated().sum())  # number of rows that repeat an earlier row in full

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   order_id       10 non-null     int64  
 1   customer_name  4 non-null      object 
 2   product        10 non-null     object 
 3   quantity       6 non-null      float64
 4   unit_price     10 non-null     float64
 5   order_date     10 non-null     object 
 6   region         8 non-null      object 
dtypes: float64(2), int64(1), object(4)
memory usage: 692.0+ bytes
None
order_id         0
customer_name    6
product          0
quantity         4
unit_price       0
order_date       0
region           2
dtype: int64
duplicates in incremental data:
0


## Key Observations
- Raw Data (100 rows):  
  - `quantity`: 26% missing  
  - `unit_price`: 35% missing  
  - `region`: 25% missing  
  - `customer_name`: 1 missing  
  - `order_date`: 1 missing  
  - 1 duplicated row  
- Incremental Data (10 rows):  
  - `customer_name`: 60% missing  
  - `quantity`: 40% missing  
  - `region`: 20% missing  
- Numeric: `order_id`, `quantity`, `unit_price`  
- Text/Date: `customer_name`, `product`, `region`, `order_date` (needs conversion to `datetime`)  

#### Saving a copy of the data

In [9]:
# Destination files for the copies of the two extracts.
processed_raw_path = 'data/copy_raw_data.csv'
processed_incremental_path = 'data/copy_incremental_data.csv'

# Write each frame to its destination without the index column:
# raw data first, then incremental data.
for frame, destination in (
    (rawData, processed_raw_path),
    (incrementalData, processed_incremental_path),
):
    frame.to_csv(destination, index=False)
