# 🧩 Notebook 01: Data Loading & Inspection with pandas

> Load and inspect 10,000-row realistic datasets from CSV, JSON, Excel (single & multi-sheet), and Parquet using utility scripts and pandas best practices.

In [1]:
# Notebook import setup
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

from scripts import utils_io

In [2]:
# Standard Imports
import pandas as pd
import numpy as np
from pathlib import Path

# Display options
pd.set_option('display.max_columns', 100)
pd.set_option('display.precision', 2)

DATA_DIR = Path("../data")

## 📦 Dataset Overview

We now have 10,000-row mock datasets in various formats:

| Dataset               | Format      | Filename                      |
|-----------------------|-------------|-------------------------------|
| Superstore Sales      | CSV         | `superstore_sales.csv`        |
| Weather Data          | JSON        | `weather_data.json`           |
| Bank Loan Data        | Excel       | `bank_loans.xlsx`             |
| Bank Loans by Region  | Excel (multi-sheet) | `bank_loans_multisheet.xlsx` |
| COVID Data            | Parquet     | `covid_data.parquet`          |

In [3]:
# List all data files for confirmation
list(DATA_DIR.glob('*'))

[PosixPath('../data/covid_data.parquet'),
 PosixPath('../data/bank_loans_multisheet.xlsx'),
 PosixPath('../data/weather_data.json'),
 PosixPath('../data/bank_loans.xlsx'),
 PosixPath('../data/superstore_sales.csv')]

## 🧪 Data Loading: CSV, Excel, JSON, Parquet
We'll load all formats using both raw pandas and utility functions (`scripts/utils_io.py`)

In [4]:
# Load All Datasets
superstore_df = utils_io.load_csv(DATA_DIR / 'superstore_sales.csv')
weather_df = utils_io.load_json(DATA_DIR / 'weather_data.json')
loan_df = utils_io.load_excel(DATA_DIR / 'bank_loans.xlsx')
covid_df = utils_io.load_parquet(DATA_DIR / 'covid_data.parquet')

In [5]:
# Load Multi-Sheet Excel
multi_excel = pd.ExcelFile(DATA_DIR / "bank_loans_multisheet.xlsx")
print("Available Sheets:", multi_excel.sheet_names)

loan_east_df = multi_excel.parse("East")
loan_east_df.head()

Available Sheets: ['East', 'West', 'North', 'South']


Unnamed: 0,Customer_ID,Customer_Name,Age,Income,Loan_Amount,Loan_Purpose,Approved,Region
0,1001,James Reid,55,96964,36890,Education,Yes,East
1,1002,Howard Mathis,61,59846,56517,Education,Yes,East
2,1003,Karen Johnson,23,43879,42857,Home,Yes,East
3,1004,Jose Lee,54,27261,51149,Medical,No,East
4,1005,Danielle Caldwell,58,53340,17337,Vacation,Yes,East


## 🔍 Basic Inspection and Metadata
We'll inspect rows, columns, datatypes, nulls, and memory usage.

In [6]:
for name, df in zip(
    ["Superstore", "Weather", "Loan", "Loan-East", "COVID"],
    [superstore_df, weather_df, loan_df, loan_east_df, covid_df]
):
    print(f"\n📊 {name} Dataset")
    print(df.info())
    print("Shape:", df.shape)
    print("Memory (MB):", df.memory_usage(deep=True).sum() / 1024**2)
    print("-" * 60)



📊 Superstore Dataset
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Order ID       10000 non-null  object 
 1   Customer ID    10000 non-null  object 
 2   Customer Name  10000 non-null  object 
 3   Segment        10000 non-null  object 
 4   Region         10000 non-null  object 
 5   Order Date     10000 non-null  object 
 6   Ship Date      10000 non-null  object 
 7   Category       10000 non-null  object 
 8   Sub-Category   10000 non-null  object 
 9   Product Name   10000 non-null  object 
 10  Sales          10000 non-null  float64
 11  Quantity       10000 non-null  int64  
 12  Discount       10000 non-null  float64
 13  Profit         10000 non-null  float64
dtypes: float64(3), int64(1), object(10)
memory usage: 1.1+ MB
None
Shape: (10000, 14)
Memory (MB): 6.683284759521484
------------------------------------------------

## 🔄 Type Conversion & Date Parsing
- Dates: `pd.to_datetime()`
- Categoricals: `astype('category')`

In [7]:
# Type Conversion and Date parsing example
superstore_df['Order Date'] = pd.to_datetime(superstore_df['Order Date'], errors='coerce')
superstore_df['Ship Date'] = pd.to_datetime(superstore_df['Ship Date'], errors='coerce')

superstore_df['Category'] = superstore_df['Category'].astype('category')
superstore_df['Sub-Category'] = superstore_df['Sub-Category'].astype('category')

## 🧼 5. Rename Columns & Cleanup

In [8]:
# Clean column names for consistency
superstore_df.columns = (
    superstore_df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
    .str.replace("-", "_")
)
superstore_df.head()

Unnamed: 0,order_id,customer_id,customer_name,segment,region,order_date,ship_date,category,sub_category,product_name,sales,quantity,discount,profit
0,ORD-10000,CUST-9476,Mr. Michael Lopez,Home Office,Central,2020-01-01,2020-01-03,Furniture,Bookcases,Bookcases Model 1,1292.63,5,0.3,22.94
1,ORD-10001,CUST-9162,Robert Liu,Home Office,South,2020-01-02,2020-01-04,Furniture,Bookcases,Bookcases Model 2,1947.16,2,0.3,205.52
2,ORD-10002,CUST-3824,Nicole Bowman,Consumer,South,2020-01-03,2020-01-05,Furniture,Bookcases,Bookcases Model 3,1774.42,3,0.2,23.84
3,ORD-10003,CUST-8888,Stephen Flores,Consumer,Central,2020-01-04,2020-01-06,Furniture,Bookcases,Bookcases Model 4,591.01,8,0.0,64.85
4,ORD-10004,CUST-9980,Stephen Rodriguez,Corporate,South,2020-01-05,2020-01-07,Furniture,Bookcases,Bookcases Model 5,1969.55,4,0.2,58.99


## 📁 6. Save Intermediate Cleaned Version

In [9]:
# Save cleaned version to assets/
from scripts.utils_io import save_csv
save_csv(superstore_df, "../assets/superstore_cleaned.csv")

## Weather Data - JSON

In [10]:
# Inspect
weather_df.info()
print("Shape:", weather_df.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           10000 non-null  datetime64[ns]
 1   temperature_c  10000 non-null  int64         
 2   humidity       10000 non-null  int64         
 3   condition      10000 non-null  object        
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 312.6+ KB
Shape: (10000, 4)


In [11]:
# Exploring the dataset
weather_df.head()

Unnamed: 0,date,temperature_c,humidity,condition
0,2022-01-01,28,81,Snow
1,2022-01-02,4,90,Snow
2,2022-01-03,28,53,Cloudy
3,2022-01-04,11,82,Rain
4,2022-01-05,13,59,Rain


In [12]:
# Type Conversion
weather_df['date'] = pd.to_datetime(weather_df['date'], errors='coerce')
weather_df['condition'] = weather_df['condition'].astype('category')

In [13]:
# Column Cleanup
weather_df.columns = weather_df.columns.str.strip().str.lower().str.replace(" ", "_")

In [14]:
# Save
save_csv(weather_df, '../assets/weather_cleaned.csv')

## Bank Loan Data - Excel

In [15]:
# Inspect
loan_df.info()
print("Shape:", loan_df.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Customer_ID    10000 non-null  int64 
 1   Customer_Name  10000 non-null  object
 2   Age            10000 non-null  int64 
 3   Income         10000 non-null  int64 
 4   Loan_Amount    10000 non-null  int64 
 5   Loan_Purpose   10000 non-null  object
 6   Approved       10000 non-null  object
dtypes: int64(4), object(3)
memory usage: 547.0+ KB
Shape: (10000, 7)


In [16]:
# Exploring the dataset
loan_df.head()

Unnamed: 0,Customer_ID,Customer_Name,Age,Income,Loan_Amount,Loan_Purpose,Approved
0,1001,James Reid,55,96964,36890,Education,Yes
1,1002,Howard Mathis,61,59846,56517,Education,Yes
2,1003,Karen Johnson,23,43879,42857,Home,Yes
3,1004,Jose Lee,54,27261,51149,Medical,No
4,1005,Danielle Caldwell,58,53340,17337,Vacation,Yes


In [17]:
# Type Conversion
loan_df['Loan_Amount'] = pd.to_numeric(loan_df['Loan_Amount'], errors='coerce')
loan_df['Income'] = pd.to_numeric(loan_df['Income'], errors='coerce')
loan_df['Approved'] = loan_df['Approved'].astype('category')
loan_df['Loan_Purpose'] = loan_df['Loan_Purpose'].astype('category')

In [18]:
# Column Cleanup
loan_df.columns = loan_df.columns.str.strip().str.lower().str.replace(" ", "_")

In [19]:
# Save
save_csv(loan_df, '../assets/loan_cleaned.csv')

## Multi-Sheet Bank Loans - Excel

In [20]:
# Load Excel file
multi_sheet_excel = pd.ExcelFile(DATA_DIR / 'bank_loans_multisheet.xlsx')
print("Sheets found:", multi_sheet_excel.sheet_names)

Sheets found: ['East', 'West', 'North', 'South']


In [21]:
# Loop through each region and save cleaned version
for sheet in multi_sheet_excel.sheet_names:
    regional_df = multi_sheet_excel.parse(sheet_name=sheet)

    # Type Conversion
    regional_df['Loan_Amount'] = pd.to_numeric(regional_df['Loan_Amount'], errors='coerce')
    regional_df['Income'] = pd.to_numeric(regional_df['Income'], errors='coerce')
    regional_df['Approved'] = regional_df['Approved'].astype('category')
    regional_df['Loan_Purpose'] = regional_df['Loan_Purpose'].astype('category')

    # Column Cleanup
    regional_df.columns = regional_df.columns.str.strip().str.lower().str.replace(" ", "_")

    # Save
    save_csv(regional_df, f'../assets/loan_cleaned_{sheet.lower()}.csv')

## Covid Data -- Parquet

In [22]:
# Inspect
covid_df.info()
print("Shape:", covid_df.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          10000 non-null  datetime64[ns]
 1   country       10000 non-null  object        
 2   variant       10000 non-null  object        
 3   new_cases     10000 non-null  int64         
 4   new_deaths    10000 non-null  int64         
 5   hospitalized  10000 non-null  int64         
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 468.9+ KB
Shape: (10000, 6)


In [23]:
# Exploring the dataset
covid_df.head()

Unnamed: 0,date,country,variant,new_cases,new_deaths,hospitalized
0,2020-01-01,Germany,Omicron,512,10,613
1,2020-01-02,Germany,XBB,524,11,1911
2,2020-01-03,Canada,Omicron,502,9,1059
3,2020-01-04,Brazil,XBB,470,10,224
4,2020-01-05,USA,BA.5,497,16,2239


In [24]:
# Type Conversion
covid_df['date'] = pd.to_datetime(covid_df['date'], errors='coerce')
covid_df['country'] = covid_df['country'].astype('category')
covid_df['variant'] = covid_df['variant'].astype('category')

In [25]:
# Column Cleanup
covid_df.columns = covid_df.columns.str.strip().str.lower().str.replace(" ", "_")

In [26]:
# Save
save_csv(covid_df, '../assets/covid_cleaned.csv')

---
## ✅ Summary

We successfully:

- Loaded **five large datasets** from CSV, JSON, Excel (single & multi-sheet), and Parquet
- Inspected structure, shape, and memory usage
- Performed type conversions (e.g., dates, categoricals)
- Cleaned column names into `snake_case`
- Saved preprocessed versions to `assets/` for downstream analysis

📦 Next: We’ll dive into **data cleaning** — missing values, outliers, deduplication — in `02_data_cleaning.ipynb`.
