# 02 – Data Ingestion

This notebook demonstrates enhanced data ingestion capabilities including:
- Automated data profiling
- Data sampling for large datasets
- Data quality checks during ingestion
- Progress monitoring
- Source data verification

In [1]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import json
import sys
import os
sys.path.insert(0, os.path.abspath('..'))

  This is separate from the ipykernel package so we can avoid doing imports until


In [2]:
import sys
import os

# Make the parent of the working directory importable so that project
# packages living one level above the notebook folder can be imported.
# The membership check keeps the cell idempotent: re-running it (or running
# it after another cell that already registered the same directory) no longer
# stacks duplicate entries onto sys.path.
_project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if _project_root not in sys.path:
    sys.path.append(_project_root)

In [3]:
from datetime import datetime
from scripts.ingestion import ingest_csv, ingest_api

# Locations of the raw inputs. Everything hangs off the shared raw-data root,
# partitioned by a fixed date folder that already exists on disk.
date_partition = "20250821"  # Using existing data folder instead of current date
raw_root = os.path.join('..', 'data', 'raw')
source_csv_dir = os.path.join(raw_root, 'source_csv', date_partition)
source_api_file = os.path.join(raw_root, 'source_api', date_partition, 'web_logs.jsonl')

# Run both ingestion steps: first the CSV directory, then the API log file.
ingest_csv(source_csv_dir, raw_root)
ingest_api(source_api_file, raw_root)

print('Ingestion complete.')

2025-08-24 18:05:14 - ingest_csv - INFO - Ingested customers.csv with 5000 rows to ingested_20250824_180514_customers.csv
2025-08-24 18:05:14 - ingest_csv - INFO - Ingested telco_train.csv with 0 rows to ingested_20250824_180514_telco_train.csv
2025-08-24 18:05:14 - ingest_csv - INFO - Ingested telco_train.csv with 0 rows to ingested_20250824_180514_telco_train.csv
2025-08-24 18:05:14 - ingest_csv - INFO - Ingested transactions.csv with 24911 rows to ingested_20250824_180514_transactions.csv
2025-08-24 18:05:14 - ingest_api - INFO - Ingested web_logs.jsonl with 25099 events to ingested_20250824_180514_web_logs.jsonl
2025-08-24 18:05:14 - ingest_csv - INFO - Ingested transactions.csv with 24911 rows to ingested_20250824_180514_transactions.csv
2025-08-24 18:05:14 - ingest_api - INFO - Ingested web_logs.jsonl with 25099 events to ingested_20250824_180514_web_logs.jsonl


Ingestion complete.


In [4]:
def profile_dataset(df, title):
    """Build a profiling report for ``df``.

    Args:
        df: DataFrame to profile.
        title: Title passed through to the report.

    Returns:
        The ``ProfileReport`` instance, created with ``explorative=True``.
    """
    return ProfileReport(df, title=title, explorative=True)

def sample_large_dataset(file_path, sample_size=10000, random_state=42):
    """Load a CSV, down-sampling it to ``sample_size`` rows when it is larger.

    The file is scanned once to count its lines. If it holds no more than
    ``sample_size`` data rows it is read in full; otherwise exactly
    ``sample_size`` data rows are chosen uniformly at random (without
    replacement) and only those rows are parsed.

    Args:
        file_path: Path to a CSV file whose first line is the header.
        sample_size: Maximum number of data rows to return.
        random_state: Seed for the row selection. The same seed on the same
            file yields the same sample; ``None`` gives a fresh random sample.

    Returns:
        pandas.DataFrame with at most ``sample_size`` rows, in file order.
    """
    # Count physical lines in binary mode: no decoding is needed just to count
    # newlines, and the context manager guarantees the handle is closed.
    # NOTE(review): this assumes one record per physical line; quoted fields
    # containing embedded newlines would inflate the count - confirm inputs.
    with open(file_path, 'rb') as handle:
        total_rows = sum(1 for _ in handle) - 1  # subtract header

    if total_rows <= sample_size:
        return pd.read_csv(file_path)

    # Pick exactly `sample_size` distinct data-row line numbers (1-based,
    # because line 0 is the header, which must always be kept).
    rng = np.random.default_rng(random_state)
    keep = set((rng.choice(total_rows, size=sample_size, replace=False) + 1).tolist())

    # Everything that was not selected is skipped at parse time.
    skip_indices = [line_no for line_no in range(1, total_rows + 1) if line_no not in keep]

    return pd.read_csv(file_path, skiprows=skip_indices)

def verify_data_schema(df, expected_schema):
    """Compare a DataFrame against an expected schema description.

    Args:
        df: DataFrame to check.
        expected_schema: Mapping with a ``'columns'`` entry (iterable of
            expected column names) and a ``'dtypes'`` entry (mapping of
            column name to expected dtype).

    Returns:
        A list of human-readable issue strings: missing columns first, then
        extra columns, then one entry per dtype mismatch. The list is empty
        when nothing is wrong.
    """
    wanted_cols = set(expected_schema['columns'])
    present_cols = set(df.columns)

    absent = wanted_cols - present_cols
    unexpected = present_cols - wanted_cols

    problems = []
    if absent:
        problems.append(f"Missing columns: {absent}")
    if unexpected:
        problems.append(f"Extra columns: {unexpected}")

    # Dtypes are only compared for columns that actually exist in the frame;
    # absent columns have already been reported above.
    problems.extend(
        f"Column {name} has dtype {df[name].dtype}, expected {wanted}"
        for name, wanted in expected_schema['dtypes'].items()
        if name in df.columns
        and not pd.api.types.is_dtype_equal(df[name].dtype, wanted)
    )

    return problems

def plot_data_distribution(df):
    """Draw one distribution plot per numerical and per categorical column.

    Columns of dtype ``int64``/``float64`` get a histogram; columns of dtype
    ``object``/``category`` get a bar chart of their value counts. Each group
    is rendered as its own figure with one subplot row per column. A group
    with no matching columns produces no figure.

    Args:
        df: DataFrame whose columns should be plotted.
    """
    numeric_names = df.select_dtypes(include=['int64', 'float64']).columns
    category_names = df.select_dtypes(include=['object', 'category']).columns

    # Histograms for the numerical columns.
    n_numeric = len(numeric_names)
    if n_numeric > 0:
        # squeeze=False always yields a 2-D axes grid, so a single column
        # needs no special-casing.
        fig, grid = plt.subplots(n_numeric, 1, figsize=(12, 4*n_numeric), squeeze=False)
        for panel, name in zip(grid[:, 0], numeric_names):
            sns.histplot(data=df, x=name, ax=panel)
            panel.set_title(f'Distribution of {name}')
        plt.tight_layout()
        plt.show()

    # Bar charts for the categorical columns.
    n_category = len(category_names)
    if n_category > 0:
        fig, grid = plt.subplots(n_category, 1, figsize=(12, 4*n_category), squeeze=False)
        for panel, name in zip(grid[:, 0], category_names):
            counts = df[name].value_counts()
            if counts.empty:
                # Nothing to draw: leave a titled, axis-less placeholder.
                panel.set_title(f'Distribution of {name} (no data)')
                panel.axis('off')
            else:
                counts.plot(kind='bar', ax=panel)
                panel.set_title(f'Distribution of {name}')
                panel.tick_params(axis='x', rotation=45)
        plt.tight_layout()
        plt.show()

In [5]:
# Expected schemas, keyed by dataset name (the CSV file name without its
# extension). Each entry holds the expected column order under 'columns' and
# the expected dtype per column under 'dtypes'. The column list is derived
# from the dtype mapping, whose keys are written in the expected column order.
schemas = {
    dataset: {'columns': list(dtype_map), 'dtypes': dtype_map}
    for dataset, dtype_map in (
        ('customers', {
            'customer_id': 'object',
            'gender': 'object',
            'senior_citizen': 'int64',
            'partner': 'object',
            'dependents': 'object',
            'tenure_months': 'int64',
            'monthly_charges': 'float64',
            'total_charges': 'float64',
            'contract': 'object',
            'internet_service': 'object',
            'phone_service': 'object',
            'churn': 'object',
        }),
        ('transactions', {
            'transaction_id': 'object',
            'customer_id': 'object',
            'transaction_date': 'object',
            'amount': 'float64',
        }),
    )
}

In [6]:
# Define paths
date_partition = "20250821"  # Using existing data folder
source_csv_dir = os.path.join('..', 'data', 'raw', 'source_csv', date_partition)
source_api_file = os.path.join('..', 'data', 'raw', 'source_api', date_partition, 'web_logs.jsonl')
raw_root = os.path.join('..', 'data', 'raw')

# Process CSV files with profiling and validation.
# os.listdir returns entries in arbitrary order, so sort them to make the
# processing order (and therefore the cell output) reproducible across runs.
for filename in sorted(os.listdir(source_csv_dir)):
    if not filename.endswith('.csv'):
        continue

    print(f"\nProcessing {filename}:")
    file_path = os.path.join(source_csv_dir, filename)

    # Sample data if file is large
    print("Loading and sampling data...")
    df = sample_large_dataset(file_path)

    # Verify schema. Strip only the trailing extension: str.replace would also
    # remove any '.csv' occurring in the middle of the file name.
    dataset_name = os.path.splitext(filename)[0]
    if dataset_name in schemas:
        print("Verifying schema...")
        issues = verify_data_schema(df, schemas[dataset_name])
        if issues:
            print("Schema issues found:")
            for issue in issues:
                print(f"- {issue}")
        else:
            print("Schema verification passed!")

    # Generate and save profile report only if DataFrame is not empty
    if not df.empty:
        print("Generating profile report...")
        profile = profile_dataset(df, f"Data Profile - {filename}")
        profile_path = os.path.join(raw_root, f"{dataset_name}_profile.html")
        profile.to_file(profile_path)
        print(f"Profile report saved to {profile_path}")
    else:
        print(f"Skipped profiling for {filename}: DataFrame is empty.")

    # Plot distributions
    print("Plotting distributions...")
    plot_data_distribution(df)

# Ingest the data using existing functions
# NOTE(review): an earlier cell in this notebook already calls ingest_csv and
# ingest_api on the same paths, and the logged output file names carry a
# timestamp, so this second run presumably writes another full set of
# ingested files - confirm whether both runs are intended.
from scripts.ingestion import ingest_csv, ingest_api

print("\nIngesting CSV data...")
ingest_csv(source_csv_dir, raw_root)

print("\nIngesting API data...")
ingest_api(source_api_file, raw_root)

print('\nIngestion complete with enhanced profiling and validation!')


Processing customers.csv:
Loading and sampling data...
Verifying schema...
Schema verification passed!
Generating profile report...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Profile report saved to ../data/raw/customers_profile.html
Plotting distributions...





Processing telco_train.csv:
Loading and sampling data...
Skipped profiling for telco_train.csv: DataFrame is empty.
Plotting distributions...

Processing transactions.csv:
Loading and sampling data...
Verifying schema...
Schema verification passed!
Generating profile report...




Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Profile report saved to ../data/raw/transactions_profile.html
Plotting distributions...


2025-08-24 18:05:59 - ingest_csv - INFO - Ingested customers.csv with 5000 rows to ingested_20250824_180559_customers.csv
2025-08-24 18:05:59 - ingest_csv - INFO - Ingested telco_train.csv with 0 rows to ingested_20250824_180559_telco_train.csv
2025-08-24 18:05:59 - ingest_csv - INFO - Ingested customers.csv with 5000 rows to ingested_20250824_180559_customers.csv
2025-08-24 18:05:59 - ingest_csv - INFO - Ingested telco_train.csv with 0 rows to ingested_20250824_180559_telco_train.csv
2025-08-24 18:05:59 - ingest_csv - INFO - Ingested transactions.csv with 24911 rows to ingested_20250824_180559_transactions.csv
2025-08-24 18:05:59 - ingest_api - INFO - Ingested web_logs.jsonl with 25099 events to ingested_20250824_180559_web_logs.jsonl
2025-08-24 18:05:59 - ingest_csv - INFO - Ingested transactions.csv with 24911 rows to ingested_20250824_180559_transactions.csv
2025-08-24 18:05:59 - ingest_api - INFO - Ingested web_logs.jsonl with 25099 events to ingested_20250824_180559_web_logs.json


Ingesting CSV data...

Ingesting API data...

Ingestion complete with enhanced profiling and validation!
