# URL Data Loading Test

**Author:** Data Science Essentials Project  
**Date:** September 23, 2025  
**Purpose:** Testing URL data loading functionality with automatic caching

This notebook demonstrates and tests the URL data loading functionality recently added to the `PandasSource` class.

## Prerequisites

**Before running this notebook**, make sure you have:

- Set up the project environment with required dependencies
- Network access for downloading test datasets from web URLs
- The updated `PandasSource` class with URL support

**Features Tested:**
- Direct CSV loading from web URLs
- Automatic file caching with concurrent access safety
- Performance improvements from caching
- Error handling for invalid URLs
- Support for datasets with and without headers

---

In [None]:
# Add project root to Python path
import sys
import os
import time
from pathlib import Path

# Resolve the project root so that project packages are importable whether
# the kernel starts in notebooks/exploratory, in the project root, or in a
# direct child of the project root.
notebook_dir = Path(os.getcwd())
in_notebook_folder = (
    notebook_dir.name == 'exploratory'
    and notebook_dir.parent.name == 'notebooks'
)
if in_notebook_folder:
    # Started inside notebooks/exploratory: the root is two levels up
    project_root = notebook_dir.parents[1]
else:
    # CI or another working directory: take the first candidate that contains
    # notebooks/exploratory, otherwise fall back to two levels up
    candidate_roots = (notebook_dir, notebook_dir.parent)
    project_root = next(
        (
            candidate
            for candidate in candidate_roots
            if (candidate / 'notebooks' / 'exploratory').exists()
        ),
        notebook_dir.parent.parent,
    )

# Register the root on sys.path once; re-running the cell adds no duplicate
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.append(root_entry)

print(f"Project root: {project_root}")

# Import PandasSource from the project package; this relies on project_root
# having been added to sys.path earlier in this cell
from src.data.sources.pandas_source import PandasSource
import pandas as pd

# Only reached when both imports above succeeded
print("Environment configured successfully")

## 1. Setup and Environment Configuration

In [None]:
# Test URL - Iris dataset from UCI repository
iris_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

# Column names are supplied explicitly because the Iris file has no header row
iris_column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']

print(f"Loading data from: {iris_url}")

# Time the load with a monotonic, high-resolution clock: the wall clock
# (time.time) can jump if the system time is adjusted mid-measurement.
# NOTE(review): this only measures a real download when the URL is not
# already cached — presumably the first run populates the cache; confirm.
start_time = time.perf_counter()
data_source = PandasSource(
    file_path=iris_url,
    header=False,  # No header in the file
    names=iris_column_names
)
download_time = time.perf_counter() - start_time

print(f"Data loaded successfully in {download_time:.2f} seconds")

In [None]:
# Summarise the loaded frame, then preview its first rows
loaded_df = data_source.df
print(f"Dataset shape: {loaded_df.shape}")
print(f"Dataset columns: {list(loaded_df.columns)}")
data_source.head()

In [None]:
# Generate descriptive statistics
# Kept as the cell's final expression so the notebook renders the returned value
data_source.describe()

## 3. Caching Performance Analysis

In [None]:
# Load the same URL again - should use cached version
# A monotonic clock is used so the measured interval cannot go negative or
# jump when the system time is adjusted.
start_time = time.perf_counter()
data_source_cached = PandasSource(
    file_path=iris_url,
    header=False,
    names=iris_column_names
)
cached_time = time.perf_counter() - start_time

print(f"Cached load time: {cached_time:.4f} seconds")
# A very fast cached load can measure as 0 seconds on coarse clocks; guard the
# ratio so the cell does not stop with ZeroDivisionError.
if cached_time > 0:
    print(f"Speed improvement: {download_time/cached_time:.1f}x faster")
else:
    print("Speed improvement: cached load was too fast to measure")

In [None]:
# Compare the first load with the reloaded copy, frame against frame
first_load_df = data_source.df
reloaded_df = data_source_cached.df
data_integrity_check = first_load_df.equals(reloaded_df)
print(f"Data integrity verified: {data_integrity_check}")

## 4. Cache Management and Metadata

In [None]:
# Display source metadata
# Kept as the cell's final expression so the notebook renders the attribute
data_source.metadata

In [None]:
# List the CSV files currently stored in the source's cache directory
cache_dir = Path(data_source.cache_dir)
if not cache_dir.exists():
    print(f"Cache directory not found: {cache_dir}")
else:
    csv_files = list(cache_dir.glob("*.csv"))
    print(f"Cache directory: {cache_dir}")
    print(f"Cached files: {len(csv_files)}")
    # One line per cached file: its name and size on disk
    for cached_csv in csv_files:
        print(f"  - {cached_csv.name} ({cached_csv.stat().st_size} bytes)")

## 5. Error Handling and Edge Cases

In [None]:
# Each case pairs an unusable location with the label shown in the report
error_test_cases = [
    ("https://nonexistent-domain-12345.com/data.csv", "Non-existent domain"),
    ("not-a-url", "Invalid URL format"),
    ("", "Empty string"),
]

# Constructing a source from any of these is expected to raise; the exception
# type is reported rather than propagated so every case gets exercised
for test_url, description in error_test_cases:
    try:
        test_source = PandasSource(file_path=test_url)
    except Exception as e:
        print(f"{description}: {type(e).__name__} (Expected)")
    else:
        print(f"{description}: Unexpected success")

## 6. Multiple Data Sources Testing

In [None]:
# Load several public datasets one after another and tally how many succeed
test_urls = [
    {
        "name": "Iris Dataset",
        "url": "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data",
        "columns": ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species'],
        "header": False
    },
    {
        "name": "Tips Dataset",
        "url": "https://raw.githubusercontent.com/plotly/datasets/master/tips.csv",
        "columns": None,
        "header": True
    }
]

successful_loads = 0
total_tests = len(test_urls)

for test_case in test_urls:
    # A failure in one dataset is reported and the loop moves on to the next
    try:
        print(f"Loading {test_case['name']}...")
        # Monotonic clock: immune to system time adjustments during the load
        start_time = time.perf_counter()

        source = PandasSource(
            file_path=test_case['url'],
            header=test_case['header'],
            # Explicit column names are only passed for header-less files
            names=None if test_case['header'] else test_case['columns']
        )

        load_time = time.perf_counter() - start_time
        print(f"  ✓ Shape: {source.df.shape}")
        # Show at most the first three column names, with an ellipsis if truncated
        print(f"  ✓ Columns: {list(source.df.columns)[:3]}{'...' if len(source.df.columns) > 3 else ''}")
        print(f"  ✓ Load time: {load_time:.3f}s")
        print()

        successful_loads += 1

    except Exception as e:
        print(f"  ✗ Failed: {type(e).__name__}: {e}")
        print()

print(f"Summary: {successful_loads}/{total_tests} datasets loaded successfully")

In [None]:
# Test concurrent access to cache (simulate multiple processes)
import threading

def load_iris_data(thread_id):
    """Load the Iris URL through PandasSource and describe the outcome.

    Intended to be called from several threads at once to exercise
    concurrent cache access.

    Args:
        thread_id: Label identifying the caller in the returned message.

    Returns:
        "Thread <id>: Success - Shape <shape>" when the load works, otherwise
        "Thread <id>: Error - <exception>". Any Exception is reported in the
        message instead of being raised.
    """
    label = f"Thread {thread_id}"
    try:
        loaded = PandasSource(
            file_path=iris_url,
            header=False,
            names=iris_column_names
        )
        outcome = f"Success - Shape {loaded.df.shape}"
    except Exception as exc:
        outcome = f"Error - {exc}"
    return f"{label}: {outcome}"

# Prepare three loader threads (ids 1-3); each appends its message to `results`
results = []
threads = [
    threading.Thread(
        target=lambda thread_id=thread_id: results.append(load_iris_data(thread_id))
    )
    for thread_id in range(1, 4)
]

print("Testing concurrent cache access...")
for thread in threads:
    thread.start()

# Block until every loader has finished before reading `results`
for thread in threads:
    thread.join()

print("Concurrent access results:")
for result in results:
    print(f"  {result}")

## 7. Cache Refresh Functionality

In [None]:
# Test cache refresh functionality
# Reports whether the path held in data_source.file_path exists before and
# after refresh_cache(), then shows the shape of the frame afterwards. The
# attribute is read again after the call rather than cached in a local.
# NOTE(review): presumably refresh_cache() re-downloads the URL and rewrites
# the cached file — confirm against PandasSource.
print(f"Before refresh - cached file exists: {data_source.file_path.exists()}")

data_source.refresh_cache()

print(f"After refresh - cached file exists: {data_source.file_path.exists()}")
print(f"Data shape after refresh: {data_source.df.shape}")

In [None]:
# Load a GitHub-hosted CSV that ships with its own header row
tips_url = "https://raw.githubusercontent.com/plotly/datasets/master/tips.csv"

tips_source = PandasSource(file_path=tips_url, header=True)
tips_df = tips_source.df
print(f"Tips dataset shape: {tips_df.shape}")
print(f"Tips dataset columns: {list(tips_df.columns)}")
# Preview the first three rows as the cell's output
tips_source.head(3)

## 8. Summary

This notebook successfully demonstrated the URL data loading functionality in PandasSource:

- **Basic URL Loading**: Direct CSV loading from web URLs with proper column handling
- **Caching Performance**: Significant speed improvements for repeated access
- **Data Integrity**: Cached data maintains perfect consistency with the original
- **Error Handling**: Robust handling of invalid URLs and network errors
- **Multiple Sources**: Support for different CSV formats (with/without headers)
- **Cache Management**: Automatic file caching with refresh capabilities

The implementation provides a seamless API for both local files and remote URLs, making it suitable for production environments.

In [None]:
# Test refresh_cache error handling for local files
# A ValueError raised by any statement in the try body is reported as the
# expected outcome; reaching the final print means no ValueError was raised.
# NOTE(review): presumably refresh_cache() rejects sources that were not
# loaded from a URL — confirm. If data/raw/iris.csv is missing, the
# constructor may raise a different exception type, which this handler
# would not catch.
try:
    local_file_path = project_root / "data" / "raw" / "iris.csv"
    local_source = PandasSource(file_path=str(local_file_path))
    local_source.refresh_cache()
    print("Unexpected success with local file")
except ValueError as e:
    print(f"Expected error for local file: {e}")