# Environment Setup for AG News Text Classification

## Overview

This tutorial provides comprehensive setup instructions for the AG News Text Classification project following best practices from:
- Sculley et al. (2015): "Hidden Technical Debt in Machine Learning Systems"
- Amershi et al. (2019): "Software Engineering for Machine Learning: A Case Study"

### Learning Objectives
1. Set up the development environment with the required dependencies
2. Verify system requirements and GPU availability
3. Configure project paths and environment variables
4. Validate installation with basic tests

Author: Võ Hải Dũng  
Email: vohaidung.work@gmail.com  
Date: 2025

## 1. System Information

In [None]:
# Standard library imports
import os
import sys
import platform
import subprocess
from pathlib import Path
from typing import Dict, List, Optional, Tuple

# Display system information: one "label: value" line per fact, in a fixed order
print("System Information")
print("=" * 50)
system_facts = [
    ("Python version", sys.version),
    ("Python executable", sys.executable),
    ("Platform", platform.platform()),
    ("Machine", platform.machine()),
    ("Processor", platform.processor()),
    ("CPU cores", os.cpu_count()),
]
for label, fact in system_facts:
    print(f"{label}: {fact}")

# Memory information (psutil is optional; print a hint when it is not installed)
try:
    import psutil
    mem = psutil.virtual_memory()
    bytes_per_gib = 1024**3
    print("\nMemory Information:")
    print(f"  Total: {mem.total / bytes_per_gib:.2f} GB")
    print(f"  Available: {mem.available / bytes_per_gib:.2f} GB")
    print(f"  Used: {mem.percent:.1f}%")
except ImportError:
    print("\nNote: Install psutil for memory information")

## 2. Project Structure Setup

In [None]:
# Resolve the project root (two levels above the working directory) and check its layout
PROJECT_ROOT = Path("../..").resolve()
print(f"Project root: {PROJECT_ROOT}")

# Top-level directories the project expects to find
required_dirs = ["src", "configs", "data", "scripts", "tests", "notebooks"]

print("\nVerifying project structure:")
missing_dirs = []
for dir_name in required_dirs:
    if (PROJECT_ROOT / dir_name).exists():
        print(f"  ✓ {dir_name}/")
    else:
        print(f"  ✗ {dir_name}/")
        missing_dirs.append(dir_name)

# Create whatever is absent
if missing_dirs:
    print(f"\nWarning: Missing directories: {', '.join(missing_dirs)}")
    print("Creating missing directories...")
    for dir_name in missing_dirs:
        (PROJECT_ROOT / dir_name).mkdir(parents=True, exist_ok=True)
    print("Directories created successfully.")

# Put the project root first on sys.path (only once)
root_entry = str(PROJECT_ROOT)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)
    print(f"\nAdded {PROJECT_ROOT} to Python path")

## 3. Environment Variables Configuration

In [None]:
# Load environment variables
from pathlib import Path
import os
from typing import Dict

def load_env_file(env_path: Path) -> Dict[str, str]:
    """Load environment variables from a ``.env`` file.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    ignored. Every other line is split on its first ``=``; key and value are
    stripped of surrounding whitespace. In addition:

    * a leading ``export `` on the key is dropped (shell-style files),
    * one pair of matching single or double quotes around the value is removed,
    * lines whose key is empty (e.g. ``=value``) are skipped, because an empty
      name cannot be assigned into ``os.environ``.

    The file is always decoded as UTF-8 so the result does not depend on the
    platform's locale encoding.

    Args:
        env_path: Location of the file to read.

    Returns:
        Mapping of variable name to value. Empty when ``env_path`` does not
        exist. When a key appears more than once, the last occurrence wins.
    """
    env_vars: Dict[str, str] = {}
    if not env_path.exists():
        return env_vars

    with open(env_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith('#') or '=' not in line:
                continue

            key, value = line.split('=', 1)
            key = key.strip()
            if key.startswith('export '):
                key = key[len('export '):].strip()
            if not key:
                continue

            value = value.strip()
            # Strip exactly one pair of matching quotes: KEY="a b" -> a b
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]

            env_vars[key] = value
    return env_vars

# Locate the project's .env file and its template
env_file = PROJECT_ROOT / ".env"
env_example = PROJECT_ROOT / ".env.example"

# Seed .env from the template when only the template is present
if not env_file.exists() and env_example.exists():
    import shutil
    print("Creating .env file from .env.example...")
    shutil.copy(env_example, env_file)
    print("Please update .env file with your configuration.")

# Parse .env (if any) and push every entry into this process's environment
if not env_file.exists():
    print("No .env file found. Using default configuration.")
else:
    env_vars = load_env_file(env_file)
    print(f"Loaded {len(env_vars)} environment variables from .env")

    # Assigns each pair through os.environ, overwriting existing values
    os.environ.update(env_vars)

# Display important environment variables (without sensitive values)
print("\nEnvironment Configuration:")
important_vars = ["PROJECT_NAME", "ENVIRONMENT", "LOG_LEVEL", "CUDA_VISIBLE_DEVICES"]

for var in important_vars:
    value = os.environ.get(var, "Not set")
    # Hide the value of anything that looks like a credential, unless it is unset
    looks_sensitive = any(marker in var for marker in ("KEY", "SECRET"))
    if looks_sensitive and value != "Not set":
        value = "***"
    print(f"  {var}: {value}")

## 4. Dependencies Installation

In [None]:
# Check and install required packages
import subprocess
import importlib.util

def check_package(package_name: str) -> bool:
    """Check whether a module or package can be found without importing it.

    Args:
        package_name: Importable module name (e.g. ``"sklearn"``), not the
            pip distribution name. Dotted names are accepted.

    Returns:
        True when ``importlib.util.find_spec`` locates the module, False
        otherwise. False is also returned when the lookup itself fails:
        ``find_spec`` imports the parent of a dotted name and raises
        ``ModuleNotFoundError`` if that parent is missing, and raises
        ``ValueError`` for an already-loaded module whose ``__spec__`` is
        unset.
    """
    try:
        return importlib.util.find_spec(package_name) is not None
    except (ImportError, ValueError):
        return False

# Packages to verify, grouped by purpose (names as passed to pip)
package_groups = {
    "Core": ["numpy", "pandas", "scipy", "scikit-learn"],
    "Deep Learning": ["torch", "transformers", "datasets", "tokenizers"],
    "Visualization": ["matplotlib", "seaborn", "plotly"],
    "Utilities": ["tqdm", "pyyaml", "python-dotenv", "requests"]
}

# pip names whose importable module is called something else
import_name_overrides = {
    "scikit-learn": "sklearn",
    "python-dotenv": "dotenv",
    "pyyaml": "yaml",
}

print("Package Installation Status:")
print("=" * 50)

missing_packages = []
for group, packages in package_groups.items():
    print(f"\n{group}:")
    for package in packages:
        import_name = import_name_overrides.get(package, package)
        if check_package(import_name):
            print(f"  ✓ {package}")
        else:
            print(f"  ✗ {package}")
            missing_packages.append(package)

if not missing_packages:
    print("\nAll required packages are installed!")
else:
    print(f"\nMissing packages: {', '.join(missing_packages)}")
    print("\nTo install missing packages, run:")
    print(f"pip install {' '.join(missing_packages)}")

## 5. GPU/CUDA Configuration

In [None]:
# Check GPU availability
# Prints the PyTorch/CUDA status, lists each visible GPU, then runs a small
# matrix multiplication on the selected device. Everything runs inside one
# try block so that a missing torch install or any runtime failure is
# reported as a message instead of stopping the notebook.
print("GPU/CUDA Configuration")
print("="*50)

try:
    import torch
    
    cuda_available = torch.cuda.is_available()
    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA available: {cuda_available}")
    
    if cuda_available:
        print(f"CUDA version: {torch.version.cuda}")
        print(f"cuDNN version: {torch.backends.cudnn.version()}")
        print(f"Number of GPUs: {torch.cuda.device_count()}")
        
        # One report per device index
        for i in range(torch.cuda.device_count()):
            props = torch.cuda.get_device_properties(i)
            print(f"\nGPU {i}: {props.name}")
            # total_memory is divided by 1024**3 to display it in GB
            print(f"  Memory: {props.total_memory / (1024**3):.2f} GB")
            print(f"  Compute Capability: {props.major}.{props.minor}")
            
            # Current memory usage
            # NOTE(review): this check repeats the enclosing `if cuda_available`
            # test, so it is expected to be true here.
            if torch.cuda.is_available():
                allocated = torch.cuda.memory_allocated(i) / (1024**3)
                reserved = torch.cuda.memory_reserved(i) / (1024**3)
                print(f"  Current Usage: {allocated:.2f} GB allocated, {reserved:.2f} GB reserved")
    else:
        print("\nNo GPU detected. Training will use CPU.")
        print("For GPU support:")
        print("  1. Ensure NVIDIA GPU is available")
        print("  2. Install CUDA Toolkit")
        print("  3. Install PyTorch with CUDA support")
        
    # Test tensor operations
    # Runs on "cuda" when available, otherwise on "cpu"; `result` is not read
    # afterwards in this cell — the call only has to complete without raising.
    device = "cuda" if cuda_available else "cpu"
    test_tensor = torch.randn(100, 100).to(device)
    result = torch.matmul(test_tensor, test_tensor.t())
    print(f"\nTensor operations on {device}: Success")
    
except ImportError:
    # Raised by `import torch` above when PyTorch is not installed
    print("PyTorch not installed. Install with:")
    print("  pip install torch torchvision torchaudio")
except Exception as e:
    # Any other failure in the checks above is reported, not re-raised
    print(f"Error checking GPU: {e}")

## 6. Project Dependencies Verification

In [None]:
# Verify that each project module imports and exposes the expected component
print("Project Module Verification")
print("=" * 50)

# (dotted module path, attribute that must exist and be truthy on that module)
test_imports = [
    ("src.core.registry", "Registry"),
    ("src.core.factory", "Factory"),
    ("src.data.datasets.ag_news", "AGNewsDataset"),
    ("src.utils.logging_config", "setup_logging"),
    ("configs.config_loader", "ConfigLoader")
]

# module path -> True when the import worked and the component was found
import_status = {}
for module_path, component in test_imports:
    try:
        module = __import__(module_path, fromlist=[component])
        ok = bool(getattr(module, component, None))
        if ok:
            report = f"✓ {module_path}.{component}"
        else:
            report = f"✗ {module_path}.{component} - Component not found"
    except ImportError as e:
        ok = False
        report = f"✗ {module_path} - {str(e)}"
    except Exception as e:
        # Anything other than an import failure raised while loading the module
        ok = False
        report = f"✗ {module_path} - Unexpected error: {str(e)}"
    import_status[module_path] = ok
    print(report)

# Summary
total_count = len(import_status)
success_count = sum(import_status.values())
print(f"\nImport Success Rate: {success_count}/{total_count} ({success_count/total_count*100:.1f}%)")

if success_count >= total_count:
    print("\nAll project modules imported successfully!")
else:
    print("\nSome modules failed to import. Check error messages above.")

## 7. Data Directory Setup

In [None]:
# Setup data directories
from pathlib import Path

DATA_ROOT = PROJECT_ROOT / "data"

# Layout to create under data/: parent directory -> its sub-directories
data_dirs = {
    "raw": ["ag_news"],
    "processed": ["train", "validation", "test", "stratified_folds"],
    "augmented": ["back_translated", "paraphrased", "synthetic", "mixup"],
    "external": ["news_corpus", "pretrain_data"],
    "cache": ["model_cache", "api_cache", "service_cache"]
}

print("Data Directory Setup")
print("=" * 50)

for parent_dir, subdirs in data_dirs.items():
    print(f"\n{parent_dir}/")

    # mkdir is a no-op for directories that already exist (exist_ok=True)
    parent_path = DATA_ROOT / parent_dir
    parent_path.mkdir(parents=True, exist_ok=True)

    for subdir in subdirs:
        (parent_path / subdir).mkdir(parents=True, exist_ok=True)
        print(f"  ✓ {subdir}/")

# Report disk usage for the volume holding DATA_ROOT; failures are only printed
try:
    import shutil
    stat = shutil.disk_usage(DATA_ROOT)
    bytes_per_gib = 1024**3
    used_percent = (stat.used / stat.total) * 100
    print("\nDisk Space:")
    print(f"  Total: {stat.total / bytes_per_gib:.2f} GB")
    print(f"  Free: {stat.free / bytes_per_gib:.2f} GB")
    print(f"  Used: {used_percent:.1f}%")

    # Warn below 10 GB free
    if stat.free < 10 * bytes_per_gib:
        print("\nWarning: Low disk space. Consider freeing up space for model training.")
except Exception as e:
    print(f"Could not check disk space: {e}")

print(f"\nData root directory: {DATA_ROOT}")

## 8. Configuration Files Verification

In [None]:
# Verify configuration files
import yaml
from pathlib import Path

CONFIG_ROOT = PROJECT_ROOT / "configs"

print("Configuration Files Verification")
print("="*50)

# Key configuration files, as (sub-directory of configs/, expected file names)
config_categories = [
    ("environments", ["dev.yaml", "staging.yaml", "prod.yaml"]),
    ("models/single", ["deberta_v3_xlarge.yaml", "roberta_large.yaml"]),
    ("training/standard", ["base_training.yaml"]),
    ("data/preprocessing", ["standard.yaml", "advanced.yaml"]),
    ("api", ["rest_config.yaml", "auth_config.yaml"])
]

# Paths (as strings) of files that parsed to non-empty data / failed to parse.
# Empty and missing files are reported on screen but recorded in neither list.
valid_configs = []
invalid_configs = []

for category, files in config_categories:
    print(f"\n{category}:")
    category_path = CONFIG_ROOT / category

    for file_name in files:
        file_path = category_path / file_name

        if not file_path.exists():
            print(f"  ✗ {file_name} (not found)")
            continue

        try:
            # Decode explicitly as UTF-8 so the result does not depend on the
            # platform's locale encoding. A file that cannot be decoded is
            # treated like one that cannot be parsed.
            with open(file_path, 'r', encoding='utf-8') as f:
                config_data = yaml.safe_load(f)
        except (yaml.YAMLError, UnicodeDecodeError):
            print(f"  ✗ {file_name} (invalid YAML)")
            invalid_configs.append(str(file_path))
            continue

        if config_data:
            print(f"  ✓ {file_name} (valid)")
            valid_configs.append(str(file_path))
        else:
            # safe_load gave a falsy result (e.g. None for an empty document)
            print(f"  ⚠ {file_name} (empty)")

print("\n\nSummary:")
print(f"  Valid configs: {len(valid_configs)}")
print(f"  Invalid configs: {len(invalid_configs)}")

if invalid_configs:
    print("\nInvalid configuration files need attention:")
    for config in invalid_configs:
        print(f"  - {config}")

## 9. Test Basic Functionality

In [None]:
# Test basic functionality
# Four independent smoke tests of project modules. Each runs in its own
# try/except so a failure is printed as a "✗" line and the remaining tests
# still run.
print("Basic Functionality Test")
print("="*50)

# Test 1: Logging
print("\n1. Testing logging system...")
try:
    # get_logger is not called below; importing it still fails this test if
    # the module does not provide that name.
    from src.utils.logging_config import setup_logging, get_logger
    
    logger = setup_logging(
        name="test_logger",
        log_level="INFO",
        log_file=None  # Console only for testing
    )
    logger.info("Logging system working correctly")
    print("   ✓ Logging system functional")
except Exception as e:
    print(f"   ✗ Logging error: {e}")

# Test 2: Configuration loading
print("\n2. Testing configuration loader...")
try:
    from configs.config_loader import ConfigLoader
    
    # NOTE(review): the path is given relative to CONFIG_ROOT — confirm this is
    # what ConfigLoader.load_config expects.
    config_loader = ConfigLoader(CONFIG_ROOT)
    test_config = config_loader.load_config("environments/dev.yaml")
    if test_config:
        print(f"   ✓ Configuration loaded: {len(test_config)} keys")
    else:
        print("   ⚠ Configuration empty")
except Exception as e:
    print(f"   ✗ Configuration error: {e}")

# Test 3: Registry pattern
# Registers a throwaway class under "test_component" and looks it up again.
print("\n3. Testing registry pattern...")
try:
    from src.core.registry import Registry
    
    registry = Registry("test")
    
    @registry.register("test_component")
    class TestComponent:
        pass
    
    component = registry.get("test_component")
    # Passes on any truthy lookup result; identity with TestComponent is not checked
    if component:
        print("   ✓ Registry pattern functional")
    else:
        print("   ✗ Registry pattern failed")
except Exception as e:
    print(f"   ✗ Registry error: {e}")

# Test 4: Reproducibility utilities
print("\n4. Testing reproducibility utilities...")
try:
    from src.utils.reproducibility import set_seed, get_reproducible_config
    
    set_seed(42)
    config = get_reproducible_config()
    # assumes the returned object supports ['seed'] lookup — TODO confirm
    print(f"   ✓ Reproducibility utilities functional (seed: {config['seed']})")
except Exception as e:
    print(f"   ✗ Reproducibility error: {e}")

print("\n" + "="*50)
print("Setup verification complete!")

## 10. Environment Summary and Next Steps

In [None]:
# Generate environment summary
from datetime import datetime
import json

# Collect environment information
# Read torch from sys.modules instead of through the bare name `torch`: the
# module can be loaded (imported indirectly by another library) without the
# name being bound in this notebook, in which case `torch.cuda...` would raise
# NameError.
_torch_module = sys.modules.get("torch")
env_summary = {
    "timestamp": datetime.now().isoformat(),
    "python_version": sys.version,
    "platform": platform.platform(),
    "project_root": str(PROJECT_ROOT),
    "gpu_available": (
        bool(_torch_module.cuda.is_available()) if _torch_module is not None else False
    ),
    # Counts come from the configuration verification cell
    "configurations": {
        "valid": len(valid_configs),
        "invalid": len(invalid_configs)
    }
}

# Save summary to <project root>/outputs/setup_summary.json, creating the folder if needed
summary_path = PROJECT_ROOT / "outputs" / "setup_summary.json"
summary_path.parent.mkdir(parents=True, exist_ok=True)

with open(summary_path, 'w', encoding='utf-8') as f:
    json.dump(env_summary, f, indent=2)

print("Environment Setup Summary")
print("="*50)
print(f"Setup completed at: {env_summary['timestamp']}")
print(f"Summary saved to: {summary_path}")

print("\nNext Steps:")
print("1. Review and update .env file with your specific settings")
print("2. Download AG News dataset using scripts/setup/download_all_data.py")
print("3. Proceed to 01_data_loading_basics.ipynb tutorial")
print("4. Explore model training with 03_model_training_basics.ipynb")

print("\nUseful Commands:")
print("  Download data: python scripts/setup/download_all_data.py")
print("  Verify setup: python scripts/setup/verify_installation.py")
print("  Quick start: python quickstart/minimal_example.py")

print("\n" + "="*50)
print("Environment setup complete! Ready for AG News classification.")