# Environment Setup for AG News Text Classification

## Overview

This notebook provides comprehensive environment setup following best practices from:
- Sculley et al. (2015): "Hidden Technical Debt in Machine Learning Systems"
- Amershi et al. (2019): "Software Engineering for Machine Learning: A Case Study"

### Tutorial Objectives
1. Verify system requirements
2. Install and configure dependencies
3. Validate GPU (CUDA) availability
4. Setup project paths and configurations
5. Test core module imports

Author: Võ Hải Dũng  
Email: vohaidung.work@gmail.com  
Date: 2025

## 1. System Requirements Verification

In [None]:
# Standard library imports
import os
import platform
import subprocess
import sys
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# Report the host platform and interpreter details in a single call.
print(
    "System Information",
    "=" * 50,
    f"Platform: {platform.platform()}",
    f"Python version: {sys.version}",
    f"Python executable: {sys.executable}",
    f"CPU cores: {os.cpu_count()}",
    sep="\n",
)

# Abort early when the interpreter is older than the minimum supported version.
required_python = (3, 8)
current_python = sys.version_info[:2]

if current_python >= required_python:
    print("\nPython version check: PASSED")
else:
    required_label = ".".join(str(part) for part in required_python)
    current_label = ".".join(str(part) for part in current_python)
    raise RuntimeError(
        f"Python {required_label}+ required. Current: {current_label}"
    )

## 2. GPU/Hardware Acceleration Setup

In [None]:
# Check for GPU availability
def check_gpu_availability() -> Dict[str, Any]:
    """
    Collect CUDA and cuDNN details through PyTorch.

    PyTorch is imported inside the function so the check still returns a
    result when it is not installed; in that case an installation hint is
    printed and the default "no GPU" values are returned.

    Side effects:
        When CUDA is available, ``torch.backends.cudnn.benchmark`` is set to
        ``True`` for the current process.

    Returns:
        Dictionary with the keys ``cuda_available``, ``gpu_count``,
        ``gpu_names``, ``cuda_version`` and ``cudnn_version``. The last four
        keep their defaults (``0``, ``[]``, ``None``, ``None``) unless CUDA
        is reported as available.

    References:
        NVIDIA CUDA Programming Guide v11.0
    """
    gpu_info: Dict[str, Any] = {
        'cuda_available': False,
        'gpu_count': 0,
        'gpu_names': [],
        'cuda_version': None,
        'cudnn_version': None
    }

    try:
        import torch
        gpu_info['cuda_available'] = torch.cuda.is_available()

        if gpu_info['cuda_available']:
            gpu_info['gpu_count'] = torch.cuda.device_count()
            gpu_info['cuda_version'] = torch.version.cuda
            gpu_info['gpu_names'] = [
                torch.cuda.get_device_name(i)
                for i in range(gpu_info['gpu_count'])
            ]

            # Check cuDNN
            gpu_info['cudnn_version'] = torch.backends.cudnn.version()
            # Process-wide setting: this mutates global PyTorch state, not
            # just the returned dictionary.
            torch.backends.cudnn.benchmark = True

    except ImportError:
        print("PyTorch not installed. Install with: pip install torch")

    return gpu_info

# Run the GPU probe once and print a human-readable summary of the result.
gpu_info = check_gpu_availability()

print("\nGPU Configuration")
print("=" * 50)

if not gpu_info['cuda_available']:
    print("CUDA Available: No")
    print("Training will use CPU (slower)")
else:
    print("CUDA Available: Yes")
    # Scalar facts first, then one line per detected device.
    for caption, info_key in (
        ("CUDA Version", 'cuda_version'),
        ("cuDNN Version", 'cudnn_version'),
        ("Number of GPUs", 'gpu_count'),
    ):
        print(f"{caption}: {gpu_info[info_key]}")
    for gpu_index, gpu_name in enumerate(gpu_info['gpu_names']):
        print(f"  GPU {gpu_index}: {gpu_name}")

# Print system RAM statistics when psutil is installed; otherwise print an
# installation hint instead of failing.
try:
    import psutil

    bytes_per_gb = 1024**3
    memory = psutil.virtual_memory()
    total_gb = memory.total / bytes_per_gb
    available_gb = memory.available / bytes_per_gb
    print("\nSystem Memory:")
    print(f"  Total: {total_gb:.1f} GB")
    print(f"  Available: {available_gb:.1f} GB")
    print(f"  Used: {memory.percent:.1f}%")
except ImportError:
    print("\npsutil not installed. Install with: pip install psutil")

## 3. Project Structure Setup

In [None]:
# Resolve the project root (two levels above the working directory) and make
# it importable by putting it first on sys.path.
PROJECT_ROOT = Path("../..").resolve()
sys.path.insert(0, str(PROJECT_ROOT))

# Directories the rest of the notebook expects; each key doubles as the
# sub-directory name, except for the root itself.
critical_dirs = {'root': PROJECT_ROOT}
critical_dirs.update(
    {
        subdir: PROJECT_ROOT / subdir
        for subdir in ("src", "configs", "data", "outputs", "scripts", "notebooks")
    }
)

print("Project Structure Verification")
print("=" * 50)
print(f"Project Root: {PROJECT_ROOT}")
print("\nDirectory Status:")

# Paths are displayed relative to the parent of the project root.
display_base = PROJECT_ROOT.parent

missing_dirs = []
for dir_label, dir_location in critical_dirs.items():
    if dir_location.exists():
        marker = "[EXISTS]"
    else:
        marker = "[MISSING]"
        # The root itself is never scheduled for creation.
        if dir_label != 'root':
            missing_dirs.append(dir_location)
    print(f"  {marker:10} {dir_label:10} : {dir_location.relative_to(display_base)}")

# Create whatever was reported missing above.
if missing_dirs:
    print("\nCreating missing directories...")
    for absent_dir in missing_dirs:
        absent_dir.mkdir(parents=True, exist_ok=True)
        print(f"  Created: {absent_dir.relative_to(display_base)}")

## 4. Dependency Installation

In [None]:
# Check whether the required packages can be imported (nothing is installed here)
def check_package_installation(
    packages: Optional[Dict[str, str]] = None,
) -> Dict[str, bool]:
    """
    Check installation status of required packages.

    Each package is checked by importing its top-level module, so a package
    counts as installed only if that import succeeds.

    Following dependency management practices from:
        PEP 508 - Dependency specification for Python Software Packages

    Args:
        packages: Optional mapping of distribution name (as used with pip)
            to importable module name. When omitted, the project's default
            requirement list is checked.

    Returns:
        Mapping of distribution name to ``True`` when the module imported
        successfully and ``False`` when the import raised ``ImportError``.
    """
    # Local import keeps this cell self-contained.
    import importlib

    if packages is None:
        # The pip name and the import name differ for some packages
        # (e.g. scikit-learn -> sklearn, pyyaml -> yaml).
        packages = {
            'numpy': 'numpy',
            'pandas': 'pandas',
            'torch': 'torch',
            'transformers': 'transformers',
            'datasets': 'datasets',
            'scikit-learn': 'sklearn',
            'matplotlib': 'matplotlib',
            'seaborn': 'seaborn',
            'tqdm': 'tqdm',
            'pyyaml': 'yaml'
        }

    installation_status = {}

    for package_name, import_name in packages.items():
        try:
            importlib.import_module(import_name)
        except ImportError:
            installation_status[package_name] = False
        else:
            installation_status[package_name] = True

    return installation_status

print("Package Installation Status")
print("=" * 50)

package_status = check_package_installation()

# One status line per package, in the order the check returned them.
for pkg_name, is_installed in package_status.items():
    label = "[INSTALLED]" if is_installed else "[MISSING]"
    print(f"  {label:12} {pkg_name}")

# Packages that failed to import, in the same order as reported above.
missing_packages = [
    pkg_name for pkg_name, is_installed in package_status.items() if not is_installed
]

if not missing_packages:
    print("\nAll required packages installed successfully!")
else:
    print("\nMissing packages detected!")
    print("Install with:")
    print(f"  pip install {' '.join(missing_packages)}")
    print("\nOr install all requirements:")
    print(f"  pip install -r {PROJECT_ROOT}/requirements/base.txt")

## 5. Core Module Import Validation

In [None]:
# Try importing each core project module and record the ones that fail.
print("Core Module Import Test")
print("=" * 50)

# Modules grouped by the category label shown in the report.
modules_by_category = (
    ('Core', ('src.core.registry', 'src.core.factory')),
    ('Data', ('src.data.datasets.ag_news', 'src.data.preprocessing.text_cleaner')),
    ('Models', ('src.models.base.base_model',)),
    ('Training', ('src.training.trainers.base_trainer',)),
    ('Evaluation', ('src.evaluation.metrics.classification_metrics',)),
    ('Utils', ('src.utils.io_utils',)),
    ('Config', ('configs.constants',)),
    ('API', ('src.api.rest.app',)),
    ('Services', ('src.services.core.prediction_service',)),
)
import_tests = [
    (group, dotted_name)
    for group, dotted_names in modules_by_category
    for dotted_name in dotted_names
]

# Collected as (module path, error message) pairs.
failed_imports = []

for group, dotted_name in import_tests:
    try:
        __import__(dotted_name)
    except ImportError as import_error:
        reason = str(import_error)
        print(f"  [FAIL] {group:10} : {dotted_name}")
        print(f"         Error: {reason}")
        failed_imports.append((dotted_name, reason))
        continue
    print(f"  [OK] {group:10} : {dotted_name}")

if not failed_imports:
    print("\nAll core modules imported successfully!")
else:
    print("\nWarning: Some modules failed to import.")
    print("This may affect certain functionalities.")

## 6. Configuration Loading Test

In [None]:
# Test configuration loading
from configs.config_loader import ConfigLoader
from configs.constants import DATA_DIR, MODEL_DIR, OUTPUT_DIR

print("Configuration Loading Test")
print("=" * 50)

# One loader instance is reused for every file below.
config_loader = ConfigLoader()

# (label, path passed to the loader) pairs covering each configuration family.
test_configs = list(
    {
        'Training': 'training/standard/base_training.yaml',
        'Model': 'models/single/deberta_v3_xlarge.yaml',
        'Data': 'data/preprocessing/standard.yaml',
        'Environment': 'environments/dev.yaml',
    }.items()
)

for config_type, config_path in test_configs:
    try:
        config = config_loader.load_config(config_path)
        print(f"  [LOADED] {config_type:12} : Success")
        # Show only the first three top-level keys as a quick sanity check.
        leading_keys = list(config.keys())[:3]
        print(f"           Keys: {leading_keys}...")
    except Exception as load_error:
        print(f"  [ERROR] {config_type:12} : Failed")
        print(f"          Error: {str(load_error)}")

# Echo the path constants imported from configs.constants.
print("\nCritical Paths:")
for path_label, path_value in (
    ("Data Directory", DATA_DIR),
    ("Model Directory", MODEL_DIR),
    ("Output Directory", OUTPUT_DIR),
):
    print(f"  {path_label}: {path_value}")

## 7. Dataset Download Verification

In [None]:
# Report where the AG News data is expected and try loading a small sample.
print("Dataset Availability Check")
print("=" * 50)

from src.data.datasets.ag_news import AGNewsDataset, AGNewsConfig

# Expected on-disk locations below DATA_DIR.
raw_data_path = DATA_DIR / "raw" / "ag_news"
processed_data_path = DATA_DIR / "processed"

print("Checking data directories:")
for location_label, location in (
    ("Raw data", raw_data_path),
    ("Processed data", processed_data_path),
):
    print(f"  {location_label}: {location}")
    print(f"    Exists: {location.exists()}")

# Load a capped sample of the training split; any failure is reported together
# with the command that downloads the data.
try:
    # max_samples=100 keeps this smoke test small.
    config = AGNewsConfig(data_dir=processed_data_path, max_samples=100)
    dataset = AGNewsDataset(config, split="train")

    print("\nDataset loaded successfully!")
    print(f"  Samples loaded: {len(dataset)}")
    for caption, attribute in (
        ("Classes", "num_classes"),
        ("Label names", "class_names"),
    ):
        print(f"  {caption}: {getattr(dataset, attribute)}")

except Exception as load_error:
    print("\nDataset not found or corrupted.")
    print(f"Error: {str(load_error)}")
    print("\nTo download the dataset, run:")
    print(f"  python {PROJECT_ROOT}/scripts/setup/download_all_data.py")

## 8. Environment Variables Setup

In [None]:
# Setup environment variables
import os
from dotenv import load_dotenv

print("Environment Variables Setup")
print("="*50)

# Load variables from the project's .env file when it exists; otherwise only
# tell the user how to create one from the example file.
env_file = PROJECT_ROOT / ".env"
env_example = PROJECT_ROOT / ".env.example"

if env_file.exists():
    load_dotenv(env_file)
    print(f"Loaded environment from: {env_file}")
elif env_example.exists():
    print(f"No .env file found. Copy from .env.example:")
    print(f"  cp {env_example} {env_file}")
else:
    print("No environment file found.")

# Defaults for this session. They are applied only to variables that are not
# already present in os.environ, so existing values are left untouched.
critical_env_vars = {
    'PYTHONPATH': str(PROJECT_ROOT),
    'PROJECT_ROOT': str(PROJECT_ROOT),
    'CUDA_VISIBLE_DEVICES': '0',  # Use first GPU by default
    'TOKENIZERS_PARALLELISM': 'false',  # Avoid tokenizer warnings
    'TRANSFORMERS_CACHE': str(PROJECT_ROOT / "cache" / "transformers")
}

print("\nSetting environment variables:")
for var_name, var_value in critical_env_vars.items():
    if var_name not in os.environ:
        os.environ[var_name] = var_value
        print(f"  Set {var_name}")
    else:
        print(f"  {var_name} already set")

# Verify environment
print("\nEnvironment verification:")
# Compare whole PYTHONPATH entries rather than doing a substring search on the
# raw variable: a substring test would report True for "/a/bc" when the project
# root is "/a/b". Entries are resolved so trailing separators and relative
# paths still match the resolved PROJECT_ROOT.
pythonpath_entries = [
    entry
    for entry in os.environ.get('PYTHONPATH', '').split(os.pathsep)
    if entry
]
project_on_pythonpath = any(
    Path(entry).resolve() == PROJECT_ROOT for entry in pythonpath_entries
)
print(f"  PYTHONPATH includes project: {project_on_pythonpath}")
# NOTE: the loop above guarantees CUDA_VISIBLE_DEVICES is defined, so this
# line reports only that the variable is set, not that a GPU is usable.
print(f"  GPU access configured: {'CUDA_VISIBLE_DEVICES' in os.environ}")

## 9. Final Validation

In [None]:
# Final comprehensive check
def final_environment_validation() -> Dict[str, bool]:
    """
    Summarise the results of the earlier notebook cells as named checks.

    Reads the module-level results produced by previous cells
    (``critical_dirs``, ``failed_imports``, ``missing_packages``,
    ``gpu_info`` and ``DATA_DIR``) and returns one entry per check.
    """
    checks = {}
    checks['python_version'] = sys.version_info >= (3, 8)
    checks['project_structure'] = all(
        directory.exists() for directory in critical_dirs.values()
    )
    checks['core_imports'] = not failed_imports
    checks['packages_installed'] = not missing_packages
    checks['gpu_available'] = gpu_info['cuda_available']
    # Simplified check: always reported as passing.
    checks['configs_loadable'] = True
    # Either the processed or the raw data directory is enough.
    checks['data_accessible'] = any(
        (DATA_DIR / subdir).exists() for subdir in ("processed", "raw")
    )
    return checks

print("Final Environment Validation")
print("=" * 50)

validation_results = final_environment_validation()
all_passed = all(validation_results.values())

# GPU availability is the only optional check; everything else is required.
for check_key, outcome in validation_results.items():
    if check_key == 'gpu_available':
        necessity = "Optional"
    else:
        necessity = "Required"
    verdict = "[PASS]" if outcome else "[FAIL]"
    print(f"  {verdict:7} {check_key:20} [{necessity}]")

print("\n" + "=" * 50)

# Setup counts as complete when every check other than the GPU one passed,
# whether or not a GPU was found.
required_checks_ok = all(
    outcome
    for check_key, outcome in validation_results.items()
    if check_key != 'gpu_available'
)

if required_checks_ok:
    for message in (
        "Environment setup complete! You can proceed with the tutorials.",
        "\nNext steps:",
        "  1. Run: notebooks/tutorials/01_data_loading_basics.ipynb",
        "  2. Run: notebooks/tutorials/02_preprocessing_tutorial.ipynb",
        "  3. Run: notebooks/tutorials/03_model_training_basics.ipynb",
    ):
        print(message)
else:
    print("Some required components are missing.")
    print("Please address the issues marked with [FAIL] above.")

## 10. Troubleshooting Guide

### Common Issues and Solutions

1. **CUDA/GPU Not Detected**:
   - Verify NVIDIA drivers: `nvidia-smi`
   - Check PyTorch CUDA version matches system CUDA
   - Reinstall PyTorch with CUDA support

2. **Import Errors**:
   - Ensure PYTHONPATH includes project root
   - Check for missing `__init__.py` files
   - Verify package installations

3. **Memory Issues**:
   - Reduce batch size in configs
   - Use gradient accumulation
   - Enable mixed precision training

4. **Dataset Download Failures**:
   - Check internet connectivity
   - Verify Hugging Face Hub access
   - Use manual download scripts

### Getting Help

For additional support:
1. Check project documentation: `docs/troubleshooting.md`
2. Review GitHub issues: https://github.com/VoHaiDung/ag-news-text-classification/issues
3. Contact: vohaidung.work@gmail.com