# Environment Setup for AG News Classification

## Overview

This notebook provides comprehensive guidance for setting up the development environment following best practices from:
- Python Packaging Authority (PyPA) guidelines
- TensorFlow and PyTorch installation recommendations
- MLOps best practices for reproducible environments

### Objectives
1. Verify Python environment and dependencies
2. Verify GPU (CUDA) support for deep learning
3. Test all major components
4. Validate data pipeline and model loading

Author: Võ Hải Dũng  
Email: vohaidung.work@gmail.com  
Date: 2025

## 1. Python Environment Verification

In [None]:
# Standard library imports
import sys
import os
import platform
import subprocess
from pathlib import Path
from datetime import datetime
import warnings

# Keep the report readable by silencing library warnings
warnings.filterwarnings('ignore')

# Describe the interpreter and the host machine as (label, value) rows
print("System Information")
print("=" * 50)
system_facts = (
    ("Python version", sys.version),
    ("Python executable", sys.executable),
    ("Platform", platform.platform()),
    ("Architecture", platform.machine()),
    ("Processor", platform.processor()),
    ("Current directory", os.getcwd()),
    ("Timestamp", datetime.now().isoformat()),
)
for fact_label, fact_value in system_facts:
    print(f"{fact_label}: {fact_value}")

# Compare the running interpreter against the minimum supported release
python_version = sys.version_info
required_version = (3, 8)
required_major, required_minor = required_version

if python_version < required_version:
    print("\nPython version check: FAILED")
    print(f"Required: Python {required_major}.{required_minor} or higher")
    print(f"Current: Python {python_version.major}.{python_version.minor}")
else:
    print(f"\nPython version check: PASSED (>= {required_major}.{required_minor})")

## 2. Project Structure Verification

In [None]:
# Verify project structure
# NOTE: the root is resolved relative to the current working directory, so
# this cell assumes the notebook is executed from two levels below the root.
PROJECT_ROOT = Path("../..").resolve()
print(f"Project root: {PROJECT_ROOT}")

# Essential directories
essential_dirs = [
    "src",
    "configs",
    "data",
    "scripts",
    "tests",
    "notebooks",
    "requirements"
]

print("\nProject Structure Check:")
print("-" * 30)

# is_dir() rather than exists(): a regular file that happens to share one of
# these names must not be reported as a present directory.
missing_dirs = []
for dir_name in essential_dirs:
    if (PROJECT_ROOT / dir_name).is_dir():
        print(f"  {dir_name}: Found")
    else:
        print(f"  {dir_name}: Missing")
        missing_dirs.append(dir_name)

if missing_dirs:
    print(f"\nWarning: Missing directories: {', '.join(missing_dirs)}")
    print("Creating missing directories...")
    for dir_name in missing_dirs:
        dir_path = PROJECT_ROOT / dir_name
        try:
            dir_path.mkdir(parents=True, exist_ok=True)
        except OSError as e:
            # e.g. a file occupies the path or the location is read-only.
            # Report it and continue so the remaining directories are tried.
            print(f"  Failed to create {dir_path}: {e}")
        else:
            print(f"  Created: {dir_path}")
else:
    print("\nAll essential directories present.")

## 3. Core Dependencies Check

In [None]:
def check_package(package_name: str, import_name: str = None) -> tuple:
    """
    Check if a package is installed and get its version.

    The version is taken from the imported module's ``__version__``
    attribute. If the module does not define one, the installed
    distribution metadata for ``package_name`` is consulted instead.

    Args:
        package_name: Distribution name of the package (as used by pip)
        import_name: Import name if different from package name

    Returns:
        tuple: (is_installed, version). ``version`` is always a string when
        the package is installed ('unknown' if it cannot be determined) and
        None when the import fails.
    """
    import_name = import_name or package_name

    try:
        # __import__ returns the top-level package for dotted names
        module = __import__(import_name)
    except ImportError:
        return False, None

    version = getattr(module, '__version__', None)
    if version is None:
        try:
            # importlib.metadata exists from Python 3.8; PackageNotFoundError
            # is an ImportError subclass, so one handler covers both cases.
            from importlib import metadata
            version = metadata.version(package_name)
        except ImportError:
            version = None

    # Coerce to str so callers can safely apply string format specs
    return True, str(version) if version is not None else 'unknown'

# (distribution name, import name) pairs; None means both names coincide
core_packages = [
    ('numpy', None),
    ('pandas', None),
    ('torch', None),
    ('transformers', None),
    ('datasets', None),
    ('scikit-learn', 'sklearn'),
    ('matplotlib', None),
    ('seaborn', None),
    ('tqdm', None),
    ('pydantic', None),
    ('fastapi', None),
    ('uvicorn', None)
]

# Table header
print("Core Dependencies Check:")
print("=" * 50)
print(f"{'Package':<20} {'Status':<15} {'Version':<15}")
print("-" * 50)

# Probe every package and emit one table row per entry
missing_packages = []
for dist_name, module_name in core_packages:
    is_installed, version = check_package(dist_name, module_name)

    if not is_installed:
        missing_packages.append(dist_name)
    status, version_str = ("Installed", version) if is_installed else ("Missing", "-")

    print(f"{dist_name:<20} {status:<15} {version_str:<15}")

# Closing verdict, with an install hint when something is absent
if not missing_packages:
    print("\nAll core dependencies are installed.")
else:
    print(f"\nMissing packages detected: {', '.join(missing_packages)}")
    print("Install missing packages with:")
    print(f"  pip install {' '.join(missing_packages)}")

## 4. GPU/CUDA Configuration

In [None]:
# Report PyTorch's view of the CUDA devices on this machine
print("GPU/CUDA Configuration:")
print("=" * 50)

try:
    import torch

    cuda_available = torch.cuda.is_available()
    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA available: {cuda_available}")

    if not cuda_available:
        # CPU-only path: explain what is needed for GPU support
        print("\nNo GPU detected. Training will use CPU.")
        print("For GPU support, ensure:")
        gpu_hints = (
            "  1. NVIDIA GPU with CUDA support is available",
            "  2. CUDA toolkit is installed",
            "  3. PyTorch is installed with CUDA support",
        )
        for gpu_hint in gpu_hints:
            print(gpu_hint)
    else:
        print(f"CUDA version: {torch.version.cuda}")
        print(f"cuDNN version: {torch.backends.cudnn.version()}")
        gpu_count = torch.cuda.device_count()
        print(f"Number of GPUs: {gpu_count}")

        # One short description per visible device
        for gpu_index in range(gpu_count):
            gpu_props = torch.cuda.get_device_properties(gpu_index)
            memory_gb = gpu_props.total_memory / 1024**3
            print(f"\nGPU {gpu_index}: {gpu_props.name}")
            print(f"  Memory: {memory_gb:.1f} GB")
            print(f"  Compute Capability: {gpu_props.major}.{gpu_props.minor}")

        # Smoke test: multiply two random matrices on the first GPU
        print("\nTesting GPU computation...")
        device = torch.device('cuda:0')
        lhs = torch.randn(1000, 1000).to(device)
        rhs = torch.randn(1000, 1000).to(device)
        product = torch.matmul(lhs, rhs)
        print("GPU computation test: PASSED")
        print(f"Result shape: {product.shape}")

except ImportError:
    print("PyTorch not installed. Cannot check GPU configuration.")
except Exception as e:
    print(f"Error checking GPU: {e}")

## 5. Import Project Modules

In [None]:
# Add project root to path
import importlib

# Keep exactly one entry at the front of sys.path, so re-running this cell
# in the same kernel does not pile up duplicate entries.
project_root_str = str(PROJECT_ROOT)
if project_root_str in sys.path:
    sys.path.remove(project_root_str)
sys.path.insert(0, project_root_str)

print("Testing Project Module Imports:")
print("=" * 50)

# Test imports
test_imports = [
    "src.data.datasets.ag_news",
    "src.data.preprocessing.text_cleaner",
    "src.models.base.base_model",
    "src.training.trainers.base_trainer",
    "src.evaluation.metrics.classification_metrics",
    "src.utils.logging_config",
    "src.utils.reproducibility",
    "configs.config_loader",
    "configs.constants"
]

# Each entry in import_errors is a (module_path, error_message) pair
import_errors = []
for module_path in test_imports:
    try:
        importlib.import_module(module_path)
    except ImportError as e:
        print(f"  {module_path}: FAILED - {str(e)}")
        import_errors.append((module_path, str(e)))
    except Exception as e:
        # The module was found but raised while executing (e.g. a syntax
        # error). Record it instead of aborting the remaining checks.
        error = f"{type(e).__name__}: {e}"
        print(f"  {module_path}: FAILED - {error}")
        import_errors.append((module_path, error))
    else:
        print(f"  {module_path}: SUCCESS")

if import_errors:
    print("\nImport errors detected:")
    for module_path, error in import_errors:
        print(f"  {module_path}: {error}")
else:
    print("\nAll project modules imported successfully.")

## 6. Configuration Loading Test

In [None]:
# Smoke-test the configuration loader and the shared project constants
print("Configuration Loading Test:")
print("=" * 50)

try:
    from configs.config_loader import ConfigLoader
    from configs.constants import (
        AG_NEWS_CLASSES,
        AG_NEWS_NUM_CLASSES,
        DATA_DIR,
        MODEL_DIR
    )

    # A single loader instance serves every file below
    config_loader = ConfigLoader()

    # One representative file per configuration family
    config_files = [
        "models/single/deberta_v3_xlarge.yaml",
        "training/standard/base_training.yaml",
        "data/preprocessing/standard.yaml"
    ]

    for config_file in config_files:
        # A failure on one file is reported without stopping the others
        try:
            config = config_loader.load_config(config_file)
            print(f"  {config_file}: Loaded successfully")
            if config:
                leading_keys = list(config.keys())[:5]
                print(f"    Keys: {leading_keys}...")
        except Exception as e:
            print(f"  {config_file}: Failed - {str(e)}")

    # Echo the imported constants as labelled rows
    print("\nProject Constants:")
    constant_rows = (
        ("AG News Classes", AG_NEWS_CLASSES),
        ("Number of Classes", AG_NEWS_NUM_CLASSES),
        ("Data Directory", DATA_DIR),
        ("Model Directory", MODEL_DIR),
    )
    for row_label, row_value in constant_rows:
        print(f"  {row_label}: {row_value}")

except ImportError as e:
    print(f"Failed to import configuration modules: {e}")
except Exception as e:
    print(f"Configuration test failed: {e}")

## 7. Data Pipeline Test

In [None]:
# Exercise the text cleaner and load a small slice of the dataset
print("Data Pipeline Test:")
print("=" * 50)

try:
    from src.data.datasets.ag_news import AGNewsDataset, AGNewsConfig
    from src.data.preprocessing.text_cleaner import TextCleaner, CleaningConfig

    # --- Text cleaner ---
    print("\nTesting text cleaner...")
    cleaning_options = dict(
        lowercase=True,
        remove_punctuation=False,
        remove_numbers=False,
    )
    cleaner_config = CleaningConfig(**cleaning_options)
    cleaner = TextCleaner(cleaner_config)

    test_text = "This is a TEST text with Numbers 123 and Punctuation!!!"
    cleaned_text = cleaner.clean(test_text)
    for text_label, text_value in (("Original", test_text), ("Cleaned", cleaned_text)):
        print(f"  {text_label}: {text_value}")

    # --- Dataset ---
    print("\nTesting dataset loading...")
    dataset_options = dict(
        data_dir=DATA_DIR / "processed",
        max_samples=100,  # a small sample is enough for a smoke test
        use_cache=False,
    )
    dataset_config = AGNewsConfig(**dataset_options)

    # Missing data files are an expected condition on a fresh checkout
    try:
        dataset = AGNewsDataset(dataset_config, split="train")
        print("  Dataset loaded successfully")
        print(f"  Number of samples: {len(dataset)}")

        # Look at the first record
        first_item = dataset[0]
        print(f"  Sample keys: {list(first_item.keys())}")
        text_preview = first_item['text'][:100]
        print(f"  Sample text (truncated): {text_preview}...")
        print(f"  Sample label: {first_item['label']}")

    except FileNotFoundError:
        print("  Dataset files not found. Run data preparation scripts first.")
        print("  Use: python scripts/data_preparation/prepare_ag_news.py")

except ImportError as e:
    print(f"Failed to import data modules: {e}")
except Exception as e:
    print(f"Data pipeline test failed: {e}")

## 8. Model Loading Test

In [None]:
# Test model loading
print("Model Loading Test:")
print("=" * 50)

try:
    # torch is imported here so this cell works even when the GPU cell has
    # not been run in the current kernel (torch.no_grad() is used below).
    import torch
    from transformers import AutoTokenizer, AutoModel

    # Test loading a small model
    model_name = "bert-base-uncased"
    print(f"\nTesting loading of {model_name}...")

    # Load tokenizer
    print("  Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print(f"    Tokenizer loaded: {type(tokenizer).__name__}")
    print(f"    Vocab size: {tokenizer.vocab_size}")

    # Test tokenization
    test_text = "This is a test sentence for tokenization."
    tokens = tokenizer(test_text, return_tensors="pt")
    print("    Tokenization test: SUCCESS")
    print(f"    Input IDs shape: {tokens['input_ids'].shape}")

    # Load model (only if sufficient memory)
    print("  Loading model architecture...")
    try:
        model = AutoModel.from_pretrained(model_name)
        print(f"    Model loaded: {type(model).__name__}")
        print(f"    Number of parameters: {sum(p.numel() for p in model.parameters()):,}")

        # Test forward pass; gradients are not needed for a smoke test
        with torch.no_grad():
            outputs = model(**tokens)
            print("    Forward pass: SUCCESS")
            print(f"    Output shape: {outputs.last_hidden_state.shape}")

    except MemoryError:
        print("    Insufficient memory to load model")
    except Exception as e:
        print(f"    Model loading failed: {e}")

except ImportError as e:
    # ImportError.name holds the module that could not be imported (may be
    # None); use it to point at the package that is actually missing.
    if (e.name or "").split(".")[0] == "torch":
        print("PyTorch not installed.")
        print("Install with: pip install torch")
    else:
        print("Transformers library not installed.")
        print("Install with: pip install transformers")
except Exception as e:
    print(f"Model test failed: {e}")

## 9. Memory and Resource Check

In [None]:
# Summarise CPU, memory and disk capacity using psutil
print("System Resources:")
print("=" * 50)

try:
    import psutil

    # Divisor used to express byte counts in the "GB" figures printed below
    bytes_per_gb = 1024**3

    # CPU
    print("\nCPU Information:")
    print(f"  Physical cores: {psutil.cpu_count(logical=False)}")
    print(f"  Logical cores: {psutil.cpu_count(logical=True)}")
    print(f"  Current usage: {psutil.cpu_percent(interval=1)}%")

    # Memory
    memory = psutil.virtual_memory()
    available_memory_gb = memory.available / bytes_per_gb
    print("\nMemory Information:")
    print(f"  Total: {memory.total / bytes_per_gb:.1f} GB")
    print(f"  Available: {available_memory_gb:.1f} GB")
    print(f"  Used: {memory.used / bytes_per_gb:.1f} GB ({memory.percent}%)")

    # Disk (root path)
    disk = psutil.disk_usage('/')
    free_disk_gb = disk.free / bytes_per_gb
    print("\nDisk Information:")
    print(f"  Total: {disk.total / bytes_per_gb:.1f} GB")
    print(f"  Available: {free_disk_gb:.1f} GB")
    print(f"  Used: {disk.used / bytes_per_gb:.1f} GB ({disk.percent}%)")

    # Advice derived from the thresholds: 8 GB of memory, 20 GB of disk
    print("\nRecommendations:")
    memory_advice = (
        "  - Limited memory available. Consider using smaller models or batch sizes."
        if available_memory_gb < 8
        else "  - Sufficient memory for most transformer models."
    )
    print(memory_advice)

    disk_advice = (
        "  - Limited disk space. May need to clean up cached models."
        if free_disk_gb < 20
        else "  - Sufficient disk space for model storage."
    )
    print(disk_advice)

except ImportError:
    print("psutil not installed. Cannot check system resources.")
    print("Install with: pip install psutil")
except Exception as e:
    print(f"Resource check failed: {e}")

## 10. Environment Summary and Next Steps

In [None]:
# Roll the results of the earlier cells into one status table and JSON report
import json
from datetime import datetime

print("Environment Setup Summary")
print("=" * 50)

# Some names only exist when the corresponding earlier cell succeeded, so
# their presence in the namespace is itself the pass/fail signal.
namespace = locals()
gpu_ready = 'cuda_available' in namespace and cuda_available

checks = {
    "Python Version": python_version >= required_version,
    "Project Structure": not missing_dirs,
    "Core Dependencies": not missing_packages,
    "GPU Available": gpu_ready,
    "Project Imports": not import_errors,
    "Configuration": 'config_loader' in namespace,
    "Data Pipeline": 'dataset' in namespace,
    "Model Loading": 'tokenizer' in namespace,
}

print("\nStatus Report:")
for check_name, passed in checks.items():
    print(f"  {check_name:<20}: {'READY' if passed else 'NEEDS ATTENTION'}")

# Overall verdict: everything, only the critical pair, or neither
all_passed = all(checks.values())
critical_passed = checks["Python Version"] and checks["Core Dependencies"]

if all_passed:
    overall_message = "  Environment is fully configured and ready for development."
elif critical_passed:
    overall_message = "  Critical components are ready. Some optional features may need configuration."
else:
    overall_message = "  Environment needs configuration. Please address the issues above."

print("\nOverall Status:")
print(overall_message)

print("\nNext Steps:")
next_steps = (
    "  1. If any dependencies are missing, install them using pip",
    "  2. Run data preparation scripts to download and process AG News dataset",
    "  3. Configure GPU support if available for faster training",
    "  4. Review the project documentation in docs/ directory",
    "  5. Start with the data exploration notebook (01_data_exploration.ipynb)",
)
for step_line in next_steps:
    print(step_line)

# Persist a machine-readable copy of the summary
version_string = ".".join(str(part) for part in python_version[:3])
report = {
    "timestamp": datetime.now().isoformat(),
    "python_version": version_string,
    "platform": platform.platform(),
    "checks": {name: bool(result) for name, result in checks.items()},
    "gpu_available": gpu_ready,
}

report_path = PROJECT_ROOT / "outputs" / "setup" / "environment_report.json"
report_path.parent.mkdir(parents=True, exist_ok=True)
report_path.write_text(json.dumps(report, indent=2))

print(f"\nEnvironment report saved to: {report_path}")