# 🚀 Tri-Force Model Stack Setup

**Phase 1: Sovereign Runtime Environment on Google Colab Pro (A100)**

This notebook demonstrates hot-path inference with three specialist models:

1. **Forecaster**: `amazon/chronos-t5-base` - Zero-shot time series forecasting
2. **Logic Engineer**: `Qwen/Qwen2.5-Coder-7B` - Python code generation
3. **Cultural Analyst**: `behbudiy/Llama-3.1-8B-Uz` - Uzbek linguistic analysis

---

## Prerequisites

- Google Colab Pro with A100 GPU
- ~30GB VRAM available
- High-RAM runtime enabled

## 1. Environment Setup

In [None]:
# Clone repository (skip if already cloned)
# git's stderr is discarded (2>/dev/null) and the fallback message is printed
# whenever `git clone` exits non-zero, so a network or authentication failure
# is also reported as 'Repository already exists'.
!git clone https://github.com/Shohruh127/Chrono_LLM_RAG.git 2>/dev/null || echo 'Repository already exists'
# Move the notebook's working directory into the checkout: later cells use
# repo-relative paths (requirements.txt, configs/models_config.yaml) and
# import from the `src` package.
%cd Chrono_LLM_RAG

In [None]:
# Install dependencies
# -q keeps pip's output quiet; requirements.txt is resolved relative to the
# current working directory (the repository root after the %cd above).
!pip install -q -r requirements.txt

# Verify bitsandbytes installation
# The import runs in a separate `python` process and prints the installed
# version; an import failure surfaces here as a traceback.
# NOTE(review): presumably bitsandbytes backs the NF4 quantization used in
# later cells -- confirm against requirements.txt.
!python -c "import bitsandbytes; print(f'bitsandbytes version: {bitsandbytes.__version__}')"

In [None]:
# Check GPU availability
# Prints the PyTorch build and, when CUDA is usable, the name, CUDA version
# and total memory of device 0. Only device index 0 is inspected.
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")
    # total_memory is in bytes; dividing by 1e9 reports decimal gigabytes.
    print(f"Total VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    # The warning glyph was mis-encoded in the original message.
    print("⚠️ No GPU detected! Enable GPU in Runtime > Change runtime type")

## 2. Hardware Optimization

In [None]:
from src.tri_force import HardwareOptimizer, check_gpu

# Build the optimizer with a 30 GB VRAM budget; later cells reuse `optimizer`.
optimizer = HardwareOptimizer(vram_budget_gb=30.0)

# Echo the optimizer's device attributes, one per line, in a fixed order.
for caption, attribute in (("Device", "device"), ("Device name", "device_name")):
    print(f"{caption}: {getattr(optimizer, attribute)}")

# Baseline VRAM report, taken before any model-loading cell has run.
optimizer.print_vram_report()

In [None]:
# Show quantization configuration
quant_config = optimizer.get_quantization_config()
print("NF4 Quantization Config:")
for key, value in quant_config.items():
    print(f"  {key}: {value}")

# Estimate VRAM requirements
# Each spec is (display label, size argument, precision argument) passed to
# optimizer.get_model_memory_estimate. Going by the labels, the size is the
# parameter count in billions -- confirm against HardwareOptimizer.
model_specs = [
    ("Chronos-T5-Base (~0.2B params)", 0.2, "fp16"),
    ("Qwen2.5-Coder-7B (NF4)", 7, "nf4"),
    ("Llama-3.1-8B-Uz (NF4)", 8, "nf4"),
]

print("\n📊 Estimated VRAM Requirements:")
# Compute every estimate once and reuse it for both the per-model lines and
# the total, instead of calling the estimator twice per model.
estimates = [
    optimizer.get_model_memory_estimate(size, precision)
    for _, size, precision in model_specs
]
for (label, _, _), estimate in zip(model_specs, estimates):
    print(f"  {label}: {estimate:.2f} GB")

# total_est is read again by the VRAM visualization cell further down.
total_est = sum(estimates)
print(f"  Total Estimated: {total_est:.2f} GB")

## 3. Initialize Tri-Force Stack

In [None]:
# QueryType is not used in this cell; it is imported here for the explicit
# stack.route_query(..., QueryType.X) calls in the example cells below.
from src.tri_force import TriForceStack, QueryType

# Initialize stack with configuration
# The config path is relative, so this cell relies on the working directory
# being the repository root.
stack = TriForceStack(config_path="configs/models_config.yaml")

print("Tri-Force Stack initialized")
# NOTE(review): stack.hardware presumably is the stack's own hardware helper,
# separate from the notebook-level `optimizer` -- confirm in src.tri_force.
print(f"Device: {stack.hardware.device}")

## 4. Benchmark: Sequential Loading vs Hot-Path Inference

In [None]:
import time

# Clear cache before benchmark
optimizer.clear_cache()

# Sequential Loading Benchmark
print("\n" + "="*60)
print("📊 BENCHMARK: Sequential Loading")
print("="*60)

# (display name, loader, key) per model. The key names both the private
# stack._<key> attribute and the stack._models_loaded[<key>] flag that are
# reset after each timed load.
benchmark_targets = [
    ("Forecaster", stack.load_forecaster, "forecaster"),
    ("Logic Engineer", stack.load_logic_engineer, "logic_engineer"),
    ("Cultural Analyst", stack.load_cultural_analyst, "cultural_analyst"),
]

sequential_times = []

# Load and unload each model sequentially
for display_name, load_model, key in benchmark_targets:
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    load_model()
    sequential_times.append((display_name, time.perf_counter() - start))

    # Only the load is timed. Afterwards drop the stack's reference, mark the
    # model as not loaded and clear the cache so the next model starts clean.
    # NOTE(review): this pokes private attributes of the stack; memory is only
    # reclaimed if nothing else holds a reference to the model.
    setattr(stack, f"_{key}", None)
    stack._models_loaded[key] = False
    optimizer.clear_cache()

# total_sequential is read by the hot-path benchmark cell that follows.
total_sequential = sum(t for _, t in sequential_times)
print("\nSequential load times:")
for name, t in sequential_times:
    print(f"  {name}: {t:.2f}s")
print(f"Total sequential: {total_sequential:.2f}s")

In [None]:
# Hot-Path Loading Benchmark
# Times a single stack.load_all() call and compares it with total_sequential
# from the sequential benchmark cell (which must have been run first; it also
# provides the `time` import).
print("\n" + "="*60)
print("🚀 BENCHMARK: Hot-Path Inference (All Models Loaded)")
print("="*60)

# perf_counter is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
stack.load_all()
hot_path_time = time.perf_counter() - start

print(f"\nHot-path load time: {hot_path_time:.2f}s")

# Calculate improvement
# Relative reduction in percent; guarded so a zero sequential total does not
# raise ZeroDivisionError.
improvement = ((total_sequential - hot_path_time) / total_sequential) * 100 if total_sequential > 0 else 0
print(f"\n📈 Latency improvement: {improvement:.1f}%")
print(f"   (Sequential: {total_sequential:.2f}s vs Hot-path: {hot_path_time:.2f}s)")

## 5. VRAM Usage Visualization

In [None]:
import matplotlib.pyplot as plt

# Get final VRAM report
report = optimizer.get_vram_usage()

# Create VRAM visualization
# Left panel: used vs free VRAM. Right panel: per-model estimates next to the
# measured usage and the configured budget.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Pie chart: VRAM usage
if report.total_gb > 0:
    sizes = [report.used_gb, report.free_gb]
    labels = [f'Used\n{report.used_gb:.1f} GB', f'Free\n{report.free_gb:.1f} GB']
    pie_colors = ['#ff6b6b', '#4ecdc4']
    explode = (0.05, 0)  # pull the "Used" wedge out slightly

    axes[0].pie(sizes, explode=explode, labels=labels, colors=pie_colors,
                autopct='%1.1f%%', shadow=True, startangle=90)
    axes[0].set_title(f'VRAM Usage on {report.device_name}\n(Total: {report.total_gb:.1f} GB)')
else:
    # No VRAM reported: show a note instead of leaving an empty framed axes.
    axes[0].set_axis_off()
    axes[0].text(0.5, 0.5, 'No GPU memory reported', ha='center', va='center',
                 transform=axes[0].transAxes)

# Bar chart: Model VRAM estimates
# total_est comes from the estimate cell in section 2.
models = ['Chronos-T5\n(FP16)', 'Qwen2.5-7B\n(NF4)', 'Llama-3.1-8B\n(NF4)', 'Total\nEstimate', 'Actual\nUsage', 'Budget']
vram_values = [
    optimizer.get_model_memory_estimate(0.2, 'fp16'),
    optimizer.get_model_memory_estimate(7, 'nf4'),
    optimizer.get_model_memory_estimate(8, 'nf4'),
    total_est,
    report.used_gb,
    optimizer.vram_budget_gb,
]
bar_colors = ['#3498db', '#9b59b6', '#e74c3c', '#2ecc71', '#f39c12', '#95a5a6']

bars = axes[1].bar(models, vram_values, color=bar_colors)
axes[1].set_ylabel('VRAM (GB)')
axes[1].set_title('VRAM Allocation by Model')
axes[1].axhline(y=optimizer.vram_budget_gb, color='red', linestyle='--', label='Budget')
# Without a legend the 'Budget' label on the dashed line is never drawn.
axes[1].legend()

# Add value labels
for bar, val in zip(bars, vram_values):
    # Centre each label horizontally on its bar, 0.5 GB above the bar top.
    axes[1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
                 f'{val:.1f}', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
# Save before show() so the written file contains the rendered figure.
plt.savefig('vram_usage.png', dpi=150)
plt.show()

print("\n✅ VRAM usage saved to vram_usage.png")

## 6. Model Health Check

In [None]:
import json

# Run health check
health = stack.health_check()

print("\n" + "="*60)
print("🏥 Model Health Check")
print("="*60)

# default=str stringifies any value json cannot serialize natively, so the
# dump is for display only and may not round-trip.
print(json.dumps(health, indent=2, default=str))

## 7. Query Routing Test

In [None]:
# Test query routing
# Runs stack.detect_query_type over a mix of English and Uzbek prompts and
# prints the detected type and the model that would serve it.
test_queries = [
    "Forecast GDP growth for the next 4 years",
    "Write Python code to calculate compound interest",
    "O'zbekiston iqtisodiyoti haqida gapirib bering",
    "What is the trend in agricultural output?",
    "Calculate the mean of these values",
    "Toshkent viloyatining sanoat ko'rsatkichlari",
]

# Model name shown for each QueryType member name. Any other member falls
# back to the cultural analyst, as the original index-based lookup did.
model_for_type = {
    "FORECAST": "forecaster",
    "CODE": "logic_engineer",
    "CULTURAL": "cultural_analyst",
}
fallback_model = "cultural_analyst"
preview_chars = 50

print("\n" + "="*60)
print("🔀 Query Routing Test")
print("="*60)

for query in test_queries:
    query_type = stack.detect_query_type(query)
    # Only add an ellipsis when the query was actually cut short.
    preview = query if len(query) <= preview_chars else f"{query[:preview_chars]}..."
    print(f"\n📝 Query: '{preview}'")
    print(f"   → Type: {query_type.value}")
    print(f"   → Model: {model_for_type.get(query_type.name, fallback_model)}")

## 8. Example Inference

In [None]:
# Test forecasting with sample data
import torch

# Sample time series data (e.g., annual GDP values)
sample_context = torch.tensor([100.0, 105.0, 110.0, 115.0, 120.0, 125.0, 130.0, 135.0])

print("\n" + "="*60)
print("📈 Forecasting Example")
print("="*60)
print(f"\nHistorical data: {sample_context.tolist()}")

# Generate forecasts
forecasts = stack.forecast(
    context=sample_context,
    prediction_length=4,
    num_samples=20,
)

# The statistics reduce over dim=1.
# NOTE(review): this assumes forecasts is shaped (batch, num_samples,
# prediction_length), so dim=1 is the sample axis -- confirm against
# stack.forecast.
print(f"\nForecast shape: {forecasts.shape}")
print(f"Forecast mean: {forecasts.mean(dim=1).squeeze().tolist()}")
print(f"Forecast 10th percentile: {forecasts.quantile(0.1, dim=1).squeeze().tolist()}")
print(f"Forecast 90th percentile: {forecasts.quantile(0.9, dim=1).squeeze().tolist()}")

In [None]:
# Test code generation
# Sends one prompt through stack.route_query with the type forced to
# QueryType.CODE and prints the model name and a preview of the response.
print("\n" + "="*60)
print("💻 Code Generation Example")
print("="*60)

code_query = "Write a Python function to calculate moving average of a list"
result = stack.route_query(code_query, QueryType.CODE)

# Show at most 500 characters, adding an ellipsis only when text was cut.
response = result['response']
response_preview = response if len(response) <= 500 else f"{response[:500]}..."

print(f"\nQuery: {code_query}")
print(f"Model: {result['model']}")
print(f"\nResponse:\n{response_preview}")

In [None]:
# Test cultural analysis
# Sends one Uzbek prompt through stack.route_query with the type forced to
# QueryType.CULTURAL and prints the model name and a preview of the response.
print("\n" + "="*60)
print("🌍 Cultural Analysis Example")
print("="*60)

cultural_query = "Toshkent viloyatining iqtisodiy rivojlanishi haqida qisqacha ma'lumot bering"
result = stack.route_query(cultural_query, QueryType.CULTURAL)

# Show at most 500 characters, adding an ellipsis only when text was cut.
response = result['response']
response_preview = response if len(response) <= 500 else f"{response[:500]}..."

print(f"\nQuery: {cultural_query}")
print(f"Model: {result['model']}")
print(f"\nResponse:\n{response_preview}")

## 9. Final VRAM Report

In [None]:
# Final VRAM report
optimizer.print_vram_report()

print("\n" + "="*60)
print("✅ Tri-Force Stack Setup Complete!")
print("="*60)
print("\n📊 Summary:")
# health_check() is called again here so the summary reflects the state at
# the time this cell runs, not the earlier health-check cell.
print(f"   - All 3 models loaded: {stack.health_check()['all_loaded']}")
print(f"   - VRAM within budget: {optimizer.check_vram_budget()}")
print(f"   - Device: {optimizer.device_name}")
print("   - Hot-path inference ready!")

## 10. Cleanup (Optional)

In [None]:
# Unload all models (optional - for memory management)
# stack.unload_all()
# optimizer.print_vram_report()