# Testing CPU Inference for Legal Reasoning Model

This notebook demonstrates how to test the CPU-optimized Legal Reasoning Model locally before deploying to SageMaker.

In [None]:
import json
import os
import sys
import time

import matplotlib.pyplot as plt
import pandas as pd
import torch

# Add the src directory to the path
sys.path.append('../src')

# Import CPU inference code
from inference.cpu_inference import model_fn, predict_fn

## Load the Model

First, let's load the optimized model.

In [None]:
# Path to the optimized model, passed to model_fn
model_dir = "../models/optimized"

# Load the model and report how long it took.
# time.perf_counter() is used for the interval because it is monotonic and
# high-resolution; time.time() follows the wall clock and can jump if the
# system clock is adjusted mid-measurement.
print("Loading model...")
start_time = time.perf_counter()
# model_fn returns a pair that is unpacked into the model and its tokenizer
model, tokenizer = model_fn(model_dir)
load_time = time.perf_counter() - start_time
print(f"Model loaded in {load_time:.2f} seconds")

## Test Inference

Now, let's test inference with a sample legal document.

In [None]:
# Read the sample document (UTF-8 encoded JSON) from disk
sample_path = "../data/german/processed/IX ZB 72_08.json"
with open(sample_path, 'r', encoding='utf-8') as doc_file:
    sample_doc = json.load(doc_file)

# Character budget used as a rough guard against token limits
max_chars = 5000

# Take the document body; when it exceeds the budget keep only the first
# max_chars characters and append an ellipsis marker
full_text = sample_doc['full_text']
sample_text = (
    full_text
    if len(full_text) <= max_chars
    else full_text[:max_chars] + "..."
)

print(f"Sample text length: {len(sample_text)} characters")

In [None]:
# Build the request payload handed to predict_fn:
# the (possibly truncated) sample text plus the task and generation settings
input_data = dict(
    text=sample_text,
    task="summarization",
    max_new_tokens=256,
    temperature=0.7,
    top_p=0.9,
    language="de",
)

In [None]:
# Run a single prediction and report its latency.
# predict_fn receives the request dict and the (model, tokenizer) pair.
# The interval is measured with time.perf_counter(), which is monotonic and
# therefore unaffected by system clock adjustments, unlike time.time().
print("Running inference...")
start_time = time.perf_counter()
result = predict_fn(input_data, (model, tokenizer))
inference_time = time.perf_counter() - start_time
print(f"Inference completed in {inference_time:.2f} seconds")

In [None]:
# Show the generated text under a heading; sep="\n" puts the heading and the
# response on separate lines
print("Generated response:", result["response"], sep="\n")

## Benchmark Different Tasks

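Let's benchmark the model on different tasks. Each task is run once on the same sample text. Tokens per second is computed from the requested 256 new tokens, so it is an upper bound if generation stops before the limit.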

In [None]:
# Tasks to benchmark; each one is sent to predict_fn in the request's "task" field
tasks = ["classification", "summarization", "case_analysis", "statute_interpretation"]

# Generation budget per request. Defined once so the request and the
# throughput estimate below can never disagree.
max_new_tokens = 256

# Maps task name -> {"time": seconds, "tokens_per_second": nominal throughput}
benchmark_results = {}

for task in tasks:
    print(f"\nBenchmarking task: {task}")

    # Loop-local names are used on purpose: reusing `input_data`, `result`,
    # `start_time` or `inference_time` here would overwrite the values from the
    # single-inference cells, so re-running the display cell afterwards would
    # show the last benchmarked task instead of the summarization output.
    task_input = {
        "text": sample_text,
        "task": task,
        "max_new_tokens": max_new_tokens,
        "temperature": 0.7,
        "top_p": 0.9,
        "language": "de"
    }

    # time.perf_counter() is monotonic and high-resolution, so the measured
    # interval is not affected by system clock adjustments.
    task_start = time.perf_counter()
    predict_fn(task_input, (model, tokenizer))
    task_time = time.perf_counter() - task_start

    # Nominal throughput: assumes the full max_new_tokens budget was generated.
    # If generation stops early, the real rate is lower than this figure.
    tokens_per_second = max_new_tokens / task_time

    benchmark_results[task] = {
        "time": task_time,
        "tokens_per_second": tokens_per_second
    }

    print(f"Inference time: {task_time:.2f} seconds")
    print(f"Tokens per second: {tokens_per_second:.2f}")

In [None]:
# Plot benchmark results.
# `plt` (matplotlib.pyplot) and `pd` (pandas) are imported in the import cell
# at the top of the notebook rather than here, so all dependencies are
# declared in one place.

# One row per benchmarked task, in the order the tasks were run
df = pd.DataFrame({
    'Task': list(benchmark_results),
    'Inference Time (s)': [stats['time'] for stats in benchmark_results.values()],
    'Tokens per Second': [stats['tokens_per_second'] for stats in benchmark_results.values()],
})

# Two side-by-side bar charts sharing the same x categories
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# (axes, DataFrame column, panel title, y-axis label) for each panel
panels = [
    (ax1, 'Inference Time (s)', 'Inference Time by Task', 'Time (seconds)'),
    (ax2, 'Tokens per Second', 'Tokens per Second by Task', 'Tokens per Second'),
]

for ax, column, title, ylabel in panels:
    ax.bar(df['Task'], df[column])
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel('Task')
    # Long task names such as "statute_interpretation" can overlap their
    # neighbours at this figure width; tilting the labels keeps them legible.
    ax.tick_params(axis='x', labelrotation=30)

plt.tight_layout()
plt.show()

## Memory Usage Analysis

Let's estimate the model's in-memory size by summing the bytes held by its parameters and buffers.

In [None]:
# Get model size
def get_model_size(model):
    """Return the combined size of a model's parameters and buffers in MiB.

    Args:
        model: Object exposing ``parameters()`` and ``buffers()`` iterables
            whose items provide ``nelement()`` and ``element_size()``.

    Returns:
        Total bytes across all parameters and buffers, divided by 1024**2.
    """
    def _total_bytes(tensors):
        # bytes per tensor = number of elements * bytes per element
        return sum(t.nelement() * t.element_size() for t in tensors)

    total_bytes = _total_bytes(model.parameters()) + _total_bytes(model.buffers())
    return total_bytes / 1024**2

# Report the combined size of the loaded model's parameters and buffers
model_size_mb = get_model_size(model)
print(f"Model size in memory: {model_size_mb:.2f} MB")

## Conclusion

This notebook has demonstrated how to test the CPU-optimized Legal Reasoning Model locally. The model can now be deployed to a CPU-based SageMaker endpoint using the `deploy_cpu_model.py` script.