# Analyze Packaged Model from S3

This notebook downloads and inspects the packaged model.tar.gz to verify:
1. The circular import fix is present
2. The package structure is correct
3. All required files are included

In [None]:
import ast
import os
import shutil
import tarfile
import tempfile
from pathlib import Path

import boto3

## 1. Download Model from S3

In [None]:
# S3 location of your packaged model
s3_uri = "s3://sandboxdependency-abuse-secureaisandboxteamshare-1l77v9am252um/lukexie-Names3Risk-pytorch-NA-1-0-0-pipeline-2026-01-17-07-38-05/package/packaged_model/model.tar.gz"


def parse_s3_uri(uri):
    """Split an ``s3://<bucket>/<key>`` URI into ``(bucket, key)``.

    Args:
        uri: The S3 URI string to split.

    Returns:
        A ``(bucket, key)`` tuple of non-empty strings.

    Raises:
        ValueError: If ``uri`` does not start with ``s3://`` or is missing the
            bucket or the object key.
    """
    scheme = "s3://"
    if not uri.startswith(scheme):
        raise ValueError(f"Expected an s3:// URI, got: {uri!r}")

    # Only the leading scheme is removed; the first "/" after it ends the
    # bucket name and everything that follows is the object key.
    bucket, _, key = uri[len(scheme):].partition("/")
    if not bucket or not key:
        raise ValueError(
            f"S3 URI must include both a bucket and an object key: {uri!r}"
        )
    return bucket, key


# Parse S3 URI
s3_parts = list(parse_s3_uri(s3_uri))
bucket_name, object_key = s3_parts

print(f"Bucket: {bucket_name}")
print(f"Key: {object_key}")

In [None]:
# Fetch the packaged archive from S3 into a local file.
local_tar_path = "/tmp/model.tar.gz"
s3_client = boto3.client("s3")

print("Downloading model.tar.gz from S3...")
s3_client.download_file(
    Bucket=bucket_name,
    Key=object_key,
    Filename=local_tar_path,
)
print(f"‚úì Downloaded to {local_tar_path}")

# Report the size of the downloaded archive, converted from bytes.
file_size_mb = Path(local_tar_path).stat().st_size / 1024**2
print(f"File size: {file_size_mb:.2f} MB")

## 2. Extract and Inspect Contents

In [None]:
# Extract to temporary directory
extract_dir = "/tmp/extracted_model"

# Start from an empty directory so files from an earlier run cannot linger.
if os.path.exists(extract_dir):
    shutil.rmtree(extract_dir)
os.makedirs(extract_dir)

print("Extracting model.tar.gz...")
with tarfile.open(local_tar_path, "r:gz") as tar:
    if hasattr(tarfile, "data_filter"):
        # Extraction filters are supported: the "data" filter refuses members
        # that would land outside extract_dir, links that point outside it,
        # and device files. A violation raises a tarfile.FilterError.
        tar.extractall(extract_dir, filter="data")
    else:
        # Older Python without extraction filters: validate every member
        # before anything is written to disk.
        extract_root = os.path.realpath(extract_dir)
        for member in tar.getmembers():
            destination = os.path.realpath(os.path.join(extract_root, member.name))
            escapes_root = (
                os.path.commonpath([extract_root, destination]) != extract_root
            )
            # Conservative link rule: absolute targets and any ".." component
            # are refused rather than resolved.
            unsafe_link = (member.issym() or member.islnk()) and (
                os.path.isabs(member.linkname)
                or ".." in Path(member.linkname).parts
            )
            if escapes_root or unsafe_link:
                raise RuntimeError(
                    f"Refusing to extract unsafe archive member: {member.name!r}"
                )
        tar.extractall(extract_dir)
print(f"‚úì Extracted to {extract_dir}")

In [None]:
# List directory structure
def print_tree(directory, prefix="", max_depth=3, current_depth=0):
    """Recursively print the contents of ``directory`` as an indented tree.

    Within each directory, sub-directories are listed before files and each
    group is ordered by name. Nothing is printed for a level once
    ``current_depth`` has reached ``max_depth``.

    Args:
        directory: Path of the directory whose entries are printed.
        prefix: Text placed in front of every line printed at this level.
        max_depth: Number of directory levels to descend into.
        current_depth: Depth of ``directory`` relative to the starting call.
    """
    if current_depth < max_depth:
        children = list(Path(directory).iterdir())
        folders = sorted((c for c in children if c.is_dir()), key=lambda c: c.name)
        files = sorted((c for c in children if not c.is_dir()), key=lambda c: c.name)
        ordered = folders + files
        final_index = len(ordered) - 1

        for position, child in enumerate(ordered):
            # The last entry gets a corner connector and leaves a blank gutter
            # beneath it; every other entry keeps the vertical rule going.
            if position == final_index:
                branch, gutter = "‚îî‚îÄ‚îÄ ", "    "
            else:
                branch, gutter = "‚îú‚îÄ‚îÄ ", "‚îÇ   "
            print(f"{prefix}{branch}{child.name}")

            if child.is_dir():
                print_tree(child, prefix + gutter, max_depth, current_depth + 1)


# Heading and extraction root on their own lines, then the tree beneath the
# root, descending four levels.
print("\nüì¶ Package Structure:", extract_dir, sep="\n")
print_tree(extract_dir, max_depth=4)

## 3. Verify Circular Import Fix

In [None]:
# Read the pytorch_inference_handler.py file
handler_path = Path(extract_dir) / "code" / "pytorch_inference_handler.py"

if handler_path.exists():
    # Decode explicitly as UTF-8 (the default encoding for Python source)
    # instead of relying on the platform's locale encoding.
    content = handler_path.read_text(encoding="utf-8")

    print("‚úì Found pytorch_inference_handler.py")
    # Report the on-disk size; len(content) would count characters, not bytes.
    print(f"File size: {handler_path.stat().st_size} bytes")
    print("\n" + "=" * 70)
else:
    # Keep `content` defined so the following cells still run and the summary
    # can report the missing handler instead of stopping on a NameError.
    content = ""
    print("‚úó pytorch_inference_handler.py not found!")

In [None]:
# Check for the circular import fix
print("üîç Checking for Circular Import Fix...\n")

lines = content.split("\n")

# Import scope is read from the syntax tree instead of being guessed from
# nearby text, so a module-level import that merely follows a function is not
# mistaken for a function-level one, and comments or strings that mention the
# import are ignored. ast.parse raises SyntaxError if the handler is not valid
# Python for this kernel.
handler_tree = ast.parse(content)


def _is_autotokenizer_import(node):
    """Return True when ``node`` is ``from transformers import ...`` naming AutoTokenizer."""
    return (
        isinstance(node, ast.ImportFrom)
        and node.level == 0  # absolute import, not ``from .transformers``
        and node.module == "transformers"
        and any(alias.name == "AutoTokenizer" for alias in node.names)
    )


def _autotokenizer_imports_under(root):
    """Return the set of AutoTokenizer import nodes found anywhere beneath ``root``."""
    return {node for node in ast.walk(root) if _is_autotokenizer_import(node)}


imports_in_functions = set()
imports_in_model_fn = set()
for candidate in ast.walk(handler_tree):
    if not isinstance(candidate, (ast.FunctionDef, ast.AsyncFunctionDef)):
        continue
    nested_imports = _autotokenizer_imports_under(candidate)
    imports_in_functions |= nested_imports
    # Prefix match, as in the original "def model_fn" text search.
    if candidate.name.startswith("model_fn"):
        imports_in_model_fn |= nested_imports

# Check 1: Module-level AutoTokenizer import should be REMOVED
# Anything not nested inside a function body runs when the module is imported.
module_level_imports = sorted(
    (node.lineno, lines[node.lineno - 1])
    for node in _autotokenizer_imports_under(handler_tree) - imports_in_functions
)

if module_level_imports:
    print("‚ùå ISSUE FOUND: Module-level AutoTokenizer import still exists!")
    for line_num, line in module_level_imports:
        print(f"   Line {line_num}: {line.strip()}")
else:
    print("‚úÖ PASS: No module-level AutoTokenizer import found")

# Check 2: Function-level AutoTokenizer imports should be present
function_level_count = 0
for i in sorted(node.lineno for node in imports_in_model_fn):
    function_level_count += 1
    print(f"‚úÖ Found function-level import at line {i}")
    # Show context: two lines before and one line after the import.
    # `i` is 1-based, so the import itself is lines[i - 1].
    start = max(0, i - 3)
    end = min(len(lines), i + 2)
    print("\nContext:")
    for idx in range(start, end):
        marker = ">>>" if idx == i - 1 else "   "
        print(f"{marker} {idx + 1:4d}: {lines[idx]}")
    print()

if function_level_count == 0:
    print("\n‚ö†Ô∏è  WARNING: No function-level AutoTokenizer imports found")
    print("   BERT tokenizer loading may fail!")
elif function_level_count >= 2:
    print(
        f"\n‚úÖ PASS: Found {function_level_count} function-level imports (expected 2)"
    )
else:
    print(f"\n‚ö†Ô∏è  Found only {function_level_count} function-level import (expected 2)")

In [None]:
# Check 3: Verify custom tokenizer import pattern
print("\nüîç Checking Custom Tokenizer Import Pattern...\n")

custom_tokenizer_pattern = "from tokenizers import Tokenizer"
found_custom_import = False

# Line numbers on which a ``from ... import ...`` statement inside model_fn
# begins. Taking scope from the syntax tree avoids treating any import that
# merely appears within 100 lines after "def model_fn" as being inside it.
# The name is prefix-matched, as in the original "def model_fn" text search.
model_fn_import_lines = {
    node.lineno
    for func in ast.walk(ast.parse(content))
    if isinstance(func, (ast.FunctionDef, ast.AsyncFunctionDef))
    and func.name.startswith("model_fn")
    for node in ast.walk(func)
    if isinstance(node, ast.ImportFrom)
}

for i, line in enumerate(lines, 1):
    is_comment = line.strip().startswith("#")
    if (
        custom_tokenizer_pattern in line
        and not is_comment
        and i in model_fn_import_lines
    ):
        found_custom_import = True
        print(f"‚úÖ Found custom tokenizer import at line {i}")
        print(f"   Pattern: function-level import (CORRECT)")
        # Show context: two lines before and one line after the import.
        # `i` is 1-based, so the import itself is lines[i - 1].
        start = max(0, i - 3)
        end = min(len(lines), i + 2)
        print("\nContext:")
        for idx in range(start, end):
            marker = ">>>" if idx == i - 1 else "   "
            print(f"{marker} {idx + 1:4d}: {lines[idx]}")
        # Only the first function-level occurrence is reported.
        break

if found_custom_import:
    print("\n‚úÖ PASS: Custom tokenizer uses function-level import")
else:
    print("\n‚ö†Ô∏è  WARNING: Custom tokenizer import pattern not found")

## 4. Check for Tokenizer Directories

In [None]:
# Look for the two tokenizer directories that may sit next to the handler.
code_dir = Path(extract_dir) / "code"
custom_tokenizers_dir = code_dir / "custom_tokenizers"
tokenizers_dir = code_dir / "tokenizers"


def _print_entries(directory):
    """Print the name of every entry directly inside ``directory``."""
    for entry in directory.glob("*"):
        print(f"   - {entry.name}")


print("üìÅ Checking tokenizer directories...\n")

# custom_tokenizers/ is reported as a pass when it is present.
if not custom_tokenizers_dir.exists():
    print("‚úó custom_tokenizers/ NOT found")
else:
    print("‚úÖ custom_tokenizers/ exists")
    _print_entries(custom_tokenizers_dir)

print()

# tokenizers/ is reported as a pass when it is absent, because a local
# directory of that name shadows the HuggingFace tokenizers package.
if not tokenizers_dir.exists():
    print("‚úÖ tokenizers/ NOT found (good - no shadowing)")
else:
    print("‚ö†Ô∏è  tokenizers/ exists (may cause issues!)")
    _print_entries(tokenizers_dir)
    print("\n‚ö†Ô∏è  This directory shadows the HuggingFace tokenizers package!")
    print("   However, the fix should handle this correctly.")

## 5. Summary Report

In [None]:
banner = "=" * 70

print(f"\n{banner}")
print("ANALYSIS SUMMARY")
print(banner)

# Each entry pairs a human-readable label with the outcome computed by the
# earlier cells.
checks = [
    ("Module-level AutoTokenizer import removed", len(module_level_imports) == 0),
    ("Function-level AutoTokenizer imports present", function_level_count >= 2),
    ("Custom tokenizer uses function-level import", found_custom_import),
    ("Handler file exists", handler_path.exists()),
]

for check_name, result in checks:
    if result:
        print(f"‚úÖ PASS: {check_name}")
    else:
        print(f"‚ùå FAIL: {check_name}")

print(f"\n{banner}")

# The overall verdict passes only when no individual check failed.
all_passed = not any(not result for _, result in checks)

if all_passed:
    verdict_lines = [
        "‚úÖ ALL CHECKS PASSED - Fix is properly deployed!",
        "\nThe circular import fix has been successfully applied.",
        "transformer2risk/lstm2risk models will NOT import transformers.",
        "BERT models will import AutoTokenizer only when needed.",
    ]
else:
    verdict_lines = [
        "‚ö†Ô∏è  SOME CHECKS FAILED - Review the issues above",
        "\nThe fix may not be properly deployed.",
        "Consider re-running the packaging step.",
    ]

for verdict_line in verdict_lines:
    print(verdict_line)

print(banner)

## 6. View Specific Code Sections (Optional)

In [None]:
# View the model_fn tokenizer loading section
print("üìÑ model_fn() Tokenizer Loading Section:\n")

in_model_fn = False
in_tokenizer_section = False

for i, line in enumerate(lines, 1):
    if "def model_fn" in line:
        in_model_fn = True

    # Open the section only once, at the first marker line after model_fn
    # starts. Later lines that repeat the marker text must not re-print the
    # lead-in.
    if in_model_fn and not in_tokenizer_section and "Loading tokenizer" in line:
        in_tokenizer_section = True
        # Lead-in: up to four lines before the marker. `i` is 1-based, so the
        # marker itself is lines[i - 1]; it is printed by the block below and
        # is therefore excluded here. Labels are 1-based line numbers.
        for j in range(max(0, i - 5), i - 1):
            print(f"{j + 1:4d}: {lines[j]}")

    if in_tokenizer_section:
        print(f"{i:4d}: {line}")

        # Stop after Reconstruct pipelines section
        if "Reconstruct pipelines" in line:
            # Print up to three lines that follow it, then finish.
            for j in range(i, min(i + 3, len(lines))):
                print(f"{j + 1:4d}: {lines[j]}")
            break

if not in_tokenizer_section:
    print("(no 'Loading tokenizer' marker found inside model_fn)")

## 7. Cleanup (Optional)

In [None]:
# Uncomment to clean up extracted files
# shutil.rmtree(extract_dir)
# os.remove(local_tar_path)
# print("‚úì Cleaned up temporary files")