# PTX Kernel Execution Time Prediction using GPT-4

This notebook uses OpenAI's GPT-4 to analyze PTX (Parallel Thread Execution) assembly code and predict execution time for different GPU architectures.

## Features:
- PTX assembly code analysis
- Multi-GPU execution time prediction (RTX_2080_Ti, TITAN_V, RTX_4070, GTX_TITAN_X)
- GPU-specific performance analysis
- Batch processing of multiple kernels
- Automatic GPU classification from PTX filenames


## Setup and Dependencies


In [None]:
# Install required packages
# NOTE: the leading `!` is notebook (IPython/Colab) shell-escape syntax; this
# line is not valid in a plain Python script.
!pip install openai pandas matplotlib seaborn numpy


In [None]:
# Colab-only: make Google Drive available under /content/drive so the
# kernel sources and the output folder configured below can be reached.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
import os
# These two variables are read later by get_config_paths(); when they are
# unset, that function falls back to its own hard-coded default paths.
os.environ['KERNELS_DIR'] = '/content/drive/MyDrive/kernels_src'
os.environ['OUTPUT_DIR']  = '/content/drive/MyDrive'


In [None]:
import openai
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from typing import Dict, List, Optional, Tuple
import re
from datetime import datetime
from pathlib import Path
from collections import defaultdict

# Google Colab setup for API key
# Resolution order:
#   1. Colab secrets (only when google.colab can be imported)
#   2. OPENAI_API_KEY environment variable (when google.colab is missing)
#   3. Interactive getpass prompt (when the environment variable is empty)
# Any other failure (for example userdata.get raising) leaves api_key = None;
# in that case the environment/getpass fallback is NOT attempted.
try:
    from google.colab import userdata
    api_key = userdata.get('OPENAI_API_KEY')
    print("‚úÖ API key loaded from Google Colab secrets")
except ImportError:
    # Not running inside Colab: fall back to the environment, then to a prompt
    api_key = os.getenv('OPENAI_API_KEY')
    if not api_key:
        import getpass
        api_key = getpass.getpass("Enter your OpenAI API key: ")
    print("‚úÖ API key loaded from environment/manual input")
except Exception as e:
    print(f"‚ùå Error loading API key: {e}")
    api_key = None

# Set up OpenAI client
# `client` is a module-level global; analyze_ptx_with_gpt4 checks it for None
# before issuing any request.
if api_key:
    client = openai.OpenAI(api_key=api_key)
    print("üöÄ OpenAI client initialized successfully!")
else:
    print("‚ö†Ô∏è  OpenAI client not initialized - please check your API key setup")
    client = None

print("Setup complete!")


## GPU Architecture Specifications


In [None]:
# Hardware description of every GPU the notebook can target, keyed by the GPU
# model string that appears in the PTX file names.
# Every entry carries the same set of keys so consumers can rely on them.
# NOTE: 'memory_bandwidth_gbps' holds values in GB/s (that is how the analysis
# prompt labels it); the key name is kept for backward compatibility.
GPU_SPECS = {
    'RTX_2080_Ti': {
        'architecture': 'Turing',
        'sm_count': 68,
        'cuda_cores': 4352,
        'tensor_cores': 544,
        'base_clock_mhz': 1350,
        'boost_clock_mhz': 1545,
        'memory_gb': 11,
        'memory_bandwidth_gbps': 616,
        'l2_cache_mb': 5.5,
        'compute_capability': '7.5',
        'max_threads_per_block': 1024,
        # Compute capability 7.5 allows 1024 resident threads per SM
        'max_threads_per_sm': 1024,
        'warp_size': 32
    },
    'TITAN_V': {
        'architecture': 'Volta',
        'sm_count': 80,
        'cuda_cores': 5120,
        'tensor_cores': 640,
        'base_clock_mhz': 1200,
        'boost_clock_mhz': 1455,
        'memory_gb': 12,
        'memory_bandwidth_gbps': 653,
        'l2_cache_mb': 4.5,
        'compute_capability': '7.0',
        'max_threads_per_block': 1024,
        'max_threads_per_sm': 2048,
        'warp_size': 32
    },
    'RTX_4070': {
        'architecture': 'Ada Lovelace',
        'sm_count': 46,
        'cuda_cores': 5888,
        'tensor_cores': 184,
        'base_clock_mhz': 1920,
        'boost_clock_mhz': 2475,
        'memory_gb': 12,
        'memory_bandwidth_gbps': 504,
        'l2_cache_mb': 36,
        'compute_capability': '8.9',
        'max_threads_per_block': 1024,
        # Compute capability 8.9 allows 1536 resident threads per SM (not 2048)
        'max_threads_per_sm': 1536,
        'warp_size': 32
    },
    'GTX_TITAN_X': {
        'architecture': 'Maxwell',
        'sm_count': 24,
        'cuda_cores': 3072,
        # Maxwell has no tensor cores; the key is present for schema consistency
        'tensor_cores': 0,
        'base_clock_mhz': 1000,
        'boost_clock_mhz': 1075,
        'memory_gb': 12,
        'memory_bandwidth_gbps': 336,
        'l2_cache_mb': 3,
        'compute_capability': '5.2',
        'max_threads_per_block': 1024,
        'max_threads_per_sm': 2048,
        'warp_size': 32
    }
}

print("GPU specifications loaded:")
for gpu, specs in GPU_SPECS.items():
    print(f"  {gpu}: {specs['architecture']} - {specs['cuda_cores']} cores")


## PTX File Processing Functions


In [None]:
def parse_ptx_filename(filename: str) -> Optional[Tuple[str, str, str]]:
    """
    Split a PTX file name into folder id, kernel id and GPU model.

    Expected layout: kernel_{folder_id}_{kernel_id}_{GPU_MODEL}.ptx
    Both ids must be purely numeric; the GPU model is everything after the
    second id and must be non-empty.
    Returns: (folder_id, kernel_id, gpu_model), or None when the name does
    not follow that layout.
    """
    # Every occurrence of '.ptx' is dropped, not only a trailing one
    stem = filename.replace('.ptx', '')
    found = re.match(r'kernel_(\d+)_(\d+)_(.+)', stem)
    if found is None:
        return None
    folder_id, kernel_id, gpu_model = found.groups()
    return (folder_id, kernel_id, gpu_model)

def classify_ptx_files(kernels_dir: Path) -> Dict[Tuple[str, str, str], Dict[str, Path]]:
    """
    Walk kernels_dir recursively and group its PTX files by kernel and GPU.

    Files whose names parse_ptx_filename cannot parse are skipped. The dataset
    label is 'test' or 'validation' when a path component has exactly that
    name ('test' wins if both occur) and 'unknown' otherwise. A summary of
    what was found is printed.
    Returns: {(dataset, folder_id, kernel_id): {gpu_model: ptx_file_path}}
    """
    grouped = defaultdict(lambda: {})

    for path in kernels_dir.rglob('*.ptx'):
        fields = parse_ptx_filename(path.name)
        if fields is None:
            continue
        folder_id, kernel_id, gpu_model = fields

        # Derive the dataset label from the directory components of the path
        components = path.parts
        dataset = 'unknown'
        for candidate in ('test', 'validation'):
            if candidate in components:
                dataset = candidate
                break

        grouped[(dataset, folder_id, kernel_id)][gpu_model] = path

    print(f"Found {len(grouped)} unique kernels with PTX files")

    # Histogram: number of GPU variants -> number of kernels with that many
    histogram = defaultdict(int)
    for variants in grouped.values():
        histogram[len(variants)] += 1

    print("GPU distribution per kernel:")
    for count in sorted(histogram):
        print(f"  {count} GPU(s): {histogram[count]} kernels")

    return grouped

print("PTX processing functions defined!")


## GPT-4 PTX Analysis Functions


In [None]:
def create_ptx_analysis_prompt(ptx_code: str, gpu_model: str, grid_dim: Tuple[int, int, int] = (1, 1, 1),
                               block_dim: Tuple[int, int, int] = (256, 1, 1),
                               data_size: Optional[int] = None) -> str:
    """
    Build the user prompt that asks the model to analyze one PTX kernel.

    The prompt contains the target GPU's entry from GPU_SPECS ('Unknown' for
    any missing field or unknown model), the `.version` / `.target` directives
    found in the PTX, the launch configuration, at most the first 5000
    characters of the PTX text, and the JSON schema the answer must follow.
    The data-size line is left empty when data_size is None or 0.
    """
    specs = GPU_SPECS.get(gpu_model, {})

    # Pull the PTX version and target directives out of the source text
    version_found = re.search(r'\.version\s+(\d+\.\d+)', ptx_code)
    target_found = re.search(r'\.target\s+(.+)', ptx_code)
    ptx_version = 'Unknown' if version_found is None else version_found.group(1)
    ptx_target = 'Unknown' if target_found is None else target_found.group(1)

    # Cap the embedded PTX at 5000 characters to stay within token limits
    ptx_preview = ptx_code[:5000]

    total_threads = (grid_dim[0] * grid_dim[1] * grid_dim[2]
                     * block_dim[0] * block_dim[1] * block_dim[2])
    data_size_line = f'Data Size: {data_size} elements' if data_size else ''

    return f"""
You are an expert CUDA PTX (Parallel Thread Execution) assembly code analyst. Analyze the following PTX assembly code and provide detailed performance predictions for the specified GPU architecture.

**Target GPU Specifications:**
Model: {gpu_model}
Architecture: {specs.get('architecture', 'Unknown')}
SM Count: {specs.get('sm_count', 'Unknown')}
CUDA Cores: {specs.get('cuda_cores', 'Unknown')}
Memory Bandwidth: {specs.get('memory_bandwidth_gbps', 'Unknown')} GB/s
L2 Cache: {specs.get('l2_cache_mb', 'Unknown')} MB
Compute Capability: {specs.get('compute_capability', 'Unknown')}
Max Threads per SM: {specs.get('max_threads_per_sm', 'Unknown')}

**PTX Information:**
PTX Version: {ptx_version}
PTX Target: {ptx_target}

**Launch Configuration:**
Grid Dimensions: {grid_dim}
Block Dimensions: {block_dim}
Total Threads: {total_threads}
{data_size_line}

**PTX Assembly Code:**
```ptx
{ptx_preview}
```

**Please provide a comprehensive analysis in the following JSON format:**

{{
    "execution_time_estimate": {{
        "microseconds_min": <minimum_estimate>,
        "microseconds_max": <maximum_estimate>,
        "microseconds_typical": <typical_estimate>,
        "confidence_level": "<high/medium/low>"
    }},
    "performance_analysis": {{
        "primary_bottleneck": "<memory/compute/divergence/occupancy>",
        "bottleneck_explanation": "<detailed_explanation>",
        "arithmetic_intensity": <ops_per_byte>,
        "memory_pattern": "<coalesced/strided/random/complex>",
        "branch_divergence_risk": "<none/low/medium/high>",
        "occupancy_estimate": "<percentage>"
    }},
    "resource_utilization": {{
        "registers_per_thread": <estimate>,
        "shared_memory_per_block_bytes": <estimate>,
        "global_memory_transactions": <estimate>,
        "instruction_count_estimate": <estimate>
    }},
    "ptx_analysis": {{
        "instruction_count": <count>,
        "memory_instructions": <count>,
        "compute_instructions": <count>,
        "synchronization_instructions": <count>,
        "complexity_score": <0-100>
    }},
    "optimization_suggestions": [
        {{
            "category": "<memory/computation/occupancy/algorithm>",
            "suggestion": "<detailed_suggestion>",
            "expected_improvement": "<percentage_or_description>",
            "difficulty": "<easy/medium/hard>"
        }}
    ]
}}

Focus on:
1. Analyzing PTX instruction patterns and their performance implications
2. Realistic timing estimates based on the specific GPU architecture and PTX instructions
3. Identification of performance bottlenecks from assembly-level analysis
4. Memory access pattern analysis from PTX load/store instructions
5. Occupancy and resource utilization estimation
6. Actionable optimization recommendations based on PTX code structure
"""

def analyze_ptx_with_gpt4(ptx_code: str, gpu_model: str, grid_dim: Tuple[int, int, int] = (1, 1, 1),
                          block_dim: Tuple[int, int, int] = (256, 1, 1),
                          data_size: Optional[int] = None,
                          model: str = "gpt-4o") -> Dict:
    """
    Analyze PTX assembly code with an OpenAI chat model and return the parsed
    performance prediction.

    Args:
        ptx_code: Full text of the PTX file.
        gpu_model: Target GPU name (looked up in GPU_SPECS by the prompt builder).
        grid_dim: Launch grid dimensions reported to the model.
        block_dim: Launch block dimensions reported to the model.
        data_size: Optional element count reported to the model.
        model: Name of the chat model to query.

    Returns:
        On success, the JSON object produced by the model plus a '_metadata'
        entry (timestamp, model, launch parameters and the raw response text).
        On any failure, a dict with an 'error' message; it also carries
        'raw_response' whenever a response was received, so malformed answers
        can be inspected afterwards. This function never raises for request
        or parsing failures.
    """
    if client is None:
        return {'error': 'OpenAI client not initialized. Please check your API key setup.'}

    # Best-effort boundary: a failed request must not abort a whole batch run,
    # so every exception from prompt building or the API call becomes an error dict.
    try:
        prompt = create_ptx_analysis_prompt(ptx_code, gpu_model, grid_dim, block_dim, data_size)

        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are an expert CUDA PTX assembly code analyst with deep knowledge of GPU architectures, instruction-level performance, and optimization techniques. Provide accurate, detailed analysis in the requested JSON format."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.1,
            max_tokens=3000
        )

        content = response.choices[0].message.content
    except Exception as e:
        return {'error': str(e)}

    # The message content can be missing or empty; report that explicitly
    # instead of letting the regex search fail on a non-string.
    if not content:
        return {'error': 'Empty response from model', 'raw_response': content}

    # Extract JSON from response: take the span from the first '{' to the last '}'
    json_match = re.search(r'\{.*\}', content, re.DOTALL)
    if json_match is None:
        return {'error': 'Could not parse JSON from response', 'raw_response': content}

    try:
        analysis = json.loads(json_match.group(0))
    except json.JSONDecodeError as e:
        # Keep the raw text so an invalid or truncated answer is not lost
        return {'error': f'Invalid JSON in response: {e}', 'raw_response': content}

    analysis['_metadata'] = {
        'timestamp': datetime.now().isoformat(),
        'model': model,
        'gpu_target': gpu_model,
        'grid_dim': grid_dim,
        'block_dim': block_dim,
        'data_size': data_size,
        'raw_response': content
    }
    return analysis

print("PTX analysis functions defined!")


In [None]:
def analyze_ptx_files_in_directory(
    kernels_dir: Path,
    grid_dim: Tuple[int, int, int] = (1, 1, 1),
    block_dim: Tuple[int, int, int] = (256, 1, 1),
    data_size: Optional[int] = None,
    model: str = "gpt-4o",
    max_kernels: Optional[int] = None
) -> Dict:
    """
    Run the PTX analysis over every kernel found under kernels_dir.

    Kernels are discovered with classify_ptx_files; when max_kernels is set to
    a non-zero value, only that many kernels (in discovery order) are
    processed. Each PTX file is read and passed to analyze_ptx_with_gpt4.
    Returns a dict with '_batch_metadata' and 'analyses', where 'analyses'
    maps "dataset/folder_id/kernel_id" to
    {ptx path relative to kernels_dir: analysis or {'error': message}}.
    """
    ptx_files = classify_ptx_files(kernels_dir)

    batch_metadata = {
        "timestamp": datetime.now().isoformat(),
        "directory": str(kernels_dir.resolve()),
        "grid_dim": grid_dim,
        "block_dim": block_dim,
        "data_size": data_size,
        "model": model
    }
    results = {"_batch_metadata": batch_metadata, "analyses": {}}

    # Optionally restrict the run to the first max_kernels kernels
    selected = list(ptx_files.keys())
    if max_kernels:
        selected = selected[:max_kernels]
    total = len(selected)

    for position, kernel in enumerate(selected, 1):
        dataset, folder_id, kernel_id = kernel
        per_gpu = ptx_files[kernel]

        print(f"\n[{position}/{total}] Processing kernel: {dataset}/{folder_id}/{kernel_id}")
        print(f"  Found PTX files for {len(per_gpu)} GPU(s): {list(per_gpu.keys())}")

        kernel_results = {}

        for gpu_model, ptx_path in per_gpu.items():
            print(f"    Analyzing {gpu_model} PTX...")

            try:
                with open(ptx_path, 'r', errors='ignore') as handle:
                    ptx_code = handle.read()

                analysis = analyze_ptx_with_gpt4(
                    ptx_code=ptx_code,
                    gpu_model=gpu_model,
                    grid_dim=grid_dim,
                    block_dim=block_dim,
                    data_size=data_size,
                    model=model
                )

                # Results are keyed by the PTX path relative to kernels_dir
                kernel_results[str(ptx_path.relative_to(kernels_dir))] = analysis

                if 'error' in analysis:
                    print(f"      ‚ùå Error: {analysis.get('error', 'Unknown error')}")
                else:
                    timing = analysis.get('execution_time_estimate', {})
                    typical_time = timing.get('microseconds_typical', 'N/A')
                    print(f"      ‚úÖ Predicted time: {typical_time} Œºs")

            except Exception as e:
                # A failure on one file is recorded and the batch continues
                print(f"      ‚ùå Exception: {e}")
                kernel_results[str(ptx_path.relative_to(kernels_dir))] = {'error': str(e)}

        results["analyses"][f"{dataset}/{folder_id}/{kernel_id}"] = kernel_results

    return results

print("Batch analysis function defined!")


## Configuration and Execution


In [None]:
# Paths and configuration
import os
from pathlib import Path

def get_config_paths():
    """
    Read the kernel and output directories from the environment.

    KERNELS_DIR and OUTPUT_DIR are used when set; otherwise the hard-coded
    defaults below apply. The function only reports whether each path exists
    (printing a warning for a missing one); it creates nothing.
    Returns: (kernels_dir, output_dir) as Path objects.
    """
    kernels_dir = Path(os.getenv("KERNELS_DIR", "/Users/james/GPU-Project/src/kernels_src"))
    output_dir = Path(os.getenv("OUTPUT_DIR", "/Users/james/GPU-Project/src/LLM_baseline"))

    kernels_present = kernels_dir.exists()
    output_present = output_dir.exists()

    print("Configuration paths:")
    print(f"  KERNELS_DIR: {kernels_dir} (exists: {kernels_present})")
    print(f"  OUTPUT_DIR : {output_dir} (exists: {output_present})")

    # Collect the warnings first, then emit them in a fixed order
    warnings_to_show = []
    if not kernels_present:
        warnings_to_show.append("‚ö†Ô∏è  KERNELS_DIR does not exist. Please create it or set KERNELS_DIR.")
    if not output_present:
        warnings_to_show.append("‚ö†Ô∏è  OUTPUT_DIR does not exist. It will be created if possible.")
    for message in warnings_to_show:
        print(message)

    return kernels_dir, output_dir

kernels_dir, output_dir = get_config_paths()


In [None]:
# Launch geometry and limits applied to every kernel in the batch run
batch_grid_dim = (1, 1, 1)
batch_block_dim = (256, 1, 1)
batch_data_size = None  # optional
max_kernels_to_process = None  # Set to a number to limit processing, None for all

# Echo the configuration so it is recorded in the notebook output
print(
    "Batch analysis configuration:",
    f"  Grid Dimensions: {batch_grid_dim}",
    f"  Block Dimensions: {batch_block_dim}",
    f"  Data Size: {batch_data_size}",
    f"  Max Kernels: {max_kernels_to_process or 'All'}",
    sep="\n",
)


In [None]:
# Run batch analysis
# Uses kernels_dir / output_dir from get_config_paths() and the batch_*
# settings defined in the configuration cell above.
print("Starting batch PTX analysis...")
print("=" * 80)

all_results = analyze_ptx_files_in_directory(
    kernels_dir=kernels_dir,
    grid_dim=batch_grid_dim,
    block_dim=batch_block_dim,
    data_size=batch_data_size,
    model="gpt-4o",
    max_kernels=max_kernels_to_process
)

# Save results
# The output directory is created on demand; the file name carries a
# timestamp so repeated runs do not overwrite each other.
output_dir.mkdir(parents=True, exist_ok=True)
output_name = f"ptx_kernels_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
output_path = output_dir / output_name

with open(output_path, 'w') as f:
    json.dump(all_results, f, indent=2)

print(f"\nüíæ Batch analysis saved to: {output_path}")
print(f"üìä Analyzed {len(all_results['analyses'])} kernels")


## Results Analysis and Visualization


In [None]:
# Load and analyze results
def analyze_ptx_results(results_file: Path):
    """
    Load a saved batch-analysis JSON file, plot the predicted execution times
    and print summary statistics per GPU.

    Args:
        results_file: Path to a JSON file with an 'analyses' mapping of
            kernel key -> {ptx path: analysis}, as written by the batch run.

    Returns:
        defaultdict mapping GPU model -> list of 'microseconds_typical'
        predictions. Entries with an 'error' key, an unrecognized GPU in the
        path, or a missing, zero or non-numeric prediction are skipped.
    """
    with open(results_file, 'r') as f:
        results = json.load(f)

    # The GPU model is recovered from the PTX file name suffix
    gpu_pattern = re.compile(r'_(RTX_2080_Ti|TITAN_V|RTX_4070|GTX_TITAN_X)\.ptx')

    # Extract predictions by GPU
    gpu_predictions = defaultdict(list)

    for kernel_analyses in results['analyses'].values():
        for ptx_path, analysis in kernel_analyses.items():
            if 'error' in analysis:
                continue

            gpu_match = gpu_pattern.search(ptx_path)
            if gpu_match is None:
                continue

            # The analysis is model-generated JSON, so its shape is not
            # guaranteed: the timing entry may be null and the estimate may be
            # a string. Only real numbers are kept; anything else would break
            # the numpy statistics and the plots below.
            timing = analysis.get('execution_time_estimate', {})
            if not isinstance(timing, dict):
                continue
            typical_time = timing.get('microseconds_typical')
            if isinstance(typical_time, bool) or not isinstance(typical_time, (int, float)):
                continue
            if typical_time:
                gpu_predictions[gpu_match.group(1)].append(typical_time)

    # Create visualization
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    # Box plot by GPU, in GPU_SPECS order, restricted to GPUs that have data
    ax1 = axes[0]
    gpu_labels = [gpu for gpu in GPU_SPECS if gpu in gpu_predictions]
    gpu_data = [gpu_predictions[gpu] for gpu in gpu_labels]

    if gpu_data:
        ax1.boxplot(gpu_data, labels=gpu_labels)
        ax1.set_ylabel('Predicted Execution Time (Œºs)')
        ax1.set_title('PTX Execution Time Predictions by GPU')
        ax1.tick_params(axis='x', rotation=45)
        ax1.grid(axis='y', alpha=0.3)

    # Histogram over all predictions, regardless of GPU
    ax2 = axes[1]
    all_times = [t for times in gpu_predictions.values() for t in times]

    if all_times:
        ax2.hist(all_times, bins=30, alpha=0.7, edgecolor='black')
        ax2.set_xlabel('Predicted Execution Time (Œºs)')
        ax2.set_ylabel('Frequency')
        ax2.set_title('Distribution of PTX Execution Time Predictions')
        ax2.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Print statistics
    print("\nüìä Prediction Statistics by GPU:")
    for gpu in gpu_labels:
        times = gpu_predictions[gpu]
        print(f"\n  {gpu}:")
        print(f"    Count: {len(times)}")
        print(f"    Mean: {np.mean(times):.2f} Œºs")
        print(f"    Median: {np.median(times):.2f} Œºs")
        print(f"    Min: {np.min(times):.2f} Œºs")
        print(f"    Max: {np.max(times):.2f} Œºs")

    return gpu_predictions

# Uncomment to analyze results
# analyze_ptx_results(output_path)
