In [1]:
# ============================================================
# Run FastAPI Inference Server in Google Colab with ngrok
# Makes your model accessible via public URL
# ============================================================

# Install dependencies
!pip install -q fastapi uvicorn pyngrok nest-asyncio

import nest_asyncio
from pyngrok import ngrok
import uvicorn
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import time
from typing import List

In [2]:
# Allow nested async (required for Colab)
nest_asyncio.apply()

# ============================================================
# MODELS
# ============================================================
class GenerateRequest(BaseModel):
    prompt: str
    max_length: int = 150
    temperature: float = 0.7
    do_sample: bool = True

class GenerateResponse(BaseModel):
    generated_text: str
    inference_time: float

In [3]:
# ============================================================
# SETUP
# ============================================================
app = FastAPI(title="StarCoder2 API")

# Global variables
MODEL = None
TOKENIZER = None
USE_GPU = torch.cuda.is_available()

print(f"GPU Available: {USE_GPU}")

GPU Available: True


In [7]:
def load_model():
    global MODEL, TOKENIZER

    print("Loading model...")

    BASE_MODEL = "/content/drive/MyDrive/starcoder2-3b"
    LORA_PATH = "/content/starcoder-finetuned"

    # Tokenizer
    TOKENIZER = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    TOKENIZER.pad_token = TOKENIZER.eos_token

    # Model
    if USE_GPU:
        base_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True
        )
    else:
        base_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL,
            torch_dtype=torch.float32,
            trust_remote_code=True
        )

    # Load LoRA and merge for faster inference
    print("Merging LoRA weights...")
    MODEL = PeftModel.from_pretrained(base_model, LORA_PATH)
    MODEL = MODEL.merge_and_unload()
    MODEL.eval()

    print("✓ Model loaded!")

In [5]:

# ============================================================
# ENDPOINTS
# ============================================================
@app.get("/")
async def root():
    return {
        "message": "StarCoder2 Inference API",
        "endpoints": ["/generate", "/health"],
        "model": "starcoder2-3b-finetuned"
    }

@app.get("/health")
async def health():
    return {
        "status": "healthy",
        "model_loaded": MODEL is not None,
        "gpu": USE_GPU
    }

@app.post("/generate", response_model=GenerateResponse)
async def generate(request: GenerateRequest):
    if MODEL is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    start = time.time()

    try:
        inputs = TOKENIZER(request.prompt, return_tensors="pt")

        if USE_GPU:
            inputs = {k: v.to("cuda") for k, v in inputs.items()}

        with torch.no_grad():
            outputs = MODEL.generate(
                **inputs,
                max_length=request.max_length,
                temperature=request.temperature if request.do_sample else 1.0,
                do_sample=request.do_sample,
                pad_token_id=TOKENIZER.eos_token_id,
                use_cache=True,
            )

        text = TOKENIZER.decode(outputs[0], skip_special_tokens=True)
        elapsed = time.time() - start

        return GenerateResponse(generated_text=text, inference_time=elapsed)

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

In [8]:
# ============================================================
# START SERVER
# ============================================================
# Load model first
load_model()

# Setup ngrok tunnel (get free auth token from ngrok.com)
print("\n" + "="*60)
print("Setting up ngrok tunnel...")
print("="*60)

# Optional: Set your ngrok auth token
ngrok.set_auth_token("35xJxc60HL0DZahzW4SfIwyq7nK_6KHx5x9m9w7MWtngkgdMz")

# Start ngrok tunnel
public_url = ngrok.connect(8000)
print(f"\n✓ Public URL: {public_url}")
print(f"\nAPI Docs: {public_url}/docs")
print("="*60 + "\n")

# Start server
print("Starting FastAPI server...")
print("Press Ctrl+C to stop\n")

uvicorn.run(app, host="0.0.0.0", port=8000, log_level="info")

Loading model...
Merging LoRA weights...
✓ Model loaded!

Setting up ngrok tunnel...

✓ Public URL: NgrokTunnel: "https://capitalistic-subfusiform-iluminada.ngrok-free.dev" -> "http://localhost:8000"

API Docs: NgrokTunnel: "https://capitalistic-subfusiform-iluminada.ngrok-free.dev" -> "http://localhost:8000"/docs

Starting FastAPI server...
Press Ctrl+C to stop



RuntimeError: asyncio.run() cannot be called from a running event loop

In [None]:
# ============================================================
# USAGE FROM ANYWHERE
# ============================================================
import requests

# Your public URL from ngrok
API_URL = "https://xxxx-xx-xxx-xxx-xx.ngrok-free.app"

# Generate code
response = requests.post(
    f"{API_URL}/generate",
    json={
        "prompt": "def fibonacci(n):",
        "max_length": 150,
        "temperature": 0.7
    }
)

result = response.json()
print(f"Generated: {result['generated_text']}")
print(f"Time: {result['inference_time']}s")

In [9]:
# ============================================================
# Simple Fast Inference in Colab (No Server Setup)
# Just run this cell and use the functions directly
# ============================================================

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import time

print("="*60)
print("LOADING MODEL FOR FAST INFERENCE")
print("="*60 + "\n")

# Configuration
BASE_MODEL = "/content/drive/MyDrive/starcoder2-3b"
LORA_PATH = "/content/starcoder-finetuned"

# Check GPU
USE_GPU = torch.cuda.is_available()
print(f"Using: {'GPU ✓' if USE_GPU else 'CPU ⚠'}\n")

# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Load model
print("Loading model...")
if USE_GPU:
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
        low_cpu_mem_usage=True
    )
else:
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.float32,
        trust_remote_code=True,
        low_cpu_mem_usage=True
    )

# Load LoRA weights
print("Loading fine-tuned weights...")
model = PeftModel.from_pretrained(base_model, LORA_PATH)

# CRITICAL: Merge LoRA weights for faster inference
print("Merging weights (this makes inference much faster)...")
model = model.merge_and_unload()
model.eval()

print("\n" + "="*60)
print("✓ MODEL READY FOR FAST INFERENCE!")
print("="*60 + "\n")

# ============================================================
# INFERENCE FUNCTIONS
# ============================================================

def generate(prompt, max_length=150, temperature=0.7, fast=True):
    """
    Generate code from a prompt

    Args:
        prompt: Code prompt
        max_length: Maximum length of generated code
        temperature: Sampling temperature (ignored if fast=True)
        fast: Use fast greedy decoding (recommended)

    Returns:
        Generated code as string
    """
    start_time = time.time()

    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt")
    if USE_GPU:
        inputs = {k: v.to("cuda") for k, v in inputs.items()}

    # Generate
    with torch.no_grad():
        if fast:
            # Fast mode: greedy decoding, no sampling
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
                use_cache=True,
            )
        else:
            # Quality mode: sampling enabled
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                temperature=temperature,
                do_sample=True,
                top_p=0.95,
                pad_token_id=tokenizer.eos_token_id,
                use_cache=True,
            )

    # Decode
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    elapsed = time.time() - start_time

    print(f"⏱️  Generation time: {elapsed:.2f}s")
    return result

def generate_batch(prompts, max_length=150, fast=True):
    """
    Generate code for multiple prompts at once (faster!)

    Args:
        prompts: List of code prompts
        max_length: Maximum length for each generation
        fast: Use fast greedy decoding

    Returns:
        List of generated code strings
    """
    start_time = time.time()

    # Tokenize all prompts
    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True
    )
    if USE_GPU:
        inputs = {k: v.to("cuda") for k, v in inputs.items()}

    # Generate batch
    with torch.no_grad():
        if fast:
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
                use_cache=True,
            )
        else:
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                temperature=0.7,
                do_sample=True,
                top_p=0.95,
                pad_token_id=tokenizer.eos_token_id,
                use_cache=True,
            )

    # Decode all
    results = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    elapsed = time.time() - start_time

    print(f"⏱️  Batch generation time: {elapsed:.2f}s")
    print(f"⏱️  Average per prompt: {elapsed/len(prompts):.2f}s")
    return results

# ============================================================
# EXAMPLES
# ============================================================

print("="*60)
print("EXAMPLE USAGE")
print("="*60 + "\n")

# Example 1: Single generation
print("Example 1: Single generation")
print("-" * 60)
prompt1 = "def fibonacci(n):"
print(f"Prompt: {prompt1}\n")
result1 = generate(prompt1, max_length=150)
print(f"\nGenerated:\n{result1}")
print("="*60 + "\n")

# Example 2: Batch generation (faster for multiple prompts)
print("Example 2: Batch generation")
print("-" * 60)
prompts = [
    "def calculate_sum(numbers):",
    "function sortArray(arr) {",
    "class DatabaseManager:",
]
print(f"Generating {len(prompts)} prompts...\n")
results = generate_batch(prompts, max_length=120)

for i, (prompt, result) in enumerate(zip(prompts, results), 1):
    print(f"\n{i}. Prompt: {prompt}")
    print(f"   Result: {result[:80]}...")

print("\n" + "="*60)
print("✓ ALL DONE!")
print("="*60)
print("""
Now you can use:
  - generate(prompt)           # Single generation
  - generate_batch([prompts])  # Batch generation (faster)

Examples:
  result = generate("def quicksort(arr):")
  results = generate_batch(["def func1():", "def func2():"])
""")
print("="*60)

LOADING MODEL FOR FAST INFERENCE

Using: GPU ✓

Loading tokenizer...
Loading model...
Loading fine-tuned weights...
Merging weights (this makes inference much faster)...

✓ MODEL READY FOR FAST INFERENCE!

EXAMPLE USAGE

Example 1: Single generation
------------------------------------------------------------
Prompt: def fibonacci(n):



Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


⏱️  Generation time: 6.26s

Generated:
def fibonacci(n):
    if n == 0:
        return 0
    elif n == 1:
        return 1
    else:
        return fibonacci(n-1) + fibonacci(n-2)

print(fibonacci(10))

# +
# 10. Write a Python program to find the sum of the first n positive integers.
# -

def sum_of_first_n_positive_integers(n):
    if n == 0:
        return 0
    else:
        return n + sum_of_first_n_positive_integers(n-1)
print(sum_of_first_n_

Example 2: Batch generation
------------------------------------------------------------
Generating 3 prompts...

⏱️  Batch generation time: 4.66s
⏱️  Average per prompt: 1.55s

1. Prompt: def calculate_sum(numbers):
   Result: def calculate_sum(numbers):
    return sum(numbers)

def calculate_product(numbe...

2. Prompt: function sortArray(arr) {
   Result: function sortArray(arr) {
  // sort the array
  arr.sort(function(a, b) {
    re...

3. Prompt: class DatabaseManager:
   Result: class DatabaseManager:
	def __init__(self):
		self.db =