In [None]:
%%capture
# Install Unsloth together with a pinned xformers build; %%capture suppresses the pip output of this cell.
!pip install unsloth "xformers==0.0.28.post2"
# Replace the released Unsloth package with a fresh build installed straight from the GitHub repository.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [None]:
# Install the API-serving packages (FastAPI, uvicorn, pyngrok, nest_asyncio, python-multipart) and the model libraries.
# Versions are not pinned here, so the resolved versions depend on when the cell is run.
!pip install fastapi nest_asyncio uvicorn pyngrok diffusers transformers torch accelerate python-multipart

Collecting fastapi
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.8-py3-none-any.whl.metadata (10 kB)
Collecting python-multipart
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting starlette<0.47.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.46.2-py3-none-any.whl.metadata (6.2 kB)
Downloading fastapi-0.115.12-py3-none-any.whl (95 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m95.2/95.2 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading uvicorn-0.34.2-py3-none-any.whl (62 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m62.5/62.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok

In [None]:
# Store the ngrok authtoken in ngrok's configuration file.
# NOTE(review): no token follows the sub-command here — presumably removed before sharing; append your own token.
!ngrok config add-authtoken

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
import os

from unsloth import FastLanguageModel
import torch

# Use half precision when a CUDA device is available; otherwise fall back to float32.
dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Enable 4-bit quantization only when a GPU is present (reduces VRAM usage).
load_in_4bit = torch.cuda.is_available()

# Hugging Face repository of the fine-tuned model and tokenizer.
model_name = "sarmadsiddiqui29/Llama-3.1-8B-Instruct-Urdu-Story"

# Read the Hugging Face access token from the HF_TOKEN environment variable
# instead of hardcoding it in the notebook. An unset or empty variable becomes
# None, which leaves credential resolution to the library's own defaults.
hf_token = os.environ.get("HF_TOKEN") or None

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=hf_token,
)

# Device string used later to place tokenized inputs; the model itself is not
# moved by this cell.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Report the configuration that was selected above.
print(f"Model loaded on {device} with dtype={dtype} (4-bit={load_in_4bit})")
# Put the model into Unsloth's inference mode before it is used for generation.
FastLanguageModel.for_inference(model)

In [None]:
import torch
import re
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from unsloth import FastLanguageModel
import uvicorn
import os
import nest_asyncio
from pyngrok import ngrok
import asyncio
from threading import Thread # Import Thread

# Apply nest_asyncio to allow running asyncio event loop in environments like notebooks
nest_asyncio.apply()

# ----------------------------
# Helper Functions
# ----------------------------

def fix_spacing(text):
    """Insert a space between every pair of adjacent Arabic-script characters.

    The character class is spelled with explicit code-point escapes
    (U+0600 to U+06FF, the Arabic Unicode block used for Urdu) so the pattern
    does not depend on how this source file happens to be encoded. Characters
    outside that block (ASCII, accented Latin, digits, punctuation) are never
    separated.

    Args:
        text: Input string; may be empty.

    Returns:
        A copy of ``text`` with a single space inserted at every position that
        has an Arabic-block character on both sides.
    """
    # Zero-width lookbehind/lookahead: only the gap between two Arabic-block
    # characters matches, so no characters are consumed or removed.
    return re.sub(r'(?<=[\u0600-\u06FF])(?=[\u0600-\u06FF])', ' ', text)

def extract_text_after_last_story(text: str) -> str:
    """Return the text following the last "Story:" marker, cut at the final Urdu full stop.

    The marker is matched case-insensitively. The Urdu full stop is written as
    the escape ``\\u06d4`` so the delimiter stays a single, correctly encoded
    character regardless of the source file's encoding, and the slice length is
    derived from the delimiter rather than assumed.

    Args:
        text: Decoded model output that may contain one or more "Story:" markers.

    Returns:
        The stripped text between the last marker and the last Urdu full stop
        (inclusive). An empty string is returned when there is no marker or
        when no full stop follows the last marker.
    """
    urdu_full_stop = "\u06d4"

    marker_ends = [m.end() for m in re.finditer(r'(?i)Story:', text)]
    if not marker_ends:
        return ""

    tail = text[marker_ends[-1]:].strip()
    stop_at = tail.rfind(urdu_full_stop)
    if stop_at == -1:
        # No complete sentence after the marker: report nothing rather than a fragment.
        return ""

    # Keep the full stop itself so the result ends on a finished sentence.
    return tail[:stop_at + len(urdu_full_stop)].strip()

def remove_duplicate_sentences(text: str) -> str:
    """Drop repeated sentences, treating the Urdu full stop as the sentence delimiter.

    The delimiter is written as the escape ``\\u06d4`` so that splitting and
    re-joining use the real single-character Urdu full stop regardless of the
    source file's encoding.

    Args:
        text: Story text whose sentences end with the Urdu full stop.

    Returns:
        The unique, whitespace-stripped sentences in first-seen order, joined
        with a full stop plus a space and terminated by a full stop. An empty
        string is returned when ``text`` contains no non-blank sentence.
    """
    urdu_full_stop = "\u06d4"

    stripped = (part.strip() for part in text.split(urdu_full_stop))
    # dict.fromkeys keeps the first occurrence of each sentence and preserves order.
    unique_sentences = list(dict.fromkeys(s for s in stripped if s))
    if not unique_sentences:
        return ""
    return f"{urdu_full_stop} ".join(unique_sentences) + urdu_full_stop

# ----------------------------
# Story Generation Function
# ----------------------------


def generate_story_outline(concept: str, initial_story: str = "", max_steps: int = 9) -> str:
    """
    Generate an Urdu story iteratively using a fixed sequence of step prompts.

    Each step builds a prompt from ``concept`` and the text extracted in the
    previous step, samples a continuation from the module-level ``model``,
    keeps the part after the last "Story:" marker, and appends it to the
    accumulated story. Duplicate sentences are removed before returning.

    Steps 1-8 each have their own prompt. Step 9 and every later step (when
    ``max_steps`` is greater than 9) use the final "conclude the story" prompt
    rebuilt from the current story text, instead of silently re-sending a
    stale prompt.

    Args:
        concept: Core idea the story is based on; interpolated into every prompt.
        initial_story: Optional existing story text to start from.
        max_steps: Number of generation steps to run. Values below 1 run no
            steps and return the de-duplicated ``initial_story``.

    Returns:
        The accumulated story with duplicate sentences removed, or an
        explanatory message string when the model, tokenizer or device is None.
    """
    if model is None or tokenizer is None or device is None:
        return "Model or tokenizer not loaded. Cannot generate story. Please check your environment setup and model paths/permissions."

    story_text = initial_story
    complete_story = initial_story

    for step in range(1, max_steps + 1):
        if step == 1:
            template = f"""
**üí° ÿ®ŸÜ€åÿßÿØ€å ÿÆ€åÿßŸÑ:** {concept}

⁄©€ÅÿßŸÜ€å:
[üîπ ÿ¢ÿ∫ÿßÿ≤ ⁄©ÿ±€å⁄∫:
- ÿ¨ŸÖŸÑ€í ⁄Ü⁄æŸàŸπ€í ÿßŸàÿ± ÿ®ÿ±ÿß€Å ÿ±ÿßÿ≥ÿ™ €ÅŸà⁄∫€î
- ⁄©€ÅÿßŸÜ€å ŸÖ€å⁄∫ ÿßÿ≥ÿ±ÿßÿ± ÿßŸàÿ± ÿØŸÑ⁄Üÿ≥Ÿæ€å Ÿæ€åÿØÿß ⁄©ÿ±€å⁄∫€î
- ÿµÿ±ŸÅ ÿßÿ±ÿØŸà ŸÖ€å⁄∫ ⁄©€ÅÿßŸÜ€å ÿ¥ÿ±Ÿàÿπ ⁄©ÿ±€å⁄∫€î]

üìú **ÿ®ÿ±ÿß€Å ⁄©ÿ±ŸÖ ⁄©€ÅÿßŸÜ€å ⁄©ÿß ÿØŸÑ⁄©ÿ¥ ÿ¢ÿ∫ÿßÿ≤ ⁄©ÿ±€å⁄∫€î**
After this dont write indicators for users just Start the story in Urdu only after the word "Story:"
            """
        elif step == 2:
            template = f"""
**üí° ÿ®ŸÜ€åÿßÿØ€å ÿÆ€åÿßŸÑ:** {concept}

⁄©€ÅÿßŸÜ€å:
{story_text}

[üîπ ⁄©€ÅÿßŸÜ€å ⁄©Ÿà ÿ¢⁄Ø€í ÿ®⁄ë⁄æÿßÿ¶€å⁄∫:
- ⁄©ÿ±ÿØÿßÿ± ⁄©€í ÿßÿ®ÿ™ÿØÿßÿ¶€å ÿßÿ±ÿßÿØŸà⁄∫ ÿßŸàÿ± ⁄Ü⁄æŸæ€í ÿ±ÿßÿ≤Ÿà⁄∫ ⁄©Ÿà ŸÜŸÖÿß€åÿß⁄∫ ⁄©ÿ±€å⁄∫€î
- ⁄©€ÅÿßŸÜ€å ŸÖ€å⁄∫ ŸÖÿ≤€åÿØ ÿ≥ŸàÿßŸÑÿßÿ™ ÿßŸàÿ± ÿßÿ≥ÿ±ÿßÿ± Ÿæ€åÿØÿß ⁄©ÿ±€å⁄∫€î]

üìú **ÿ®ÿ±ÿß€Å ⁄©ÿ±ŸÖ ⁄©€ÅÿßŸÜ€å ⁄©Ÿà ÿß€å⁄© ŸÜ€åÿß ŸÖŸà⁄ë ÿØ€å⁄∫ ÿßŸàÿ± ⁄Ø€Åÿ±ÿßÿ¶€å Ÿæ€åÿØÿß ⁄©ÿ±€å⁄∫€î**
After this dont write indicators for users just Start the story in Urdu only after the word "Story:"
            """
        elif step == 3:
            template = f"""
**üí° ÿ®ŸÜ€åÿßÿØ€å ÿÆ€åÿßŸÑ:** {concept}

⁄©€ÅÿßŸÜ€å:
{story_text}

[üîπ ÿ™ŸÜÿßÿ§ ÿßŸàÿ± ⁄©ÿ¥ŸÖ⁄©ÿ¥ ⁄©Ÿà ÿ®⁄ë⁄æÿßÿ¶€å⁄∫:
- ⁄©ÿ±ÿØÿßÿ± ⁄©€å ÿØÿßÿÆŸÑ€å ⁄©ÿ¥ŸÖ⁄©ÿ¥ ÿßŸàÿ± ÿ∫€åÿ± ŸÖÿ™ŸàŸÇÿπ ŸÖŸà⁄ë ⁄©Ÿà ÿßÿ¨ÿß⁄Øÿ± ⁄©ÿ±€å⁄∫€î
- ⁄©€ÅÿßŸÜ€å ŸÖ€å⁄∫ Ÿæ€å⁄Ü€åÿØ⁄Ø€å ÿßŸàÿ± ÿØŸÑ⁄Üÿ≥Ÿæ€å Ÿæ€åÿØÿß ⁄©ÿ±€å⁄∫€î]

üìú **ÿ®ÿ±ÿß€Å ⁄©ÿ±ŸÖ ⁄©€ÅÿßŸÜ€å ⁄©Ÿà ÿß€å⁄© ÿß€åÿ≥€í ŸÖŸÇÿßŸÖ Ÿæÿ± ŸÑ€í ÿ¨ÿßÿ¶€å⁄∫ ÿ¨€Åÿß⁄∫ ŸÇÿßÿ±€å ÿ≠€åÿ±ÿßŸÜ ÿ±€Å ÿ¨ÿßÿ¶€í€î**
After this dont write indicators for users just Start the story in Urdu only after the word "Story:"
            """
        elif step == 4:
            template = f"""
**üí° ÿ®ŸÜ€åÿßÿØ€å ÿÆ€åÿßŸÑ:** {concept}

⁄©€ÅÿßŸÜ€å:
{story_text}

[üîπ ÿπÿ±Ÿàÿ¨ ⁄©€å ÿ∑ÿ±ŸÅ ÿ®⁄ë⁄æ€å⁄∫:
- ⁄©€ÅÿßŸÜ€å ŸÖ€å⁄∫ ÿ≥ŸÜÿ≥ŸÜ€å ÿÆ€åÿ≤€å ÿßŸàÿ± ŸÜÿ¶€í ÿßŸÜ⁄©ÿ¥ÿßŸÅÿßÿ™ ÿ¥ÿßŸÖŸÑ ⁄©ÿ±€å⁄∫€î
- ⁄©ÿ±ÿØÿßÿ±Ÿà⁄∫ ⁄©€å ÿ¨ÿØŸàÿ¨€ÅÿØ ÿßŸàÿ± ŸÖŸÇÿßÿ®ŸÑ€í ⁄©Ÿà Ÿàÿßÿ∂ÿ≠ ⁄©ÿ±€å⁄∫€î]

üìú **ÿ®ÿ±ÿß€Å ⁄©ÿ±ŸÖ ⁄©€ÅÿßŸÜ€å ⁄©Ÿà ÿß€å⁄© ÿπÿ±Ÿàÿ¨ Ÿæÿ± Ÿæ€ÅŸÜ⁄Üÿßÿ¶€å⁄∫ ÿ¨€Åÿß⁄∫ €Åÿ± ŸÑŸÖÿ≠€Å ŸÜ€åÿß ÿßŸÜ⁄©ÿ¥ÿßŸÅ €ÅŸà€î**
After this dont write indicators for users just Start the story in Urdu only after the word "Story:"
            """
        elif step == 5:
            template = f"""
**üí° ÿ®ŸÜ€åÿßÿØ€å ÿÆ€åÿßŸÑ:** {concept}

⁄©€ÅÿßŸÜ€å:
{story_text}

[üîπ ŸÖŸà⁄ë ÿßŸàÿ± ŸÜ€åÿß ÿ±ÿÆ:
- ⁄©€ÅÿßŸÜ€å ⁄©Ÿà ÿß€å⁄© ŸÜÿ¶€í ÿßŸàÿ± ÿ∫€åÿ± ŸÖÿ™ŸàŸÇÿπ ŸÖŸà⁄ë Ÿæÿ± ŸÑ€í ÿ¨ÿßÿ¶€å⁄∫€î
- ŸÖÿ≤ÿßÿ≠ŸÖÿ™ ÿßŸàÿ± ⁄Ü€åŸÑŸÜÿ¨ÿ≤ ⁄©Ÿà ÿßÿ¨ÿß⁄Øÿ± ⁄©ÿ±€å⁄∫€î]

üìú **ÿ®ÿ±ÿß€Å ⁄©ÿ±ŸÖ ⁄©€ÅÿßŸÜ€å ŸÖ€å⁄∫ ŸÜ€åÿß ÿ±ÿÆ ÿßŸàÿ± ŸÖÿ≤€åÿØ ⁄©ÿ¥ŸÖ⁄©ÿ¥ ÿ¥ÿßŸÖŸÑ ⁄©ÿ±€å⁄∫€î**
After this dont write indicators for users just Start the story in Urdu only after the word "Story:"
            """
        elif step == 6:
            template = f"""
**üí° ÿ®ŸÜ€åÿßÿØ€å ÿÆ€åÿßŸÑ:** {concept}

⁄©€ÅÿßŸÜ€å:
{story_text}

[üîπ ÿßÿÆÿ™ÿ™ÿßŸÖ€å ŸÖÿ±ÿßÿ≠ŸÑ ⁄©€å ÿ∑ÿ±ŸÅ ÿ®⁄ë⁄æ€å⁄∫:
- ⁄©€ÅÿßŸÜ€å ⁄©Ÿà ŸÖÿ≤€åÿØ ÿ™ŸÅÿµ€åŸÑ ÿ≥€í ÿ®€åÿßŸÜ ⁄©ÿ±€å⁄∫ ÿßŸàÿ± ⁄Ü€åŸÑŸÜÿ¨ÿ≤ ⁄©€å ÿ¥ÿØÿ™ ÿ®⁄ë⁄æÿßÿ¶€å⁄∫€î
- ⁄©ÿ±ÿØÿßÿ± ⁄©€å ÿ¨ÿØŸàÿ¨€ÅÿØ ⁄©Ÿà ⁄Ø€Åÿ±ÿßÿ¶€å ÿ≥€í Ÿæ€åÿ¥ ⁄©ÿ±€å⁄∫€î]

üìú **ÿ®ÿ±ÿß€Å ⁄©ÿ±ŸÖ ⁄©€ÅÿßŸÜ€å ⁄©Ÿà ÿßÿÆÿ™ÿ™ÿßŸÖ€å ŸÖÿ±ÿßÿ≠ŸÑ ⁄©€å ÿ∑ÿ±ŸÅ ŸÑ€í ÿ¨ÿßÿ¶€å⁄∫ÿå ŸÖ⁄Øÿ± ÿß€å⁄© ÿ¢ÿÆÿ±€å ÿ≠€åÿ±ÿßŸÜ ⁄©ŸÜ ŸÖŸà⁄ë ⁄Ü⁄æŸà⁄ë€å⁄∫€î**
After this dont write indicators for users just Start the story in Urdu only after the word "Story:"
            """
        elif step == 7:
            template = f"""
**üí° ÿ®ŸÜ€åÿßÿØ€å ÿÆ€åÿßŸÑ:** {concept}

⁄©€ÅÿßŸÜ€å:
{story_text}

[üîπ ŸÖ⁄©ŸÖŸÑ ÿßÿÆÿ™ÿ™ÿßŸÖ ⁄©€å ÿ™€åÿßÿ±€å:
- ⁄©€ÅÿßŸÜ€å ⁄©Ÿà ÿß€å⁄© ŸÖÿ±ÿ®Ÿàÿ∑ ÿßŸàÿ± ŸÖ⁄©ŸÖŸÑ ÿßŸÜÿ¨ÿßŸÖ ⁄©€å ÿ∑ÿ±ŸÅ ŸÑ€í ÿ¨ÿßÿ¶€å⁄∫€î
- ÿ™ŸÖÿßŸÖ ÿß€ÅŸÖ ŸÖŸà⁄ë ÿßŸàÿ± ⁄©ÿ¥ŸÖ⁄©ÿ¥ ⁄©Ÿà ÿ≠ŸÑ ⁄©ÿ±€å⁄∫€î]

üìú **ÿ®ÿ±ÿß€Å ⁄©ÿ±ŸÖ ⁄©€ÅÿßŸÜ€å ⁄©Ÿà ÿ¥ÿßŸÜÿØÿßÿ± ÿßŸÜÿ¨ÿßŸÖ ÿ™⁄© Ÿæ€ÅŸÜ⁄Üÿßÿ¶€å⁄∫€î**
After this dont write indicators for users just Start the story in Urdu only after the word "Story:"
            """
        elif step == 8:
            template = f"""
**üí° ÿ®ŸÜ€åÿßÿØ€å ÿÆ€åÿßŸÑ:** {concept}

⁄©€ÅÿßŸÜ€å:
{story_text}

[üîπ ÿ¨ÿ≤ÿ¶€åÿßÿ™ ⁄©€å ÿ™ÿ±ÿ™€åÿ® ÿßŸàÿ± ÿßÿÆÿ™ÿ™ÿßŸÖ:
- ⁄©€ÅÿßŸÜ€å ⁄©Ÿà ŸÖ⁄©ŸÖŸÑ ⁄©ÿ±€å⁄∫ ÿßŸàÿ± ÿ¢ÿÆÿ±€å ÿ™ÿßÿ´ÿ±ÿßÿ™ ⁄Ü⁄æŸà⁄ë€å⁄∫€î
- ÿ∫€åÿ± ÿ∂ÿ±Ÿàÿ±€å ÿ™ŸÅÿµ€åŸÑÿßÿ™ ⁄©Ÿà ÿ≠ÿ∞ŸÅ ⁄©ÿ±ÿ™€í €ÅŸàÿ¶€í ŸÖÿ±⁄©ÿ≤€å ⁄©€ÅÿßŸÜ€å Ÿæÿ± ÿ™Ÿàÿ¨€Å ÿØ€å⁄∫€î]

üìú **ÿ®ÿ±ÿß€Å ⁄©ÿ±ŸÖ ⁄©€ÅÿßŸÜ€å ⁄©Ÿà ÿß€å⁄© Ÿàÿßÿ∂ÿ≠ ÿßŸàÿ± ŸÖ⁄©ŸÖŸÑ ÿßŸÜÿ¨ÿßŸÖ ⁄©€í ÿ≥ÿßÿ™⁄æ ÿÆÿ™ŸÖ ⁄©ÿ±€å⁄∫€î**
After this dont write indicators for users just Start the story in Urdu only after the word "Story:"
            """
        else:
            # Step 9 and beyond: final polishing prompt. Using `else` (rather than
            # `step == 9`) guarantees the prompt is rebuilt from the current story
            # text when max_steps exceeds 9, instead of reusing a stale prompt.
            template = f"""
**üí° ÿ®ŸÜ€åÿßÿØ€å ÿÆ€åÿßŸÑ:** {concept}

⁄©€ÅÿßŸÜ€å:
{story_text}

[üîπ ÿ®ÿ±ÿß€Å ⁄©ÿ±ŸÖ ⁄©€ÅÿßŸÜ€å ⁄©Ÿà ÿß€å⁄© ŸÖ⁄©ŸÖŸÑÿå ŸÖÿ±ÿ®Ÿàÿ∑ ÿßŸàÿ± ÿØŸÑ⁄©ÿ¥ ÿßŸÜÿ¨ÿßŸÖ ÿ™⁄© Ÿæ€ÅŸÜ⁄Üÿßÿ¶€å⁄∫:
- ⁄©€ÅÿßŸÜ€å ⁄©€í ÿ™ŸÖÿßŸÖ ÿß€ÅŸÖ ŸÖŸà⁄ë ÿßŸàÿ± ⁄©ÿ¥ŸÖ⁄©ÿ¥ ⁄©Ÿà ÿ≠ŸÑ ⁄©ÿ±€å⁄∫€î
- ⁄©ÿ±ÿØÿßÿ± ⁄©€å ÿ∞€ÅŸÜ€å ÿßŸàÿ± ÿ¨ÿ∞ÿ®ÿßÿ™€å ÿ™ÿ±ŸÇ€å ⁄©Ÿà ÿßÿ¨ÿß⁄Øÿ± ⁄©ÿ±€å⁄∫€î
- ÿ±Ÿàÿ≤ŸÖÿ±€Å ⁄©€å ÿ™ŸÅÿµ€åŸÑÿßÿ™ ÿ≥€í €ÅŸπ ⁄©ÿ± ÿß€å⁄© ÿØŸÑ⁄Üÿ≥Ÿæ ÿ≥ŸÅÿ± ÿßŸàÿ± ÿ™ÿ®ÿØ€åŸÑ€å ⁄©Ÿà ŸÖÿ±⁄©ÿ≤€å ÿ≠€åÿ´€åÿ™ ÿØ€å⁄∫€î
- ⁄©€ÅÿßŸÜ€å ÿß€å⁄© Ÿàÿßÿ∂ÿ≠ÿå ÿßÿ´ÿ± ÿßŸÜ⁄Ø€åÿ≤ ÿßÿÆÿ™ÿ™ÿßŸÖ Ÿæÿ± ÿÆÿ™ŸÖ €ÅŸà€î]

üìú **ÿ®ÿ±ÿß€Å ⁄©ÿ±ŸÖ ⁄©€ÅÿßŸÜ€å ⁄©Ÿà ŸÖ⁄©ŸÖŸÑ ÿßŸàÿ± ŸÖÿ±ÿ®Ÿàÿ∑ ÿßÿÆÿ™ÿ™ÿßŸÖ ⁄©€í ÿ≥ÿßÿ™⁄æ ÿÆÿ™ŸÖ ⁄©ÿ±€å⁄∫€î**
ÿµÿ±ŸÅ ÿßÿ±ÿØŸà ŸÖ€å⁄∫ ⁄©€ÅÿßŸÜ€å ŸÑ⁄©⁄æ€å⁄∫ ÿßŸàÿ± ÿßÿ∂ÿßŸÅ€å €ÅÿØÿß€åÿßÿ™ ÿ¥ÿßŸÖŸÑ ŸÜ€Å ⁄©ÿ±€å⁄∫€î
After this dont write indicators for users just Start the story in Urdu only after the word "Story:"
            """

        # Tokenize the prompt and place the tensors on the configured device.
        inputs = tokenizer(template, return_tensors="pt").to(device)

        # Generation budget and temperature per step: later steps get more
        # tokens, and the final polishing step(s) sample at a lower temperature.
        max_tokens = 250
        temperature = 0.7
        if step >= 7:
            max_tokens = 400
        if step >= 9:
            max_tokens = 500
            temperature = 0.6

        # `early_stopping` is intentionally not passed: it is a beam-search
        # option and this call samples with a single beam.
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=0.9,
            top_k=50,
            repetition_penalty=1.1,
            eos_token_id=tokenizer.eos_token_id,
            return_dict_in_generate=True
        )

        # Decode the returned sequence and keep only what follows the last
        # "Story:" marker, trimmed to the last complete sentence.
        # NOTE(review): the decoded sequence presumably includes the prompt,
        # whose last line itself contains "Story:" — confirm the extraction
        # picks up the generated text as intended.
        new_text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
        extracted_text = extract_text_after_last_story(new_text)

        # Only advance the story when this step produced usable text; an empty
        # extraction must not wipe the context passed to the next step.
        if extracted_text:
            story_text = extracted_text
            complete_story += " " + extracted_text

    complete_story = remove_duplicate_sentences(complete_story)
    return complete_story

# FastAPI application instance; the route below is registered on it.
app = FastAPI()

# Request body schema for POST /generate_story/.
class StoryRequest(BaseModel):
    # Core idea the story should be built around (required).
    concept: str
    # Optional existing story text to continue from; defaults to an empty string.
    initial_story: str = ""
    # Number of generation steps to run; defaults to 9.
    max_steps: int = 9

@app.post("/generate_story/")
async def generate_story(request: StoryRequest):
    # Endpoint handler: run the step-wise generator with the request fields and
    # wrap the resulting text in a JSON object under the "story" key.
    try:
        generated = generate_story_outline(
            request.concept,
            request.initial_story,
            request.max_steps,
        )
    except Exception as e:
        # Print the failure for debugging, then surface it to the client as HTTP 500.
        print(f"An error occurred: {e}")
        raise HTTPException(status_code=500, detail=f"An error occurred during story generation: {e}")
    else:
        return {"story": generated}

# Code to run the FastAPI app with uvicorn and expose with ngrok
# This pattern is suitable for environments like Google Colab or Jupyter notebooks
if __name__ == "__main__":
    # Launch sequence for notebook environments such as Google Colab or Jupyter:
    # configure ngrok, open a tunnel, then serve the FastAPI app in a background thread.

    # The ngrok authtoken is taken from the NGROK_AUTH_TOKEN environment variable.
    ngrok_auth_token = os.environ.get("NGROK_AUTH_TOKEN")
    if not ngrok_auth_token:
        print("Warning: NGROK_AUTH_TOKEN environment variable not set. ngrok might not work.")
        print("You can set it like this: export NGROK_AUTH_TOKEN='YOUR_NGROK_AUTH_TOKEN' (Linux/macOS) or set NGROK_AUTH_TOKEN='YOUR_NGROK_AUTH_TOKEN' (Windows)")
    else:
        ngrok.set_auth_token(ngrok_auth_token)

    # Port shared by the ngrok tunnel and the uvicorn server.
    port = 8000

    # Open the tunnel; on any failure public_url is set to None.
    try:
        print(f"Starting ngrok tunnel for port {port}...")
        # bind_tls=False is kept for simplicity; pass bind_tls=True if HTTPS is required.
        tunnel = ngrok.connect(port, "http", bind_tls=False)
        public_url = tunnel.public_url
        print(f" FastAPI app is exposed at: {public_url}")
        print(f"Access the FastAPI docs at: {public_url}/docs")
    except Exception as e:
        print(f"Error starting ngrok tunnel: {e}")
        print("Please ensure ngrok is installed, authenticated, and not already running on this port.")
        public_url = None

    # The server is skipped only when a token was configured and the tunnel still
    # failed; without a token it is started anyway so it can be reached locally.
    if not public_url and ngrok_auth_token:
        print("ngrok failed to start. Uvicorn server will not be started automatically.")
    else:
        try:
            print("Starting Uvicorn server in a separate thread...")
            # reload=False avoids issues when running behind ngrok.
            server_options = {"app": app, "host": "0.0.0.0", "port": port, "reload": False}
            # daemon=True lets the thread end when the main program exits.
            uvicorn_thread = Thread(target=uvicorn.run, kwargs=server_options, daemon=True)
            uvicorn_thread.start()
            print("Uvicorn server thread started. The script can now continue.")
        except Exception as e:
            print(f"Error starting uvicorn thread: {e}")
            print("Please check if the port is already in use.")

    # In a notebook the kernel keeps the process (and the daemon thread) alive.
    # A standalone script would exit here and take the server with it; such a
    # script needs to block the main thread (for example with
    # input("Press Enter to stop the server...\n") or a sleep loop that handles
    # KeyboardInterrupt) and may call ngrok.disconnect(public_url) on exit.


# Instructions to run:
# 1. Save the code above as main.py
# 2. Set your Hugging Face token and ngrok auth token as environment variables:
#    export HF_TOKEN="YOUR_HF_TOKEN"
#    export NGROK_AUTH_TOKEN="YOUR_NGROK_AUTH_TOKEN" # Get your token from ngrok dashboard
#    (On Windows, use set HF_TOKEN="YOUR_HF_TOKEN" and set NGROK_AUTH_TOKEN="YOUR_NGROK_AUTH_TOKEN")
#    Replace 'YOUR_HF_TOKEN' and 'YOUR_NGROK_AUTH_TOKEN' with your actual tokens.
# 3. Install the required libraries:
#    pip install fastapi uvicorn python-multipart unsloth[cu{your_cuda_version}] transformers torch accelerate nest_asyncio pyngrok
#    Note: `threading` is part of the Python standard library, so it is not (and must not be) installed with pip.
#    Replace '{your_cuda_version}' with your CUDA version (e.g., cu118, cu121). The previous CUDA errors
#    strongly suggest an issue with your CUDA setup or the unsloth installation not matching your CUDA version.
#    Ensure your system has the necessary CUDA libraries installed and accessible in your environment's PATH.
# 4. Make sure you have ngrok installed (https://ngrok.com/download) and authenticated (ngrok authtoken <your_auth_token>).
# 5. Run the python script from your terminal in the directory where you saved main.py:
#    python main.py
# 6. The script will print the ngrok public URL if successful, and then the "Uvicorn server thread started..." message.
#    The script will not "hang" or block the console after starting the server thread.
#    Use the ngrok public URL to access your FastAPI application.
#    You can test the /generate_story/ endpoint with a POST request containing JSON data.
#    If ngrok fails, the uvicorn server is started locally only when NGROK_AUTH_TOKEN is not set; in that case you might be able to access it at http://127.0.0.1:8000.

You can set it like this: export NGROK_AUTH_TOKEN='YOUR_NGROK_AUTH_TOKEN' (Linux/macOS) or set NGROK_AUTH_TOKEN='YOUR_NGROK_AUTH_TOKEN' (Windows)
Starting ngrok tunnel for port 8000...
 FastAPI app is exposed at: http://3576-34-87-126-232.ngrok-free.app
Access the FastAPI docs at: http://3576-34-87-126-232.ngrok-free.app/docs
Starting Uvicorn server in a separate thread...
Uvicorn server thread started. The script can now continue.


In [None]:
# import requests
# import json

# url = "http://3cfe-34-125-125-99.ngrok-free.app/generate_story/"

# payload = {
#     "concept": "ÿß€åÿ≥€å ⁄©€ÅÿßŸÜ€å ŸÑ⁄©⁄æ€å⁄∫ ÿ¨ÿ≥ ŸÖ€å⁄∫ ÿß€å⁄© ÿ¥ÿÆÿµ ÿßŸÜÿ™€Åÿßÿ¶€å ÿ∫ÿ±€åÿ® €ÅŸàÿ™ÿß €Å€í€î Ÿà€Å ÿßŸæŸÜ€å ÿ≤ŸÜÿØ⁄Ø€å ÿßŸÖ€åÿ± ÿ®ŸÜŸÜ€í ⁄©€í ŸÑ€å€í ŸàŸÇŸÅ ⁄©ÿ± ÿØ€åÿ™ÿß €Å€íÿå ŸÑ€å⁄©ŸÜ ÿ¨ÿ® Ÿà€Å ÿØŸàŸÑÿ™ ŸÖŸÜÿØ ÿ®ŸÜ ÿ¨ÿßÿ™ÿß €Å€í ÿ™Ÿà ÿßÿ≥€í ÿßÿ≠ÿ≥ÿßÿ≥ €ÅŸàÿ™ÿß €Å€í ⁄©€Å ÿßÿ≥ ŸÜ€í ÿßŸæŸÜ€å ÿ≤ŸÜÿØ⁄Ø€å ŸÖ€å⁄∫ ⁄©€åÿß ⁄©⁄æŸà ÿØ€åÿß €Å€í€î",
#     "initial_story": "",
#     "max_steps": 9
# }

# headers = {
#     "Content-Type": "application/json"
# }

# try:
#     response = requests.post(url, data=json.dumps(payload), headers=headers)
#     response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx)
#     print("Response Status Code:", response.status_code)
#     print("Response Body:", response.json())
# except requests.exceptions.RequestException as e:
#     print(f"An error occurred during the request: {e}")