In [1]:
!pip install fastapi uvicorn pyngrok pydub faster-whisper openai
!apt-get install ffmpeg
!pip install python-multipart

Collecting fastapi
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.8-py3-none-any.whl.metadata (10 kB)
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting faster-whisper
  Downloading faster_whisper-1.1.1-py3-none-any.whl.metadata (16 kB)
Collecting starlette<0.47.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.46.2-py3-none-any.whl.metadata (6.2 kB)
Collecting ctranslate2<5,>=4.0 (from faster-whisper)
  Downloading ctranslate2-4.6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting onnxruntime<2,>=1.14 (from faster-whisper)
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting av>=11 (from faster-whisper)
  Downloading av-14.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadat

In [2]:
%%writefile inference.py
import logging
import os
import re
import tempfile
from pathlib import Path

from fastapi import FastAPI, HTTPException, UploadFile, File, Form
from fastapi.responses import JSONResponse
import numpy as np
import torch
from pydub import AudioSegment
from faster_whisper import WhisperModel
from openai import OpenAI

# ============ Logging ============
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ============ API Key & Directories ============
# The key is read from the OPENAI_API_KEY environment variable so that it never
# has to be written into this file. When the variable is unset the value is ""
# (the previous default), so importing the module still succeeds.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
if not OPENAI_API_KEY:
    logger.warning("OPENAI_API_KEY is not set; rephrasing requests will fail.")
client = OpenAI(api_key=OPENAI_API_KEY)

# One `<emp_id>.npy` file per employee voice profile is stored in this directory.
VOICE_DIR = Path("/tmp/voice_registry")
VOICE_DIR.mkdir(parents=True, exist_ok=True)
# Fixed scratch location for an uploaded recording.
TMP_WAV = Path("/tmp/temp.wav")

# ============ Model Initialization ============
# Use the GPU when torch can see one; otherwise run on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = WhisperModel("large-v3", device=device, compute_type="float32")

# Passed to Whisper as `initial_prompt` for both transcription and translation.
EGYPTIAN_PROMPT = (
    "This is Egyptian Arabic speech. Use dialect words like 'aywa', 'la2', 'mesh', "
    "'enta fein', '3ayez', 'hat', 'mashy'. Transcribe naturally as Egyptians speak."
)

# ===== Helper Functions =====
def extract_features(path: Path, target: int = 480_000) -> np.ndarray:
    """Load an audio file as a fixed-length, peak-normalised mono waveform.

    The audio is down-mixed to one channel and resampled to 16 kHz, scaled so
    that its largest absolute sample is 1.0, then zero-padded or truncated to
    exactly ``target`` samples (the default 480_000 is 30 s at 16 kHz).

    Args:
        path: Audio file to decode.
        target: Number of samples in the returned array.

    Returns:
        A 1-D ``float32`` array of length ``target``. Silent or empty audio
        yields all zeros.
    """
    audio = AudioSegment.from_file(path).set_channels(1).set_frame_rate(16000)
    samples = np.array(audio.get_array_of_samples(), dtype=np.float32)

    # Guard the normalisation: an empty clip has no maximum, and an all-zero
    # clip would otherwise divide by zero and fill the array with NaN.
    peak = np.max(np.abs(samples)) if samples.size else 0.0
    if peak > 0:
        samples /= peak

    return np.pad(samples, (0, max(0, target - len(samples))))[:target]

def save_profile(emp_id: str, wav_path: Path):
    """Store the waveform features of ``wav_path`` as the profile of ``emp_id``.

    The features are written to ``VOICE_DIR/<emp_id>.npy``; an existing file
    with that name is replaced.
    """
    destination = VOICE_DIR / f"{emp_id}.npy"
    features = extract_features(wav_path)
    np.save(destination, features)

def load_profile(emp_id: str) -> np.ndarray:
    """Return the stored profile array of ``emp_id``.

    Reads ``VOICE_DIR/<emp_id>.npy``. When no such file exists the function
    returns ``None`` instead of an array.
    """
    profile_path = VOICE_DIR / f"{emp_id}.npy"
    if not profile_path.exists():
        return None
    return np.load(profile_path)

def gpt_rephrase_full(text: str) -> str:
    """Rephrase a conversation transcript into polished real-estate business English.

    Sends ``text`` to the chat-completions API (model ``gpt-3.5-turbo``) with a
    system prompt asking for a line-by-line, more polite and domain-specific
    rewrite.

    Args:
        text: The transcript to rephrase.

    Returns:
        The model's reply with surrounding whitespace removed. An empty string
        is returned when ``text`` is empty or whitespace-only (no request is
        sent) or when the API returns no message content.
    """
    # Nothing to rephrase: skip the API round-trip instead of sending an empty
    # user message.
    if not text or not text.strip():
        return ""

    system_prompt = (
        "You are a professional real estate sales assistant.\n"
        "Rephrase each line individually to be more polite, fluent, and domain-specific for real estate sales.\n"
        "Keep 'Client:' and 'Broker:' labels exactly as they are.\n"
        "Rewrite in clear business English.\n"
    )
    resp = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user",   "content": text}
        ],
        temperature=0.4,
        max_tokens=2000
    )
    # `content` can be None; treat that as an empty reply rather than crashing
    # on `.strip()`.
    content = resp.choices[0].message.content
    return (content or "").strip()

def transcribe_logic(wav_path: Path, emp_id: str) -> dict:
    """Transcribe, speaker-label, translate and rephrase one recording.

    Steps:
      1. Store the recording's features as the profile of ``emp_id`` and read
         that profile back.
      2. Transcribe the recording in Arabic with Whisper.
      3. Label every segment ``emp_id`` or ``"Client"`` using the cosine
         similarity between the segment's samples and the profile.
      4. Run Whisper again with ``task="translate"`` and pass the joined
         translation to ``gpt_rephrase_full``.

    Args:
        wav_path: Recording to process.
        emp_id: Identifier used for the profile file and as the speaker label.

    Returns:
        A dict with keys ``status``, ``language``, ``transcription``,
        ``translation`` and ``final_rephrased_text``.

    Raises:
        ValueError: If no profile can be loaded for ``emp_id``.
    """
    # NOTE(review): the profile is rewritten from this very recording on every
    # call, so the None check below looks unreachable in practice - confirm
    # whether enrolment was meant to happen separately.
    save_profile(emp_id, wav_path)
    profile = load_profile(emp_id)
    if profile is None:
        raise ValueError(f"No voice profile for emp_id={emp_id}")

    # ASR
    asr_options = {
        "beam_size": 2,
        "vad_filter": True,
        "vad_parameters": {"min_silence_duration_ms": 500},
        "temperature": 0.0,
        "condition_on_previous_text": False,
        "compression_ratio_threshold": 2.4,
        "language": "ar",
        "best_of": 1,
        "word_timestamps": False,
        "initial_prompt": EGYPTIAN_PROMPT,
    }
    segments, info = model.transcribe(str(wav_path), **asr_options)

    # NOTE(review): with the default target this waveform holds at most
    # 480_000 samples, so segments beyond that offset slice to an empty window
    # - confirm this is intended.
    waveform = extract_features(wav_path)
    profile_len = len(profile)
    profile_norm = np.linalg.norm(profile)

    entries = []
    for index, segment in enumerate(segments):
        speaker = "Client"
        # Only even-numbered segments are compared against the profile;
        # odd-numbered ones are always labelled "Client".
        if index % 2 == 0:
            first = int(segment.start * 16000)
            last = int(segment.end * 16000)
            window = waveform[first:last]
            # Zero-pad the window to the profile length so the dot product is defined.
            window = np.pad(window, (0, profile_len - len(window)))[:profile_len]
            # The 1e-6 term keeps the denominator non-zero for an all-zero window.
            similarity = np.dot(window, profile) / (
                np.linalg.norm(window) * profile_norm + 1e-6
            )
            if similarity > 0.5:
                speaker = emp_id
        entries.append({
            "start": segment.start,
            "end": segment.end,
            "speaker": speaker,
            "text": segment.text
        })

    transcript_lines = [
        f"[{e['start']:.2f}-{e['end']:.2f}] {e['speaker']}: {e['text']}"
        for e in entries
    ]
    full_text = "\n".join(transcript_lines)

    # Translation
    translated_segments, _ = model.transcribe(
        str(wav_path), task="translate", language="ar",
        beam_size=2, temperature=0.0, best_of=1,
        vad_filter=True, initial_prompt=EGYPTIAN_PROMPT
    )
    translation = "\n".join([piece.text for piece in translated_segments])
    # Remove any "[start-end] " timestamp prefixes from the translated text.
    translation = re.sub(r"\[\d+\.\d+-\d+\.\d+\]\s+", "", translation)
    rephrased = gpt_rephrase_full(translation)

    return {
        "status": "success",
        "language": info.language,
        "transcription": full_text,
        "translation": translation,
        "final_rephrased_text": rephrased
    }

# ============ FastAPI App ============
app = FastAPI()


@app.get("/ping")
def ping():
    """Health-check endpoint: always replies with ``{"status": "ok"}``."""
    payload = {"status": "ok"}
    return JSONResponse(payload)

@app.post("/invocations")
async def invoke(
    file: UploadFile = File(...),
    emp_id: str = Form("Broker")
):
    """Run the transcription pipeline on an uploaded recording.

    Args:
        file: The uploaded audio file.
        emp_id: Employee identifier; it names the stored voice profile and is
            used as the speaker label in the transcript.

    Returns:
        A JSON response carrying the dict produced by ``transcribe_logic``.

    Raises:
        HTTPException: 400 for an invalid ``emp_id``, an empty upload, or any
            ``ValueError`` from the pipeline; 500 for every other failure.
    """
    # `emp_id` is client-supplied and ends up in a file name under VOICE_DIR,
    # so refuse anything that could step outside that directory.
    if not emp_id or any(ch in emp_id for ch in ("/", "\\", "\x00")):
        raise HTTPException(status_code=400, detail="Invalid emp_id")

    upload_path = None
    try:
        content = await file.read()
        if not content:
            raise ValueError("Uploaded file is empty")

        # A private temp file per request: requests cannot overwrite each
        # other's audio, and the recording is removed again in `finally`.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            tmp.write(content)
            upload_path = Path(tmp.name)

        result = transcribe_logic(upload_path, emp_id)
        return JSONResponse(result)
    except ValueError as ve:
        raise HTTPException(status_code=400, detail=str(ve)) from ve
    except Exception as e:
        # logger.exception records the traceback as well as the message.
        logger.exception("Invocation error: %s", e)
        raise HTTPException(status_code=500, detail="Internal error") from e
    finally:
        if upload_path is not None:
            upload_path.unlink(missing_ok=True)


Writing inference.py


In [3]:
!ngrok config add-authtoken 2wECNofpyItvbpfFXHvTDvrrAHh_6V4BeKEqH5aNZvQcn6zys

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [4]:
# in a new Colab cell
# Stop leftover servers before starting a new one: `pkill -f` matches the
# pattern against each process's full command line.
# NOTE(review): the startup cell in this notebook runs Uvicorn on a thread
# inside the kernel process, so this presumably only affects a Uvicorn that was
# launched as a separate process - confirm.
!pkill -f uvicorn


In [5]:
# Colab cell continued: ngrok + Uvicorn startup

import os
import threading
from getpass import getpass

import nest_asyncio
import uvicorn
from pyngrok import ngrok

from inference import app  # your FastAPI app

# 1) Kill any existing tunnels
ngrok.kill()

# 2) (Re-)authenticate
# The authtoken is a credential: read it from the environment (or prompt for it)
# rather than writing it into the notebook.
ngrok_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_token)

# 3) Allow nested loops in Colab
nest_asyncio.apply()

# 4) Open an HTTP-only tunnel on port 8000
# NOTE(review): with TLS disabled, uploads travel unencrypted - confirm this is intended.
tunnel = ngrok.connect(
    8000,
    proto="http",            # make it an HTTP tunnel
    bind_tls=False           # disable the TLS (HTTPS) endpoint
)
# Keep the URL in its own variable so later cells can build request URLs from it.
public_url = tunnel.public_url
print("→ HTTP tunnel URL:", public_url)

# 5) Start Uvicorn on a daemon thread so this cell returns immediately
def run_server():
    """Serve the FastAPI app on port 8000 (blocks the calling thread)."""
    uvicorn.run(app, host="0.0.0.0", port=8000)

threading.Thread(target=run_server, daemon=True).start()



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/2.39k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

vocabulary.json:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

model.bin:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

→ HTTP tunnel URL: http://cfc5-34-16-136-168.ngrok-free.app


In [None]:
import requests

# Health check through the tunnel. The URL is read from the `tunnel` object
# created by the startup cell; the bare name `public_url` is never assigned in
# this notebook and raised NameError on a fresh kernel.
resp = requests.get(f"{tunnel.public_url}/ping", timeout=30)
print(resp.status_code, resp.json())


INFO:     34.125.179.44:0 - "GET /ping HTTP/1.1" 200 OK
200 {'status': 'ok'}


In [None]:
# Upload a recording to the service. The URL is read from the `tunnel` object
# created by the startup cell (the bare name `public_url` is never assigned in
# this notebook). No timeout is set because transcription time depends on the
# length of the recording.
with open("/content/call_2_trimmed.wav", "rb") as f:
    resp = requests.post(
        f"{tunnel.public_url}/invocations",
        files={"file": ("call.wav", f, "audio/wav")},
        data={"emp_id": "Broker"}
    )
print(resp.status_code, resp.json())


INFO:     34.125.179.44:0 - "POST /invocations HTTP/1.1" 200 OK
200 {'status': 'success', 'language': 'ar', 'transcription': '[3.82-4.82] Client:  السلام عليكم.\n[4.82-6.11] Client:  السلام عليكم.\n[6.11-9.90] Client:  مين معايا؟\n[9.90-12.90] Client:  مع حضرتك كريم من شركة Property Finder.\n[12.90-15.06] Client:  زكا كريم عامل ايه؟\n[15.06-23.06] Client:  الحمد لله. حضرتك بس كنت مليت فورم في الحملة التسويقية بتاعتنا بخصوص وحدة في العاصمة الإدارية الجديدة.\n[23.06-27.06] Client:  فحبيت أتابع مع حضرتك لو مفيش مانع.\n[27.06-30.19] Client:  تمام. قول. قول اللي عندك.\n[30.19-33.48] Client:  أولا شكرا جدا على اهتمام حضرتك\n[33.48-36.08] Client:  أنا بس حابب أبدأ بخطوة بسيطة\n[36.08-38.76] Client:  عشان أشرح لك المشاريع\n[38.76-41.54] Client:  ونحاول نوصل لأنسب مشروع في يوم\n[41.54-43.24] Client:  بالنسبة للمنطقة\n[43.24-49.32] Client:  حضرتك محدد حي معين أو كومباونت معين في العاصمة؟\n[49.32-52.24] Client:  والله أنا كنت بدور في الـ R8 أو الـ R7\n[52.24-54.52] Client:  بس ما عنديش منع أشوف ا