In [None]:
!pip install torch transformers fastapi uvicorn accelerate sentencepiece bitsandbytes pyngrok



In [None]:
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch


In [None]:
# Hugging Face Hub id of the checkpoint served by this notebook.
MODEL_NAME = "tiiuae/falcon-7b-instruct"

# Options for 8-bit loading through bitsandbytes, collected in one place.
quantization_kwargs = {
    # Load the weights in 8-bit precision.
    "load_in_8bit": True,
    # Outlier threshold used by the int8 layers.
    "llm_int8_threshold": 6.0,
    # Permit parts of the model to be offloaded to the CPU.
    "llm_int8_enable_fp32_cpu_offload": True,
}
bnb_config = BitsAndBytesConfig(**quantization_kwargs)

In [None]:
# Tokenizer that matches the MODEL_NAME checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Causal LM loaded with the quantization settings from bnb_config;
# device_map="auto" leaves the placement of the weights to the library.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, device_map="auto", quantization_config=bnb_config
)

# Request body of the POST /generate-insight/ endpoint.
# Each field is interpolated verbatim into the prompt sent to the language model.
class FinancialQuery(BaseModel):
    transaction_history: str  # Raw transaction data, passed through as free text
    credit_score: int         # User's credit score (no range check is applied here)
    debt_to_income: float     # Debt-to-income ratio (no range check is applied here)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# FastAPI application object: the route decorators in this notebook register their
# handlers on it, and it is the app instance handed to uvicorn when the server starts.
app = FastAPI()

# Liveness route: a GET on "/" answers with a fixed status message.
# NOTE(review): the message names LLaMA-2 while MODEL_NAME points at a Falcon
# checkpoint — confirm which branding is intended.
@app.get("/")
def home():
    status = {"message": "LLaMA-2 Financial Insights API is running!"}
    return status

@app.post("/generate-insight/")
def generate_financial_insight(data: FinancialQuery):
    """Generate a financial health summary, risk assessment and recommendations.

    The submitted transaction history, credit score and debt-to-income ratio are
    embedded in a prompt for the language model. The text generated by the model
    (without the prompt) is returned under the ``financial_insight`` key.
    """

    # Construct prompt
    prompt = f"""
    Based on the following financial data:
    - Transactions: {data.transaction_history}
    - Credit Score: {data.credit_score}
    - Debt-to-Income Ratio: {data.debt_to_income}

    Provide a financial health summary, risk assessment (High/Medium/Low), and personalized recommendations.
    """

    # Tokenize and move the tensors to the device the model's inputs live on.
    # The model is loaded with device_map="auto", so "cuda" must not be hard-coded.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    # Generate response. `temperature` only takes effect when sampling is enabled;
    # without do_sample=True generation is greedy and the setting is ignored.
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7,
        )

    # The returned sequence starts with the prompt tokens; decode only what the
    # model added so the caller gets the insight rather than an echo of the prompt.
    completion_ids = output[0][prompt_length:]
    response_text = tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

    return {"financial_insight": response_text}


In [None]:
# Read the ngrok auth token from the Colab secret named 'ngrok_authtoken',
# so the credential itself is never written into the notebook.
from google.colab import userdata
token = userdata.get('ngrok_authtoken')

In [None]:
# Register the auth token with the ngrok CLI; IPython substitutes the Python
# variable `token` for `$token` before the shell command runs.
!ngrok config add-authtoken $token

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
from pyngrok import ngrok
import uvicorn
import threading

# Single source of truth for the port shared by the tunnel and the server.
API_PORT = 8000

# Start ngrok tunnel
public_url = ngrok.connect(API_PORT).public_url
print(f"🌍 Public API URL: {public_url}")

# Run FastAPI in a separate thread
def run_api():
    """Serve `app` with uvicorn on all interfaces at API_PORT (blocks while the server runs)."""
    uvicorn.run(app, host="0.0.0.0", port=API_PORT)

# daemon=True: the server thread must not keep the kernel process alive when the
# kernel is shut down or restarted.
thread = threading.Thread(target=run_api, daemon=True)
thread.start()


🌍 Public API URL: https://d230-35-198-250-77.ngrok-free.app


INFO:     Started server process [11299]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
