In [1]:
#!/usr/bin/env python3
"""
Load GGUF from Hugging Face and run NIRF Q&A (CLI).
Repo: coderop12/gemma2b-nirf-lookup-gguf
File: gemma2b-nirf-lookup-f16.gguf
"""

import os
from typing import Optional
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Hugging Face coordinates of the GGUF checkpoint used by this script.
HF_REPO_ID = "coderop12/gemma2b-nirf-lookup-gguf"
HF_FILENAME = "gemma2b-nirf-lookup-f16.gguf"
# Access token for private repos, read from the HF_TOKEN env var; None when unset.
HF_TOKEN: Optional[str] = os.getenv("HF_TOKEN")

# llama.cpp runtime settings; thread count and GPU layers can be overridden via env vars.
N_CTX = 2048
N_THREADS = int(os.getenv("N_THREADS", "4"))
N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", "0"))  # raise above 0 when GPU offload is available
VERBOSE = False

def download_model(repo_id: Optional[str] = None,
                   filename: Optional[str] = None,
                   token: Optional[str] = None) -> str:
    """Download a GGUF file from Hugging Face and return its local path.

    Args:
        repo_id: Hub repository to download from; defaults to ``HF_REPO_ID``.
        filename: File inside the repository; defaults to ``HF_FILENAME``.
        token: Access token for private repos; defaults to ``HF_TOKEN``
            (which is ``None`` when the env var is unset).

    Returns:
        The path returned by ``hf_hub_download`` for the requested file.
    """
    repo_id = repo_id or HF_REPO_ID
    filename = filename or HF_FILENAME
    if token is None:
        token = HF_TOKEN

    print(f"⬇️  Downloading from HF: {repo_id}/{filename}")
    # `resume_download` is intentionally not passed: huggingface_hub documents it
    # as deprecated/ignored (downloads resume automatically) and passing it
    # triggers a FutureWarning. `local_dir` and `force_download` are left at their
    # defaults, i.e. the regular HF cache is used and cached files are reused.
    local_path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        token=token,
    )
    print(f"✅ Model ready at: {local_path}")
    return local_path

class NIRFRankingModel:
    """Thin wrapper around a llama.cpp ``Llama`` model for NIRF Q&A.

    The GGUF file at ``model_path`` is loaded eagerly by the constructor using
    the module-level runtime settings (``N_CTX``, ``N_THREADS``,
    ``N_GPU_LAYERS``, ``VERBOSE``).
    """

    # Default stop sequences; a tuple so the shared default cannot be mutated.
    DEFAULT_STOP = ("</s>", "\n\n")

    def __init__(self, model_path: str):
        """Remember ``model_path`` and load the model immediately."""
        self.model_path = model_path
        self.llm = None  # populated by load_model()
        self.load_model()

    def load_model(self):
        """Create the ``Llama`` instance for ``self.model_path`` and store it on ``self.llm``."""
        print(f"🔧 Loading GGUF: {self.model_path}")
        self.llm = Llama(
            model_path=self.model_path,
            n_ctx=N_CTX,
            n_threads=N_THREADS,
            n_gpu_layers=N_GPU_LAYERS,
            verbose=VERBOSE
        )
        print("✅ Model loaded")

    def generate(self, prompt: str, max_tokens: int = 512, temperature: float = 0.3,
                 top_p: float = 0.9, repeat_penalty: float = 1.1,
                 stop: Optional[list] = None) -> str:
        """Run a completion for ``prompt`` and return the stripped text.

        Args:
            prompt: Text passed to the model as-is.
            max_tokens: Forwarded to the model call as ``max_tokens``.
            temperature: Forwarded to the model call as ``temperature``.
            top_p: Forwarded to the model call as ``top_p``.
            repeat_penalty: Forwarded to the model call as ``repeat_penalty``.
            stop: Stop sequences; ``None`` selects ``DEFAULT_STOP``.

        Returns:
            The ``text`` of the first entry in ``choices``, stripped of
            surrounding whitespace (empty string if the text is empty/None).

        Raises:
            RuntimeError: If no model is loaded (``self.llm`` is ``None``).
        """
        if self.llm is None:
            # Fail with a clear message instead of "'NoneType' object is not callable".
            raise RuntimeError("Model is not loaded; call load_model() first.")

        stop_sequences = list(self.DEFAULT_STOP) if stop is None else list(stop)
        out = self.llm(
            prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            repeat_penalty=repeat_penalty,
            stop=stop_sequences
        )
        return (out["choices"][0]["text"] or "").strip()

    def ask(self, question: str) -> str:
        """Answer ``question`` using the ``Question: ...\\n\\nAnswer:`` prompt format.

        A blank (empty or whitespace-only) question returns an empty string
        without invoking the model.
        """
        if not question or not question.strip():
            return ""
        prompt = f"Question: {question}\n\nAnswer:"
        return self.generate(prompt)

def main():
    """Download the model, answer the sample questions, then run a Q&A loop.

    The interactive loop ends when the user types ``quit``, ``exit`` or ``q``
    (case-insensitive), presses Ctrl-C, or when stdin reaches end-of-file.
    Blank input is ignored rather than being sent to the model.
    """
    model_path = download_model()
    nirf = NIRFRankingModel(model_path)

    print("=== NIRF GGUF Inference (from Hugging Face) ===")
    samples = [
        "What is the NIRF ranking of IIT Delhi in 2024?",
        "Which are the top 5 universities in NIRF Overall ranking 2024?",
    ]
    for i, q in enumerate(samples, 1):
        print(f"\n--- Test {i} ---")
        print("Q:", q)
        a = nirf.ask(q)
        print("A:", a)

    print("\n=== Interactive Mode ===")
    print("Type 'quit' to exit.")
    exit_words = {"quit", "exit", "q"}
    while True:
        try:
            q = input("\nYour question: ").strip()
            if not q:
                # Nothing typed: prompt again instead of running inference on "".
                continue
            if q.lower() in exit_words:
                break
            print("A:", nirf.ask(q))
        except (KeyboardInterrupt, EOFError):
            # EOFError: input() hit end-of-file (Ctrl-D or closed/non-interactive stdin).
            break

# Run the demo only when this code executes as the main module, not when imported.
if __name__ == "__main__":
    main()


⬇️  Downloading from HF: coderop12/gemma2b-nirf-lookup-gguf/gemma2b-nirf-lookup-f16.gguf
✅ Model ready at: /teamspace/studios/this_studio/.cache/huggingface/hub/models--coderop12--gemma2b-nirf-lookup-gguf/snapshots/113a395c55a965971c1f6a5ddb890245ee495f3b/gemma2b-nirf-lookup-f16.gguf
🔧 Loading GGUF: /teamspace/studios/this_studio/.cache/huggingface/hub/models--coderop12--gemma2b-nirf-lookup-gguf/snapshots/113a395c55a965971c1f6a5ddb890245ee495f3b/gemma2b-nirf-lookup-f16.gguf




: 

In [1]:
# Report system RAM and swap usage in human-readable units.
!free -h


               total        used        free      shared  buff/cache   available
Mem:            29Gi       3.8Gi       461Mi       2.1Mi        25Gi        25Gi
Swap:           15Gi       256Ki        15Gi
