<h1 style="font-size:70px; color:red; font-weight:700;">Using GPT 2</h1>

In [2]:
# hybrid_qa_app.py

import json
import numpy as np
import os
import re
import torch
import faiss
import nltk
import gradio as gr
import pytrec_eval
from bs4 import BeautifulSoup
from rank_bm25 import BM25Okapi
from transformers import AutoTokenizer, AutoModel, GPT2Tokenizer, GPT2LMHeadModel
from tqdm import tqdm
import pandas as pd

# 1) Force CPU
DEVICE = torch.device("cpu")

# 2) Ensure NLTK tokenizer is available
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download("punkt")

# 3) Paths
DATA_DIR = "stacklite_dataset"
DATA_FILE = os.path.join(DATA_DIR, "top_datascience_questions.json")
QUERY_FILE = os.path.join(DATA_DIR, "queries.json")
QRELS_FILE = os.path.join(DATA_DIR, "qrels.json")

# 4) Load and clean data
with open(DATA_FILE, 'r', encoding='utf-8') as f:
    data = json.load(f)
df = pd.DataFrame(data)

def clean_text(html):
    return BeautifulSoup(html, "html.parser").get_text(separator=" ")

df['text'] = df.apply(lambda r: r['title'] + " " + clean_text(r['body']), axis=1)

# 5) BM25 setup
def safe_tokenize(text):
    try:
        return nltk.word_tokenize(text.lower())
    except LookupError:
        return re.findall(r'\b\w+\b', text.lower())

tokenized_corpus = [safe_tokenize(doc) for doc in df['text']]
bm25 = BM25Okapi(tokenized_corpus)

# 6) Dense Retrieval (MiniLM + FAISS)
EMBED_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
embed_tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_NAME)
embed_model = AutoModel.from_pretrained(EMBED_MODEL_NAME).to(DEVICE)

def get_embeddings(texts, batch_size=32):
    all_embs = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Embedding"):
        batch = texts[i:i+batch_size]
        inputs = embed_tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=512).to(DEVICE)
        with torch.no_grad():
            out = embed_model(**inputs)
        last_hidden = out.last_hidden_state
        mask = inputs.attention_mask.unsqueeze(-1).expand(last_hidden.size()).float()
        summed = torch.sum(last_hidden * mask, 1)
        counts = torch.clamp(mask.sum(1), min=1e-9)
        avg = (summed / counts).cpu().numpy()
        all_embs.append(avg)
    return np.vstack(all_embs)

dense_embeddings = get_embeddings(df['text'].tolist())
dim = dense_embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(dense_embeddings.astype(np.float32))

# 7) RRF Fusion
def rrf(bm25_ids, dense_ids, k=60):
    from collections import defaultdict
    scores = defaultdict(float)
    for rank, idx in enumerate(bm25_ids):
        scores[idx] += 1 / (k + rank + 1)
    for rank, idx in enumerate(dense_ids):
        scores[idx] += 1 / (k + rank + 1)
    return sorted(scores, key=scores.get, reverse=True)

# 8) Generator (GPT-2 with its own tokenizer)
GEN_MODEL = "gpt2"
gen_tokenizer = GPT2Tokenizer.from_pretrained(GEN_MODEL)
gen_tokenizer.pad_token = gen_tokenizer.eos_token
gen_model = GPT2LMHeadModel.from_pretrained(GEN_MODEL).to(DEVICE)

# 9) Retrieval functions
def retrieve_bM25(question, k=10):
    tok = safe_tokenize(question)
    sims = bm25.get_scores(tok)
    idxs = np.argsort(sims)[::-1][:k]
    return [str(df.iloc[i]['question_id']) for i in idxs if i < len(df)]

def retrieve_dense(question, k=10):
    emb = get_embeddings([question])
    _, idxs = index.search(emb.astype(np.float32), k)
    return [str(df.iloc[i]['question_id']) for i in idxs[0] if i < len(df)]

# 10) Answer generation with safe indexing and length constraints
def hybrid_answer(query):
    k_bm25 = 30
    k_dense = 30
    top_k = 10

    bm_scores = bm25.get_scores(safe_tokenize(query))
    bm_idx = np.argsort(bm_scores)[::-1][:k_bm25]
    bm_idx = [idx for idx in bm_idx if idx < len(df)]

    dense_emb = get_embeddings([query])
    _, dense_idx = index.search(dense_emb.astype(np.float32), k_dense)
    dense_idx = [idx for idx in dense_idx[0] if idx < len(df)]

    fused = rrf(bm_idx, dense_idx)[:top_k]

    docs = []
    for rank, idx in enumerate(fused):
        try:
            words = df.iloc[idx]['text'].split()[:100]
            docs.append(f"[Doc {rank+1}]: " + " ".join(words))
        except IndexError:
            continue  # Skip if idx is out of bounds

    context = "\n\n".join(docs)
    prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"

    # Tokenize with length constraints
    inputs = gen_tokenizer(
        prompt, 
        return_tensors="pt", 
        truncation=True, 
        max_length=768  # Reserve space for generation
    )
    input_ids = inputs.input_ids.to(DEVICE)
    attention_mask = inputs.attention_mask.to(DEVICE)

    # Generate answer with safe length limits
    try:
        output_sequences = gen_model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=150,
            max_length=900,  # Total length < 1024
            pad_token_id=gen_tokenizer.eos_token_id,
            temperature=0.7,
            num_return_sequences=1
        )
        generated_sequence = output_sequences[0, input_ids.shape[1]:].tolist()
        answer = gen_tokenizer.decode(generated_sequence, skip_special_tokens=True).strip()
    except Exception as e:
        answer = f"Error in generation: {str(e)}"

    cits = []
    for i, idx in enumerate(fused[:5]):
        try:
            row = df.iloc[idx]
            cits.append(f"{i+1}. {row['title']} (ID: {row['question_id']})")
        except IndexError:
            continue

    return answer, "\n".join(cits)

# 11) Gradio interface
example_questions = [
    "How to handle missing values in a dataset?",
    "What's the difference between random forest and gradient boosting?",
    "How to implement a neural network in PyTorch?",
    "When should I use L1 vs L2 regularization?",
    "How does PCA reduce dimensionality?",
    "Explain cross-validation and why it's important.",
    "How to optimize hyperparameters in scikit-learn?",
    "What is the bias-variance tradeoff?",
    "How to deploy a model with Flask?",
    "What are attention mechanisms in transformers?"
]

iface = gr.Interface(
    fn=hybrid_answer,
    inputs=gr.Textbox(label="Technical Question", placeholder="Ask anything…"),
    outputs=[gr.Textbox(label="Answer"), gr.Textbox(label="Citations")],
    title="📚 Hybrid QA Assistant (StackLite)",
    description="BM25 + MiniLM + RRF fusion + GPT-2 on CPU",
    examples=example_questions
)

# 12) Main
if __name__ == "__main__":
    with open(QUERY_FILE) as f:
        queries = json.load(f)
    with open(QRELS_FILE) as f:
        raw_qrels = json.load(f)
    qrels = {str(qid): {str(doc): 1 for doc in docs} for qid, docs in raw_qrels.items()}

    def build_run(fn):
        return {str(q['id']): {doc: 1.0 for doc in fn(q['question'])} for q in queries}

    run_bm25 = build_run(retrieve_bM25)
    run_dense = build_run(retrieve_dense)

    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'recip_rank', 'ndcg'})
    metrics_bm25 = evaluator.evaluate(run_bm25)
    metrics_dense = evaluator.evaluate(run_dense)

    def summarize(metrics):
        return {
            'MAP@10': np.mean([v['map'] for v in metrics.values()]),
            'MRR@10': np.mean([v['recip_rank'] for v in metrics.values()]),
            'nDCG@10': np.mean([v['ndcg'] for v in metrics.values()])
        }

    print("🔍 BM25 Eval:", summarize(metrics_bm25))
    print("🔍 Dense Eval:", summarize(metrics_dense))

    iface.launch()

Embedding: 100%|██████████| 24/24 [00:41<00:00,  1.71s/it]
Embedding: 100%|██████████| 1/1 [00:00<00:00, 58.98it/s]
Embedding: 100%|██████████| 1/1 [00:00<00:00, 56.05it/s]
Embedding: 100%|██████████| 1/1 [00:00<00:00, 57.07it/s]
Embedding: 100%|██████████| 1/1 [00:00<00:00, 52.62it/s]
Embedding: 100%|██████████| 1/1 [00:00<00:00, 55.47it/s]
Embedding: 100%|██████████| 1/1 [00:00<00:00, 62.47it/s]
Embedding: 100%|██████████| 1/1 [00:00<00:00, 66.08it/s]
Embedding: 100%|██████████| 1/1 [00:00<00:00, 54.00it/s]
Embedding: 100%|██████████| 1/1 [00:00<00:00, 52.22it/s]
Embedding: 100%|██████████| 1/1 [00:00<00:00, 49.92it/s]
Embedding: 100%|██████████| 1/1 [00:00<00:00, 48.69it/s]
Embedding: 100%|██████████| 1/1 [00:00<00:00, 48.74it/s]
Embedding: 100%|██████████| 1/1 [00:00<00:00, 53.97it/s]
Embedding: 100%|██████████| 1/1 [00:00<00:00, 66.54it/s]
Embedding: 100%|██████████| 1/1 [00:00<00:00, 66.51it/s]
Embedding: 100%|██████████| 1/1 [00:00<00:00, 64.41it/s]
Embedding: 100%|██████████| 1

🔍 BM25 Eval: {'MAP@10': 0.3424338624338624, 'MRR@10': 0.3424338624338624, 'nDCG@10': 0.48791479025996576}
🔍 Dense Eval: {'MAP@10': 0.2853042328042328, 'MRR@10': 0.2853042328042328, 'nDCG@10': 0.44621179937295}
* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


Embedding: 100%|██████████| 1/1 [00:00<00:00, 28.22it/s]
Both `max_new_tokens` (=150) and `max_length`(=900) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Embedding: 100%|██████████| 1/1 [00:00<00:00, 33.99it/s]
Both `max_new_tokens` (=150) and `max_length`(=900) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Embedding: 100%|██████████| 1/1 [00:00<00:00, 38.89it/s]
Both `max_new_tokens` (=150) and `max_length`(=900) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Embedding: 100%|██████████| 1/1 [00:00<00:00, 25.44it/s]
Both `max_new_tokens` 

for me i used tiny llama which scored better than gpt 2 ang got better answers i tried both

<h1 style="font-size:70px; color:red; font-weight:700;">Using tinyllama</h1>

In [1]:
# 📚 Imports
import json
import pandas as pd
from bs4 import BeautifulSoup
from rank_bm25 import BM25Okapi
import nltk
import numpy as np
import faiss
import gradio as gr
from collections import defaultdict
import os
import re
import torch
from sentence_transformers import SentenceTransformer
from llama_cpp import Llama
from tqdm import tqdm
import pytrec_eval  # Added for proper evaluation

# ✅ Download punkt tokenizer if missing
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download("punkt")

# 📂 Dataset paths
DATA_DIR = "stacklite_dataset"
DATA_FILE = os.path.join(DATA_DIR, "top_datascience_questions.json")
QUERY_FILE = os.path.join(DATA_DIR, "queries.json")
QRELS_FILE = os.path.join(DATA_DIR, "qrels.json")

# ✅ Ensure files exist
for file in [DATA_FILE, QUERY_FILE, QRELS_FILE]:
    if not os.path.exists(file):
        raise FileNotFoundError(f"Missing file: {file}. Please place it in {DATA_DIR}")

# 📑 Load dataset
with open(DATA_FILE, 'r', encoding='utf-8') as f:
    data = json.load(f)
df = pd.DataFrame(data)

# 🧼 Clean + preprocess
def clean_text(html):
    return BeautifulSoup(html, "html.parser").get_text(separator=" ")

df['text'] = df.apply(lambda row: row['title'] + " " + clean_text(row['body']), axis=1)

# 🔍 BM25
def safe_tokenize(text):
    try:
        return nltk.word_tokenize(text.lower())
    except LookupError:
        return re.findall(r'\b\w+\b', text.lower())

tokenized_corpus = [safe_tokenize(doc) for doc in df['text']]
bm25 = BM25Okapi(tokenized_corpus)

# 🤖 Dense Embeddings (MiniLM)
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def get_embeddings(texts, batch_size=32):
    return embedding_model.encode(texts, batch_size=batch_size, show_progress_bar=True)

# 📐 FAISS index
dense_embeddings = get_embeddings(df['text'].tolist())
dim = dense_embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(dense_embeddings.astype(np.float32))

# 🔁 RRF Fusion
def rrf(bm25_ids, dense_ids, k=60):
    scores = defaultdict(float)
    for rank, idx in enumerate(bm25_ids):
        scores[idx] += 1 / (k + rank + 1)
    for rank, idx in enumerate(dense_ids):
        scores[idx] += 1 / (k + rank + 1)
    return sorted(scores.keys(), key=scores.get, reverse=True)

# 🧠 Local Chat LLM (TinyLlama or similar, GGUF)
llm = Llama(model_path="./models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")  # ✅ Update path as needed

def generate_answer(prompt):
    output = llm(prompt, max_tokens=300, stop=["\n\n", "User:"], echo=False)
    return output["choices"][0]["text"].strip()

# 🧪 Load Queries + Qrels
with open(QUERY_FILE, "r", encoding='utf-8') as f:
    queries = json.load(f)
with open(QRELS_FILE, "r", encoding='utf-8') as f:
    qrels = json.load(f)

# 📊 Evaluation using pytrec_eval
def evaluate_search(method="bm25", top_k=10):
    # Prepare qrels in pytrec_eval format
    qrels_dict = {str(qid): {str(doc): 1 for doc in docs} for qid, docs in qrels.items()}
    
    # Build run dictionary
    run_dict = {}
    for q in tqdm(queries, desc=f"Evaluating {method}"):
        qid = str(q['id'])
        text = q['question']
        
        if method == "bm25":
            tok_q = safe_tokenize(text)
            scores = bm25.get_scores(tok_q)
            top_idxs = np.argsort(scores)[::-1][:top_k]
            run_dict[qid] = {str(df.iloc[i]['question_id']): float(scores[i]) for i in top_idxs}
        else:
            q_embedding = get_embeddings([text])
            distances, top_idxs = index.search(q_embedding.astype(np.float32), top_k)
            # Use negative distance as score (since smaller distance = better)
            run_dict[qid] = {str(df.iloc[i]['question_id']): float(-distances[0][j]) 
                            for j, i in enumerate(top_idxs[0])}
    
    # Evaluate
    evaluator = pytrec_eval.RelevanceEvaluator(qrels_dict, {'map', 'recip_rank', 'ndcg'})
    metrics = evaluator.evaluate(run_dict)
    
    # Aggregate metrics
    return {
        "MAP@10": np.mean([v['map'] for v in metrics.values()]),
        "MRR@10": np.mean([v['recip_rank'] for v in metrics.values()]),
        "nDCG@10": np.mean([v['ndcg'] for v in metrics.values()])
    }

# ✅ Run evaluation
print("🔍 BM25 Evaluation:", evaluate_search("bm25"))
print("🔍 Dense Evaluation:", evaluate_search("dense"))

# 💬 Hybrid QA
def hybrid_answer(query):
    tok_q = safe_tokenize(query)
    bm_scores = bm25.get_scores(tok_q)
    bm_top = np.argsort(bm_scores)[::-1][:30]
    
    q_embedding = get_embeddings([query])
    _, dense_top = index.search(q_embedding.astype(np.float32), 30)
    dense_top = dense_top[0]
    
    hybrid_ids = rrf(bm_top, dense_top)
    MAX_DOC_TOKENS = 100  # Tune based on your tests

    def truncate_text(text, max_words=100):
        return ' '.join(text.split()[:max_words])

    top_k_docs = [truncate_text(df.iloc[i]['text'], max_words=MAX_DOC_TOKENS) for i in hybrid_ids[:3] if i < len(df)]
    context = "\n\n".join([f"[Doc {i+1}]: {doc}" for i, doc in enumerate(top_k_docs)])

    prompt = f"""Context:
{context}

Question: {query}

Answer:"""
    answer = generate_answer(prompt)

    citations = []
    for i, idx in enumerate(hybrid_ids[:5]):
        if idx < len(df):
            title = df.iloc[idx]['title']
            question_id = df.iloc[idx]['question_id']
            citations.append(f"{i+1}. {title} (ID: {question_id})")
    
    return answer, "\n".join(citations)

# 🚀 Gradio Interface
iface = gr.Interface(
    fn=hybrid_answer,
    inputs=gr.Textbox(label="Technical Question", placeholder="Ask a data science question..."),
    outputs=[
        gr.Textbox(label="Generated Answer"),
        gr.Textbox(label="Source Citations")
    ],
    title="📚 Hybrid QA Assistant (StackLite)",
    description="""🔍 Ask technical questions! This assistant uses:
- BM25 for keyword search
- MiniLM embeddings for semantic search
- Hybrid ranking (RRF fusion)
- Local LLM (TinyLlama via llama-cpp) for answer generation""",
    examples=[
        "How to handle missing values in a dataset?",
    "What's the difference between random forest and gradient boosting?",
    "How to implement a neural network in PyTorch?",
    "When should I use L1 vs L2 regularization?",
    "How does PCA reduce dimensionality?",
    "Explain cross-validation and why it's important.",
    "How to optimize hyperparameters in scikit-learn?",
    "What is the bias-variance tradeoff?",
    "How to deploy a model with Flask?",
    "What are attention mechanisms in transformers?"
    ]
)

if __name__ == "__main__":
    iface.launch()




Batches:   0%|          | 0/24 [00:00<?, ?it/s]

llama_model_loader: loaded meta data with 23 key-value pairs and 201 tensors from ./models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = tinyllama_tinyllama-1.1b-chat-v1.0
llama_model_loader: - kv   2:                       llama.context_length u32              = 2048
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 2048
llama_model_loader: - kv   4:                          llama.block_count u32              = 22
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 5632
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 64
llama_model_loader: - kv   7:            

🔍 BM25 Evaluation: {'MAP@10': 0.961111111111111, 'MRR@10': 0.961111111111111, 'nDCG@10': 0.9710309917857153}


Evaluating dense:   0%|          | 0/30 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating dense:   3%|▎         | 1/30 [00:00<00:03,  7.44it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating dense:  20%|██        | 6/30 [00:00<00:00, 28.14it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating dense:  37%|███▋      | 11/30 [00:00<00:00, 33.11it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating dense:  53%|█████▎    | 16/30 [00:00<00:00, 36.72it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating dense:  67%|██████▋   | 20/30 [00:00<00:00, 36.96it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating dense:  80%|████████  | 24/30 [00:00<00:00, 37.44it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating dense:  97%|█████████▋| 29/30 [00:00<00:00, 40.45it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating dense: 100%|██████████| 30/30 [00:00<00:00, 36.11it/s]


🔍 Dense Evaluation: {'MAP@10': 1.0, 'MRR@10': 1.0, 'nDCG@10': 1.0}
* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

llama_perf_context_print:        load time =    2965.17 ms
llama_perf_context_print: prompt eval time =    2964.50 ms /   390 tokens (    7.60 ms per token,   131.56 tokens per second)
llama_perf_context_print:        eval time =      58.30 ms /     2 runs   (   29.15 ms per token,    34.31 tokens per second)
llama_perf_context_print:       total time =    3024.60 ms /   392 tokens


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Llama.generate: 9 prefix-match hit, remaining 319 prompt tokens to eval
llama_perf_context_print:        load time =    2965.17 ms
llama_perf_context_print: prompt eval time =    2395.99 ms /   319 tokens (    7.51 ms per token,   133.14 tokens per second)
llama_perf_context_print:        eval time =     880.08 ms /    39 runs   (   22.57 ms per token,    44.31 tokens per second)
llama_perf_context_print:       total time =    3289.83 ms /   358 tokens


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Llama.generate: 9 prefix-match hit, remaining 461 prompt tokens to eval
llama_perf_context_print:        load time =    2965.17 ms
llama_perf_context_print: prompt eval time =    3327.74 ms /   461 tokens (    7.22 ms per token,   138.53 tokens per second)
llama_perf_context_print:        eval time =     964.50 ms /    41 runs   (   23.52 ms per token,    42.51 tokens per second)
llama_perf_context_print:       total time =    4308.23 ms /   502 tokens


llamma produced better output than gpt 2 in my case and scored better 