# Gonyai-v1 — Extended Benchmark: 160M vs 1B+ Models
### Investigating Specialization vs. Scale in Low-Resource Konkani NLP

**Models Tested:**
* **Specialized:** Gonyai-v1 (160M)
* **Sub-1B Baselines:** SmolLM2 (360M), Qwen2.5 (0.5B)
* **1B+ Challengers:** Gemma 3 (1B), TinyLlama (1.1B), Qwen2.5 (1.5B)

**Author:** Omdeep (stud.odb1@gec.ac.in)

In [None]:
import torch, math, numpy as np, pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from transformers import AutoModelForCausalLM, AutoTokenizer

# Optional dependency: the dedicated Gemma 3 class only exists in newer
# transformers releases. Record whether it could be imported so the loader
# can choose between it and the generic Auto class.
try:
    from transformers import Gemma3ForCausalLM
except ImportError:
    GEMMA3_AVAILABLE = False
    print("⚠ Gemma3ForCausalLM not found — update transformers: pip install -q -U transformers")
else:
    GEMMA3_AVAILABLE = True

In [None]:
# Benchmark roster, in evaluation order. Each entry carries:
#   display  - label used in tables/plots (contains a newline before the size)
#   hf_id    - Hugging Face Hub repository id
#   params_m - nominal parameter count in millions
#   group    - colour/group key: "specialized", "sub1b" or "1b_plus"
#   loader   - which loading path to use: "auto" or "gemma3"
MODELS = [
    {
        "display": "Gonyai-v1\n(160M)",
        "hf_id": "omdeep22/Gonyai-v1",
        "params_m": 160,
        "group": "specialized",
        "loader": "auto",
    },
    {
        "display": "SmolLM2\n(360M)",
        "hf_id": "HuggingFaceTB/SmolLM2-360M-Instruct",
        "params_m": 360,
        "group": "sub1b",
        "loader": "auto",
    },
    {
        "display": "Qwen2.5\n(0.5B)",
        "hf_id": "Qwen/Qwen2.5-0.5B-Instruct",
        "params_m": 500,
        "group": "sub1b",
        "loader": "auto",
    },
    {
        "display": "Gemma3\n(1B)",
        "hf_id": "google/gemma-3-1b-it",
        "params_m": 1000,
        "group": "1b_plus",
        "loader": "gemma3",
    },
    {
        "display": "TinyLlama\n(1.1B)",
        "hf_id": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        "params_m": 1100,
        "group": "1b_plus",
        "loader": "auto",
    },
    {
        "display": "Qwen2.5\n(1.5B)",
        "hf_id": "Qwen/Qwen2.5-1.5B-Instruct",
        "params_m": 1500,
        "group": "1b_plus",
        "loader": "auto",
    },
]

In [None]:
# Konkani evaluation sentences, grouped by register. The dict keys are reused
# verbatim to build the per-domain result columns ("PPL <key>").
DOMAINS = {
    "Poetic": [
        "सांजवेळार दर्यावेळेर बसतना मनाक वेगळीच शांती जाणवता.",
        "पावसाचे थेंब माटयेर पडटात तेन्ना रानाचो सुगंध पांगता.",
        "चंद्राच्या उजवाडांत न्हंयच्या उदकाचें प्रतिबिंब दिसता.",
        "फुलाचो रंग पळेतना काळजांत मोगाची लाट उठता.",
    ],
    "Conversational": [
        "आज जेवणाक कितें केलांस, भूक लागल्या.",
        "तूं कोठून आयलो, इतल्या उशिरान?",
        "बाजारांत गेल्लो तेन्ना तुजो भाव भेटलो.",
        "ये, बस, चहा पी, मागीर उलोव.",
    ],
    "Storytelling": [
        "एके सांजे म्हातारो दर्याकांठार बसून काणी सांगतालो.",
        "त्या गांवांत एक सोबीत चेडूं आशिल्ली.",
        "राजाच्या वाड्यांत अशें घडलें जें कोणें पळेल्लें नाशिल्लें.",
        "धुक्यान भरिल्ल्या वाटेर ल्हान भुरगो एकलोच भोंवतालो.",
    ],
    "Grammar/Prose": [
        "ताणें सांगिल्लें काम ताणेंच केल्लें नाशिल्ल्यान राग आयलो.",
        "जर तूं वेळार आयलो आसतो तर आमी एकठांय वचूं येतालें.",
        "ती रडपाक लागली कारण तिचो आवडटो फूल तोडिल्लो.",
        "भुरग्यांनी खेळ संपयलो आनी घरा परत येवन जेवण केलें.",
    ],
}

# Control sentences in two related Devanagari-script languages, used for the
# cross-lingual sanity comparison against the Konkani perplexity.
MARATHI_CONTROL = [
    "संध्याकाळी समुद्रकिनारी बसताना मनाला एक वेगळीच शांती जाणवते.",
    "पावसाचे थेंब जमिनीवर पडतात तेव्हा मातीचा सुगंध सर्वत्र पसरतो.",
]
HINDI_CONTROL = [
    "शाम को समुद्र के किनारे बैठने पर मन को एक अलग ही शांति मिलती है।",
    "बारिश की बूंदें जब जमीन पर गिरती हैं तो मिट्टी की खुशबू फैल जाती है।",
]

In [None]:
# Run on the GPU when torch reports CUDA, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

def load_model_and_tokenizer(cfg):
    """Load one benchmark model and its tokenizer from the Hugging Face Hub.

    Args:
        cfg: Model entry providing at least ``"hf_id"`` (Hub repository id)
            and ``"loader"`` (``"gemma3"`` selects the dedicated Gemma 3
            class when it was importable; anything else uses the Auto class).

    Returns:
        A ``(model, tokenizer)`` tuple. The model is put in eval mode and
        placed on GPU 0 in bfloat16 when ``DEVICE`` is ``"cuda"``, otherwise
        on the CPU in float32.
    """
    hf_id = cfg["hf_id"]
    loader = cfg["loader"]
    on_gpu = DEVICE == "cuda"

    # Arguments shared by both loading paths.
    load_kwargs = {
        "torch_dtype": torch.bfloat16 if on_gpu else torch.float32,
        "device_map": {"": 0} if on_gpu else {"": "cpu"},
    }

    # NOTE(review): trust_remote_code=True is passed for every repository,
    # not only the ones that ship custom code - confirm this is intended.
    tokenizer = AutoTokenizer.from_pretrained(hf_id, trust_remote_code=True)
    if tokenizer.pad_token is None:
        # Reuse EOS as the padding token when the tokenizer defines none.
        tokenizer.pad_token = tokenizer.eos_token

    use_gemma3_class = loader == "gemma3" and GEMMA3_AVAILABLE
    if use_gemma3_class:
        model = Gemma3ForCausalLM.from_pretrained(hf_id, **load_kwargs)
    else:
        model = AutoModelForCausalLM.from_pretrained(hf_id, trust_remote_code=True, **load_kwargs)
    return model.eval(), tokenizer

def compute_ppl(model, tokenizer, sentences):
    """Compute corpus-level token perplexity of ``model`` over ``sentences``.

    Each sentence is scored on its own: the negative log-likelihood of every
    token given its prefix is summed, and the total is divided by the number
    of predicted tokens across all sentences before exponentiating.

    NOTE: the average is per token, so the value depends on how the given
    tokenizer segments the text.

    Args:
        model: Causal LM callable on a ``(1, seq)`` tensor of token ids; its
            output must expose ``.logits`` or have the logits at index 0.
        tokenizer: Callable returning a mapping with ``"input_ids"`` when
            invoked as ``tokenizer(text, return_tensors="pt")``.
        sentences: Iterable of strings to score.

    Returns:
        The perplexity as a float, capped at ``exp(20)``, or ``None`` when
        ``sentences`` is empty or no sentence has at least two tokens.
    """
    if not sentences:
        return None

    # Put inputs wherever the model's weights live; fall back to the
    # notebook-wide DEVICE for objects that do not expose parameters.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = DEVICE

    total_nll, total_tokens = 0.0, 0
    with torch.no_grad():
        for sentence in sentences:
            ids = tokenizer(sentence, return_tensors="pt")["input_ids"].to(device)
            if ids.shape[1] < 2:
                # A single token has no next-token target to score.
                continue
            out = model(ids)
            logits = out.logits if hasattr(out, "logits") else out[0]
            targets = ids[:, 1:].reshape(-1)
            # Upcast before the softmax: models are loaded in bfloat16 on GPU
            # and a reduced-precision log-softmax/sum distorts the NLL.
            shifted = logits[:, :-1, :].float().reshape(-1, logits.size(-1))
            loss = torch.nn.functional.cross_entropy(shifted, targets, reduction="sum")
            total_nll += loss.item()
            total_tokens += targets.numel()

    if total_tokens == 0:
        return None
    # Cap the exponent so a degenerate model cannot overflow math.exp.
    return math.exp(min(total_nll / total_tokens, 20))

def get_token_stats(tokenizer, sentences):
    """Return the total token count and fertility (tokens per word).

    Special tokens (BOS/EOS etc.) are excluded from the count: tokenizers
    differ in whether they add them, which would otherwise skew the
    cross-tokenizer comparison by one or more tokens per sentence.

    Args:
        tokenizer: Object with ``encode(text, add_special_tokens=False)``
            returning a sequence of token ids.
        sentences: Iterable of strings; words are whitespace-separated.

    Returns:
        ``(total_tokens, fertility)``. ``fertility`` is NaN when the input
        contains no words, instead of raising ``ZeroDivisionError``.
    """
    sentences = list(sentences)
    total_tokens = sum(len(tokenizer.encode(s, add_special_tokens=False)) for s in sentences)
    total_words = sum(len(s.split()) for s in sentences)
    fertility = total_tokens / total_words if total_words else float("nan")
    return total_tokens, fertility

In [None]:
# Evaluate every model in MODELS and collect one result row per model.
results = []
base_tok_count = 0  # token count of the first model; reference for Token_Tax
all_konkani = [s for sents in DOMAINS.values() for s in sents]

for i, cfg in enumerate(MODELS):
    model, tokenizer = load_model_and_tokenizer(cfg)
    t_count, fertility = get_token_stats(tokenizer, all_konkani)
    if i == 0:
        # The first entry in MODELS is the baseline all others are compared to.
        base_tok_count = t_count

    row = {
        "Model": cfg["display"],
        "Group": cfg["group"],
        "Params_M": cfg["params_m"],
        "Fertility": fertility,
        # Extra tokens (in %) this tokenizer needs relative to the baseline.
        "Token_Tax": ((t_count / base_tok_count) - 1) * 100 if i > 0 else 0,
        # Approximate number of words that fit in a 2048-token window.
        "Context": 2048 / fertility,
    }

    for d, s in DOMAINS.items():
        row[f"PPL {d}"] = compute_ppl(model, tokenizer, s)
    row["PPL Overall"] = compute_ppl(model, tokenizer, all_konkani)
    row["PPL Marathi"] = compute_ppl(model, tokenizer, MARATHI_CONTROL)
    row["PPL Hindi"] = compute_ppl(model, tokenizer, HINDI_CONTROL)

    # Sanity check: passes only when Konkani perplexity is lower than on both
    # control languages. compute_ppl may return None, so a missing value on
    # either side is reported as a failure rather than crashing min().
    ov = row["PPL Overall"]
    controls = [row["PPL Marathi"], row["PPL Hindi"]]
    sanity_ok = bool(ov) and all(c is not None for c in controls) and ov < min(controls)
    row["Sanity"] = "PASS ✓" if sanity_ok else "FAIL ✗"
    results.append(row)

    # Drop the model before loading the next one to keep peak memory down.
    del model
    torch.cuda.empty_cache()

In [None]:
# 3x3 grid of panels for the benchmark figure.
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(20, 18))

# One colour per model group; bars are coloured by each result row's "Group".
colors = {
    "specialized": "#E53935",
    "sub1b": "#1E88E5",
    "1b_plus": "#43A047",
}
bar_clrs = [colors[group] for group in (rec["Group"] for rec in results)]

# Display names contain a newline before the size; flatten it for tick labels.
lbls = [" ".join(rec["Model"].split("\n")) for rec in results]

def plot_bar(ax, key, title, is_pct=False):
    """Draw one bar per result row for the metric ``key`` on ``ax``.

    Reads the notebook-level ``results``, ``lbls`` and ``bar_clrs``.

    Args:
        ax: Matplotlib axes to draw on.
        key: Result-row key of the metric to plot. Rows lacking the key are
            plotted as 0; rows whose stored value is ``None`` are plotted as
            a zero-height bar labelled "n/a" instead of raising.
        title: Panel title.
        is_pct: When true, value labels get a trailing ``%``.
    """
    raw = [r.get(key, 0) for r in results]
    # .get's default only covers missing keys; a stored None must be mapped
    # explicitly or both ax.bar and the format spec below would fail.
    vals = [0 if v is None else v for v in raw]
    ax.bar(lbls, vals, color=bar_clrs, edgecolor="black")
    ax.set_title(title, fontweight="bold")
    for i, (v, original) in enumerate(zip(vals, raw)):
        if original is None:
            text = "n/a"
        elif is_pct:
            text = f"{v:.1f}%"
        else:
            text = f"{v:.1f}"
        ax.text(i, v, text, ha="center", va="bottom")

# Row 1: Konkani perplexity (two domains + overall).
plot_bar(axes[0,0], "PPL Poetic", "Poetic PPL (↓)")
plot_bar(axes[0,1], "PPL Conversational", "Conv PPL (↓)")
plot_bar(axes[0,2], "PPL Overall", "Overall PPL (↓)")
# Row 2: tokenizer efficiency.
plot_bar(axes[1,0], "Fertility", "Fertility Rate (↓)")
plot_bar(axes[1,1], "Token_Tax", "Token Tax vs Gonyai (% ↑)", True)
plot_bar(axes[1,2], "Context", "Words per 2k Window (↑)")
# Row 3: the two remaining domains, which are computed per model but were
# previously left unplotted (their axes rendered as empty frames).
plot_bar(axes[2,0], "PPL Storytelling", "Storytelling PPL (↓)")
plot_bar(axes[2,1], "PPL Grammar/Prose", "Grammar/Prose PPL (↓)")

# Panel 9: Konkani vs. control-language perplexity, grouped per model.
ax9 = axes[2,2]; x = np.arange(len(lbls)); w = 0.25
ax9.bar(x-w, [r["PPL Overall"] for r in results], w, label="Konkani", color="#E53935")
ax9.bar(x, [r["PPL Marathi"] for r in results], w, label="Marathi", color="#1E88E5")
ax9.bar(x+w, [r["PPL Hindi"] for r in results], w, label="Hindi", color="#43A047")
# Grouped bars sit on numeric positions, so the model names must be attached
# explicitly; otherwise the axis only shows 0..N-1.
ax9.set_xticks(x); ax9.set_xticklabels(lbls)
ax9.set_title("Cross-Lingual Sanity"); ax9.legend()

# Figure-level key for the group colours used by the single-metric panels
# (panel 9 uses the same hues for languages and keeps its own legend).
group_names = {"specialized": "Specialized", "sub1b": "Sub-1B Baselines", "1b_plus": "1B+ Challengers"}
group_handles = [
    mpatches.Patch(facecolor=clr, edgecolor="black", label=group_names.get(grp, grp))
    for grp, clr in colors.items()
]
fig.legend(handles=group_handles, loc="upper center", ncol=len(group_handles), title="Model group (bar panels)")

# Reserve a strip at the top so the figure legend does not cover the titles.
plt.tight_layout(rect=(0, 0, 1, 0.95)); plt.show()