# ReasonForge

Deterministic math & code tools for small language models.

1. Clone repo & install deps
2. Install Ollama & pull model
3. Sanity tests
4. MATH-500 / HumanEval benchmarks
5. Gradio chat UI

---
## Setup

In [None]:
# Clone repo & install deps
# The project is checked out to /content/MCP; later cells chdir into that
# directory before running its test and UI modules.
!git clone https://github.com/RoyCoding8/MCP.git /content/MCP
# NOTE(review): omegaconf is removed before the install below — presumably to
# avoid a dependency conflict with one of those packages; confirm.
!pip uninstall -y -q omegaconf
# Runtime dependencies for the tools, benchmarks, result plots and chat UI.
!pip install -q "gradio>=6.0" sympy datasets "math-verify[antlr4_13_2]" pandas matplotlib

In [None]:
# Install Ollama
# NOTE(review): zstd is installed first — presumably a prerequisite of the
# Ollama installer; confirm.
!sudo apt-get install -qq zstd
# Download the official install script and run it. Both stdout and stderr of
# the script are discarded, so an installation failure is silent in this cell.
!curl -fsSL https://ollama.com/install.sh | sh > /dev/null 2>&1

In [None]:
import subprocess, time, os
import requests

# Start a fresh Ollama server in the background and wait until it answers HTTP.
#
# Defines:
#   MODELS_PATH - directory where Ollama stores pulled models (reused by the
#                 pull cell through the OLLAMA_MODELS environment variable).
#   proc        - the subprocess.Popen handle of the running `ollama serve`.
MODELS_PATH = '/content/ollama_models'
os.makedirs(MODELS_PATH, exist_ok=True)
print(f'Models path: {MODELS_PATH}')

# Kill any process whose command line mentions "ollama" so that re-running
# this cell does not leave two servers competing for the same port.
subprocess.run(['pkill', '-f', 'ollama'], capture_output=True)
time.sleep(2)

# Server configuration is passed through the child's environment only; the
# notebook's own os.environ is left untouched.
env = os.environ.copy()
env['OLLAMA_MODELS'] = MODELS_PATH
env['OLLAMA_HOST'] = '0.0.0.0:11434'
env['OLLAMA_NUM_PARALLEL'] = '15'

print("Starting Ollama server...")
# The child gets its own copy of the log file descriptor, so the notebook's
# handle can be closed as soon as the process has been spawned instead of
# staying open for the lifetime of the kernel.
with open('/content/ollama_server.log', 'w') as log_file:
    proc = subprocess.Popen(
        ['ollama', 'serve'],
        env=env,
        stdout=log_file,
        stderr=subprocess.STDOUT,
    )

# Poll the HTTP endpoint for up to 30 seconds.
start = time.time()
while time.time() - start < 30:
    # If the server process has already terminated there is nothing to wait for.
    if proc.poll() is not None:
        print(f'Ollama exited early (code {proc.returncode}) — check /content/ollama_server.log')
        break
    try:
        # A per-request timeout keeps a hung connection from blocking the
        # loop past its overall deadline.
        requests.get('http://localhost:11434/', timeout=2)
    except (requests.ConnectionError, requests.Timeout):
        time.sleep(1)
    else:
        print(f'Ollama ready ({time.time()-start:.2f}s)')
        break
else:
    # Reached only when the deadline expired without a successful request.
    print('Ollama failed to start — check /content/ollama_server.log')

In [None]:
# Pull Models
# MODELS lists the Ollama model tags to download; the benchmark cells read
# this list to choose the model under test.
MODELS = ['qwen3:8b','qwen3:32b']

# Note: MODEL stays bound to the last entry of MODELS after the loop ends.
for MODEL in MODELS:
    print(f'Pulling {MODEL}...')
    # IPython expands {MODELS_PATH} and {MODEL} before handing the line to the
    # shell; OLLAMA_MODELS is set to the same path the server was started with.
    !OLLAMA_MODELS={MODELS_PATH} ollama pull {MODEL}

---
## Verify

In [None]:
# Run the project's sanity tests. The working directory is switched to the
# checkout first so that `python -m tests.sanity` can resolve the `tests`
# package relative to it.
os.chdir('/content/MCP')
!python -m tests.sanity

---
## Benchmarks

A/B comparison: Baseline (no tools) vs ReasonForge (with tools).

In [None]:
# MATH-500 Benchmark
N_MATH = 5
SKIP_BASELINE = False
THINK = True

from google.colab import userdata
try: os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')
except: print("HF_TOKEN not found in Secrets.")

os.chdir('/content/MCP')
cmd = f'python -m tests.benchmark --model {MODELS[0]} --n {N_MATH}'
if SKIP_BASELINE: cmd += ' --skip-baseline'
if THINK: cmd += ' --think'
print(f'Running: {cmd}\n')
!{cmd}

In [None]:
# Visualize MATH-500 results
import pandas as pd, matplotlib.pyplot as plt, json as _json
from pathlib import Path

# Load the most recently modified MATH-500 report, print its headline numbers,
# show the per-problem table and plot accuracy charts. The chart is also saved
# as math_results.png next to the reports.
results_dir = Path('/content/MCP/tests/results')
# The HumanEval visualisation reads its reports from this same directory under
# the code_*.json pattern; those are excluded here so that the newest file is
# never a code-benchmark report with a different schema.
files = sorted(
    (p for p in results_dir.glob('*.json') if not p.name.startswith('code_')),
    key=lambda p: p.stat().st_mtime,
)
if not files:
    print('No results found. Run the benchmark first.')
else:
    latest = files[-1]
    print(f'Loading: {latest.name}\n')
    with open(latest) as f:
        report = _json.load(f)

    df = pd.DataFrame(report['results'])
    print(f"Model: {report['model']}  |  N={report['n']}  |  Seed={report['seed']}")
    print(f"RF Accuracy: {report['rf_accuracy']:.1%}")
    if report.get('baseline_accuracy') is not None:
        print(f"Baseline Accuracy: {report['baseline_accuracy']:.1%}")
        print(f"Delta: {report['delta']:+.1%}")
    print()

    # Show only the columns that are actually present: the baseline check
    # further down already allows for 'baseline_correct' being absent, and a
    # fixed column list would raise KeyError in that case.
    wanted_columns = [
        'type', 'level', 'expected',
        'baseline_answer', 'baseline_correct',
        'rf_answer', 'rf_correct', 'rf_rounds', 'rf_used_tools', 'weight',
    ]
    display(df[[col for col in wanted_columns if col in df.columns]])

    # Two side-by-side comparison charts when baseline results exist,
    # otherwise a single ReasonForge-only chart by difficulty.
    has_baseline = 'baseline_correct' in df.columns and df['baseline_correct'].any()
    if has_baseline:
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))

        # By difficulty
        lvl = df.groupby('level').agg(Baseline=('baseline_correct','mean'), ReasonForge=('rf_correct','mean')).sort_index()
        lvl.plot.bar(ax=axes[0], rot=0, color=['#94a3b8','#3b82f6'])
        axes[0].set_title('Accuracy by Difficulty')

        # By category
        cat = df.groupby('type').agg(Baseline=('baseline_correct','mean'), ReasonForge=('rf_correct','mean')).sort_index()
        cat.plot.bar(ax=axes[1], rot=30, color=['#94a3b8','#3b82f6'])
        axes[1].set_title('Accuracy by Category')

        for axis in axes:
            axis.set_ylabel('Accuracy')
            axis.set_ylim(0, 1.05)
            axis.legend(loc='upper right')
    else:
        fig, ax = plt.subplots(figsize=(7, 5))
        lvl = df.groupby('level')['rf_correct'].mean().sort_index()
        lvl.plot.bar(ax=ax, rot=0, color='#3b82f6')
        ax.set_title('ReasonForge Accuracy by Difficulty')
        ax.set_ylabel('Accuracy')
        ax.set_ylim(0, 1.05)

    # Layout, saving and display are the same for both chart variants.
    plt.tight_layout()
    plt.savefig('/content/MCP/tests/results/math_results.png', dpi=150, bbox_inches='tight')
    plt.show()

In [None]:
# HumanEval Code Benchmark
N_CODE = 2
SKIP_BASELINE_CODE = False
THINK_CODE = True
SEED = 42

os.chdir('/content/MCP')
cmd = f'python -m tests.code_benchmark --model {MODEL} --n {N_CODE} --seed {SEED}'
if SKIP_BASELINE_CODE: cmd += ' --skip-baseline'
if THINK_CODE: cmd += ' --think'
print(f'Running: {cmd}\n')
!{cmd}

In [None]:
# Visualize HumanEval results
# Loads the most recently modified code_*.json report, prints its headline
# pass rates, shows the per-problem table and draws a summary bar chart that
# is also saved as code_results.png next to the reports.
# Relies on Path, _json, pd and plt imported by the MATH-500 visualisation cell.
results_dir = Path('/content/MCP/tests/results')
files = sorted(results_dir.glob('code_*.json'), key=lambda p: p.stat().st_mtime)
if not files:
    print('No code benchmark results found. Run the benchmark first.')
else:
    latest = files[-1]
    print(f'Loading: {latest.name}\n')
    with open(latest) as f:
        report = _json.load(f)

    df = pd.DataFrame(report['results'])
    print(f"Model: {report['model']}  |  N={report['n']}  |  Seed={report['seed']}")
    print(f"RF Pass@1: {report['rf_pass1']:.1%}")
    has_baseline = report.get('baseline_pass1') is not None
    if has_baseline:
        print(f"Baseline Pass@1: {report['baseline_pass1']:.1%}")
        print(f"Delta: {report['delta']:+.1%}")
    print()

    display(df)

    # Summary bar chart: the baseline bar (when present) is drawn to the left
    # of the ReasonForge bar.
    labels = ['ReasonForge']
    vals = [report['rf_pass1']]
    colors = ['#3b82f6']
    if has_baseline:
        labels.insert(0, 'Baseline')
        vals.insert(0, report['baseline_pass1'])
        colors.insert(0, '#94a3b8')

    fig, ax = plt.subplots(figsize=(6, 4))
    bars = ax.bar(labels, vals, color=colors, width=0.5)
    # Print each bar's value as a percentage just above it.
    for bar, v in zip(bars, vals):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02, f'{v:.0%}', ha='center', fontweight='bold')
    # Headroom above 1.0 keeps the label of a 100% bar inside the axes.
    ax.set_ylim(0, 1.15)
    ax.set_ylabel('Pass Rate')
    # The separator in the title was a mis-encoded character; restored as an
    # em dash.
    ax.set_title(f'HumanEval — {report["model"]}')
    plt.tight_layout()
    plt.savefig('/content/MCP/tests/results/code_results.png', dpi=150, bbox_inches='tight')
    plt.show()

---

In [None]:
# Launch the Gradio chat UI from the repository root.
# NOTE(review): RF_SHARE='1' presumably tells ui.app to create a public share
# link — confirm against the app's code.
os.environ['RF_SHARE'] = '1'
os.chdir('/content/MCP')

# -u keeps Python's output unbuffered so it shows up in the cell as it is
# produced; tee echoes it to the cell and appends a copy to the log file.
# NOTE(review): this is the same file the Ollama server cell opened with mode
# 'w' and is still writing to, so the two writers may overwrite each other's
# output — consider a separate log file for the UI.
!python -u -m ui.app | tee -a /content/ollama_server.log

---

In [None]:
# !OLLAMA_MODELS=$MODELS_PATH ollama ps
# print()
# !OLLAMA_MODELS=$MODELS_PATH ollama list

In [None]:
# !rm -rf /content/MCP
# !rm /content/ollama_server.log