# ReasonForge

Deterministic math & code tools for small language models.

1. Clone repo & install deps
2. Install Ollama & pull model
3. Sanity tests
4. MATH-500 / HumanEval benchmarks
5. Gradio chat UI

---
## Setup

In [None]:
# Clone repo & install deps
!git clone https://github.com/RoyCoding8/MCP.git /content/MCP
!pip uninstall -y -q omegaconf
!pip install -q "gradio>=6.0" sympy datasets "math-verify[antlr4_13_2]" pandas matplotlib

In [None]:
# Install Ollama
# zstd is installed first — presumably needed by the Ollama install script (TODO confirm).
!sudo apt-get install -qq zstd
# The install script's stdout and stderr are redirected to /dev/null, so a failed
# install prints nothing in this cell.
!curl -fsSL https://ollama.com/install.sh | sh > /dev/null 2>&1

In [None]:
from google.colab import drive
# Mount Google Drive so benchmark output written under RESULTS_DIR lives on Drive
# rather than only on the Colab VM's local disk.
drive.mount('/content/drive')

# Directory handed to the benchmark modules via --results-dir and read back by
# load_latest_result(); also where the result plots are saved.
RESULTS_DIR = '/content/drive/MyDrive/ReasonForge/results'
!mkdir -p {RESULTS_DIR}

In [None]:
import subprocess, time, os, requests
from pathlib import Path
import json as _json

# Local directory for Ollama model files; exported as OLLAMA_MODELS both to the
# server (start_ollama) and to the `ollama pull` commands.
MODELS_PATH = '/content/ollama_models'
os.makedirs(MODELS_PATH, exist_ok=True)

# I am using a G4 GPU with ~96GB VRAM; adjust the values according to your GPU.
# Maps model tag -> value passed to start_ollama(), which exports it as
# OLLAMA_NUM_PARALLEL before launching the server for that model's benchmark run.
MODEL_CONFIG = {
    'qwen3:8b':  15,
    'qwen3:32b': 7,
    # 'qwen3:4b':  30,
}

In [None]:
# Helper functions
def start_ollama(num_parallel=None, timeout=30):
    """Restart the local Ollama server and wait until it answers HTTP requests.

    Any process whose command line matches ``ollama`` is killed first, then
    ``ollama serve`` is launched in the background with ``OLLAMA_MODELS`` set to
    ``MODELS_PATH`` and ``OLLAMA_HOST`` set to ``0.0.0.0:11434``. Server output
    is appended to ``/content/ollama_server.log``.

    Args:
        num_parallel: If not None, exported to the server as
            ``OLLAMA_NUM_PARALLEL``.
        timeout: Maximum number of seconds to wait for the server to respond.

    Returns:
        True once ``http://localhost:11434/`` answers, False if the server
        process exits or the timeout elapses first.
    """
    # Stop any previous server so the new environment actually takes effect.
    subprocess.run(['pkill', '-f', 'ollama'], capture_output=True)
    time.sleep(2)

    env = os.environ.copy()
    env['OLLAMA_MODELS'] = MODELS_PATH
    env['OLLAMA_HOST'] = '0.0.0.0:11434'
    if num_parallel is not None:
        env['OLLAMA_NUM_PARALLEL'] = str(num_parallel)

    # The child process receives its own copy of the descriptor, so the parent's
    # handle is closed right after spawning instead of leaking on every restart.
    with open('/content/ollama_server.log', 'a') as log_file:
        server = subprocess.Popen(
            ['ollama', 'serve'],
            env=env,
            stdout=log_file,
            stderr=subprocess.STDOUT,
        )

    start = time.time()
    while time.time() - start < timeout:
        try:
            # A per-request timeout keeps a half-open socket from blocking the
            # loop past the overall deadline.
            requests.get('http://localhost:11434/', timeout=2)
        except requests.RequestException:
            # Nothing is answering; if the server process has already died
            # there is no point in waiting out the rest of the timeout.
            if server.poll() is not None:
                break
            time.sleep(1)
            continue
        print(f'Ollama ready ({time.time()-start:.2f}s)',
              f'  parallel={num_parallel}' if num_parallel else '')
        return True
    print('Ollama failed to start — check /content/ollama_server.log')
    return False


def run_benchmark_suite(benchmark_module, n, skip_baseline, think, separator='-',seed=42):
    os.chdir('/content/MCP')
    for model, num_parallel in MODEL_CONFIG.items():
        start_ollama(num_parallel)
        print(f'\n{separator*56}')
        print(f'  {model}  (parallel={num_parallel})')
        print(f'{separator*56}')
        cmd = f'python -m {benchmark_module} --model {model} --n {n} --seed {seed}'
        cmd += f' --results-dir {RESULTS_DIR}'
        if skip_baseline: cmd += ' --skip-baseline'
        if think:         cmd += ' --think'
        !{cmd}


def load_latest_result(glob_pattern='*.json'):
    """Load the most recently modified JSON report in RESULTS_DIR.

    Args:
        glob_pattern: Glob used to select candidate files inside RESULTS_DIR.

    Returns:
        ``(report, path)`` for the newest matching file by modification time,
        or ``(None, None)`` after printing a hint when nothing matches.
    """
    candidates = list(Path(RESULTS_DIR).glob(glob_pattern))
    if len(candidates) == 0:
        print(f'No results matching {glob_pattern!r}. Run the benchmark first.')
        return None, None

    # Oldest first, so the final element is the newest report.
    candidates.sort(key=lambda path: path.stat().st_mtime)
    newest = candidates[-1]
    print(f'Loading: {newest.name}\n')
    with open(newest) as handle:
        parsed = _json.load(handle)
    return parsed, newest


def print_report_header(report, accuracy_key='rf_accuracy', baseline_key='baseline_accuracy', delta_key='delta'):
    """Print a short summary of a benchmark report.

    Args:
        report: Mapping with 'model', 'n', 'seed' and ``accuracy_key`` entries;
            ``baseline_key`` and ``delta_key`` are optional.
        accuracy_key: Key of the primary score, printed as a percentage.
        baseline_key: Key of the baseline score; the baseline and delta lines
            are printed only when this entry is present and not None.
        delta_key: Key of the stored score difference. When it is absent (or
            None) the delta is computed as primary score minus baseline
            instead of raising KeyError.
    """
    print(f"Model: {report['model']}  |  N={report['n']}  |  Seed={report['seed']}")
    print(f"{accuracy_key.replace('_',' ').title()}: {report[accuracy_key]:.1%}")
    baseline = report.get(baseline_key)
    if baseline is not None:
        print(f"{baseline_key.replace('_',' ').title()}: {baseline:.1%}")
        delta = report.get(delta_key)
        if delta is None:
            # Not every report stores the delta; derive it from the two scores.
            delta = report[accuracy_key] - baseline
        print(f"Delta: {delta:+.1%}")
    print()

In [None]:
# Launch the server without setting OLLAMA_NUM_PARALLEL, before the models are
# pulled in the next cell. The True/False return value is not checked here.
start_ollama()

In [None]:
# Download every model listed in MODEL_CONFIG. OLLAMA_MODELS is set on the
# command to the same directory that start_ollama() exports to the server.
for model in MODEL_CONFIG:
    print(f'Pulling {model}...')
    !OLLAMA_MODELS={MODELS_PATH} ollama pull {model}
# Printed unconditionally after the loop; it does not verify that each pull succeeded.
print('All models pulled.')

---
## Verify

In [None]:
# Run the sanity-test module from the repo root so `tests.sanity` resolves with -m.
os.chdir('/content/MCP')
!python -m tests.sanity

---
## Benchmarks

A/B comparison: Baseline (no tools) vs ReasonForge (with tools).

Results are checkpointed to Google Drive after each problem. If the notebook crashes, re-running will resume from where it left off.

In [None]:
# MATH-500 Benchmark
N_MATH = 50            # forwarded as --n
SKIP_BASELINE = False  # True appends --skip-baseline
THINK = True           # True appends --think

from google.colab import userdata
# Export the Hugging Face token from Colab Secrets when available. Only ordinary
# errors are caught (a bare `except:` would also swallow KeyboardInterrupt), and
# the exception is reported so a missing secret and a denied notebook access
# can be told apart.
try:
    os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')
except Exception as exc:
    print(f'HF_TOKEN not found in Secrets. ({type(exc).__name__}: {exc})')

run_benchmark_suite('tests.math_benchmark', N_MATH, SKIP_BASELINE, THINK, separator='-',seed=42)

In [None]:
# Visualize MATH-500 results
import pandas as pd, matplotlib.pyplot as plt

# NOTE(review): '*.json' matches every report in RESULTS_DIR, so if a code_*.json
# report is newer than the MATH report it will be loaded here instead — confirm
# the MATH report filename pattern and narrow the glob.
report, _ = load_latest_result('*.json')
if report:
    print_report_header(report, 'rf_accuracy', 'baseline_accuracy')
    df = pd.DataFrame(report['results'])

    # Show only the columns that exist: the baseline columns may be absent
    # (the plotting code below already guards for that), and selecting a
    # missing column would raise KeyError before any plot is drawn.
    wanted_cols = ['type','level','expected','baseline_answer','baseline_correct',
                   'rf_answer','rf_correct','rf_rounds','rf_used_tools','weight']
    display(df[[col for col in wanted_cols if col in df.columns]])

    has_baseline = 'baseline_correct' in df.columns and df['baseline_correct'].any()
    if has_baseline:
        # Side-by-side panels: Baseline vs ReasonForge accuracy per difficulty level and per category.
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))
        panels = [
            ('level', 'Accuracy by Difficulty', 0),
            ('type', 'Accuracy by Category', 30),
        ]
        for ax, (group_col, title, rot) in zip(axes, panels):
            acc = df.groupby(group_col).agg(
                Baseline=('baseline_correct','mean'),
                ReasonForge=('rf_correct','mean'),
            ).sort_index()
            acc.plot.bar(ax=ax, rot=rot, color=['#94a3b8','#3b82f6'])
            ax.set_title(title)
            ax.set_ylabel('Accuracy')
            ax.set_ylim(0, 1.05)
            ax.legend(loc='upper right')
    else:
        # No usable baseline: plot ReasonForge accuracy per difficulty level only.
        fig, ax = plt.subplots(figsize=(7, 5))
        lvl = df.groupby('level')['rf_correct'].mean().sort_index()
        lvl.plot.bar(ax=ax, rot=0, color='#3b82f6')
        ax.set_title('ReasonForge Accuracy by Difficulty')
        ax.set_ylabel('Accuracy')
        ax.set_ylim(0, 1.05)

    # Both layouts are saved to the same file and then shown.
    plt.tight_layout()
    plt.savefig(f'{RESULTS_DIR}/math_results.png', dpi=150, bbox_inches='tight')
    plt.show()

In [None]:
# HumanEval Code Benchmark
# Settings for this run: value for --n, whether to append --skip-baseline,
# and whether to append --think.
N_CODE, SKIP_BASELINE_CODE, THINK_CODE = 20, False, True

run_benchmark_suite(
    'tests.code_benchmark',
    n=N_CODE,
    skip_baseline=SKIP_BASELINE_CODE,
    think=THINK_CODE,
    separator='=',
    seed=42,
)

In [None]:
# Visualize HumanEval results
import pandas as pd, matplotlib.pyplot as plt

report, _ = load_latest_result('code_*.json')
if report:
    print_report_header(report, 'rf_pass1', 'baseline_pass1')
    df = pd.DataFrame(report['results'])
    display(df)

    # One (label, value, colour) triple per bar; the baseline bar comes first
    # and is included only when the report carries a baseline score.
    series = []
    if report.get('baseline_pass1') is not None:
        series.append(('Baseline', report['baseline_pass1'], '#94a3b8'))
    series.append(('ReasonForge', report['rf_pass1'], '#3b82f6'))
    labels, vals, colors = (list(column) for column in zip(*series))

    fig, ax = plt.subplots(figsize=(6, 4))
    bars = ax.bar(labels, vals, color=colors, width=0.5)
    # Write the pass rate as a percentage just above each bar.
    for rect, rate in zip(bars, vals):
        x_mid = rect.get_x() + rect.get_width()/2
        ax.text(x_mid, rect.get_height() + 0.02, f'{rate:.0%}', ha='center', fontweight='bold')
    ax.set_ylim(0, 1.15)
    ax.set_ylabel('Pass Rate')
    ax.set_title(f'HumanEval {report["model"]}')
    plt.tight_layout()
    plt.savefig(f'{RESULTS_DIR}/code_results.png', dpi=150, bbox_inches='tight')
    plt.show()

---

In [None]:
# Launch the chat UI from the repo root.
# NOTE(review): RF_SHARE=1 presumably tells ui.app to create a public Gradio share link — confirm in ui/app.py.
os.environ['RF_SHARE'] = '1'
os.chdir('/content/MCP')

# -u keeps Python's output unbuffered; tee echoes the UI's stdout in this cell and
# appends it to the same log file the Ollama server writes to.
!python -u -m ui.app | tee -a /content/ollama_server.log

---

In [None]:
# !OLLAMA_MODELS=$MODELS_PATH ollama ps
# print()
# !OLLAMA_MODELS=$MODELS_PATH ollama list

In [None]:
# !rm -rf /content/MCP
# !rm /content/ollama_server.log