# ReasonForge

Deterministic math & code tools for small language models.

1. Clone repo & install deps
2. Install Ollama & pull model
3. Sanity tests
4. MATH-500 / HumanEval benchmarks
5. Gradio chat UI

---
## Setup

In [None]:
# Clone repo & install deps
!git clone https://github.com/RoyCoding8/MCP.git /content/MCP
!pip install -q gradio>=6.0 sympy datasets math-verify[antlr4_13_2]

In [None]:
# Install Ollama
# zstd is installed first; all apt-get output is discarded to keep the cell quiet.
!sudo apt-get install -qq zstd > /dev/null 2>&1
# Download the install script and pipe it to sh. Only sh's output is
# redirected to /dev/null; curl runs with -fsSL.
# NOTE(review): because the installer's stdout/stderr are suppressed, a failed
# install will not be visible here — confirm with `ollama --version` if needed.
!curl -fsSL https://ollama.com/install.sh | sh > /dev/null 2>&1

In [3]:
# Directory on the Colab VM where model files are kept.
# Later cells read MODELS_PATH when starting the server and pulling models.
import os

MODELS_PATH = '/content/ollama_models'

# Create the directory (and any missing parents); a no-op when it already exists.
os.makedirs(name=MODELS_PATH, exist_ok=True)

print(f'Models path: {MODELS_PATH}')

Models path: /content/ollama_models


In [None]:
# Start a fresh Ollama server in the background and wait (up to 30 s) until
# its HTTP endpoint answers. Requires MODELS_PATH from the earlier cell.
import subprocess, time, os
import requests

# Stop any process whose command line matches "ollama" so re-running this
# cell does not start a second server; give it a moment to exit.
subprocess.run(['pkill', '-f', 'ollama'], capture_output=True)
time.sleep(2)

env = os.environ.copy()
env['OLLAMA_MODELS'] = MODELS_PATH
env['OLLAMA_HOST'] = '0.0.0.0:11434'

print("Starting Ollama server...")
# Start from an empty log, then hand the server an append-mode handle.
# Later cells append to this same file; if the server wrote through a plain
# 'w' handle it would keep its own file offset and could overwrite their lines.
open('/content/ollama_server.log', 'w').close()
with open('/content/ollama_server.log', 'a') as log_file:
    proc = subprocess.Popen(
        ['ollama', 'serve'],
        env=env,
        stdout=log_file,
        stderr=subprocess.STDOUT,
    )
# The child process keeps its own copy of the descriptor, so the parent's
# handle can be closed as soon as Popen returns.

start = time.time()
while time.time()-start < 30:
    # If the server process has already exited there is nothing to wait for.
    if proc.poll() is not None:
        print(f'Ollama exited early (code {proc.returncode}) — check /content/ollama_server.log')
        break
    try:
        # A timeout keeps a half-open socket from hanging the readiness probe.
        requests.get('http://localhost:11434/', timeout=2)
        print(f'Ollama ready ({time.time()-start:.2f}s)')
        break
    except requests.RequestException:
        # Not accepting connections yet (or the probe timed out); retry shortly.
        time.sleep(1)
else:
    # Loop ran out of time without a successful probe.
    print('Ollama failed to start — check /content/ollama_server.log')

In [35]:
# Pull Models
# Every entry in MODELS is fetched with `ollama pull`, with OLLAMA_MODELS set
# to MODELS_PATH (defined in an earlier cell) for that command.
MODELS = ['qwen3:8b']

# NOTE: the loop variable MODEL remains defined after the loop. The benchmark
# cells further down pass MODEL to `--model`, so they use the last entry of
# MODELS; with an empty MODELS list, MODEL would be undefined there.
for MODEL in MODELS:
    print(f'Pulling {MODEL}...')
    !OLLAMA_MODELS={MODELS_PATH} ollama pull {MODEL}

Pulling qwen3:8b...
[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[A[A[A[A[A[A[A[1G[?25h[?2026l


---
## Verify

In [None]:
# Run the repository's `tests.sanity` module from the checkout root.
# Relies on `os` having been imported by an earlier cell; the chdir is needed
# so that `python -m tests.sanity` resolves the `tests` package.
os.chdir('/content/MCP')
!python -m tests.sanity

---
## Benchmarks

A/B comparison: Baseline (no tools) vs ReasonForge (with tools).

In [None]:
# MATH-500 Benchmark
# Tunables for this run: N_MATH is passed as `--n`; SKIP_BASELINE and THINK
# add the `--skip-baseline` / `--think` flags when True.
N_MATH = 1
SKIP_BASELINE = False
THINK = False

import subprocess
import os
from google.colab import userdata

# The Hugging Face token is optional. Catch Exception (not a bare `except:`)
# so KeyboardInterrupt / SystemExit still propagate.
try:
    os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')
except Exception:
    print("HF_TOKEN not found in Secrets. Proceeding without authentication.")

# Construct command list. MODEL comes from the "Pull Models" cell.
cmd = ['python', '-u', '-m', 'tests.benchmark', '--model', MODEL, '--n', str(N_MATH)]
if SKIP_BASELINE:
    cmd.append('--skip-baseline')
if THINK:
    cmd.append('--think')

print(f"Running: {' '.join(cmd)}\n")

with open('/content/ollama_server.log', 'a') as log:
    process = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # merge stderr so errors reach the notebook and log
        text=True,
        bufsize=1,                 # line-buffered so output streams as it is produced
        cwd='/content/MCP'
    )
    for line in process.stdout:
        # Show the benchmark output in the notebook (as the HumanEval cell
        # does) in addition to appending it to the shared log file.
        print(line, end='')
        log.write(line)
        log.flush()
    returncode = process.wait()

# Surface a failed run instead of ending silently.
if returncode != 0:
    print(f"\nBenchmark exited with code {returncode}")

In [None]:
# HumanEval Code Benchmark
# Tunables for this run: N_CODE is passed as `--n`; SKIP_BASELINE_CODE and
# THINK_CODE add the `--skip-baseline` / `--think` flags when True.
N_CODE = 1
SKIP_BASELINE_CODE = False
THINK_CODE = True

# Imported here so the cell does not depend on the MATH-500 cell having run.
import subprocess

# MODEL comes from the "Pull Models" cell.
cmd = ['python', '-u', '-m', 'tests.code_benchmark', '--model', MODEL, '--n', str(N_CODE)]
if SKIP_BASELINE_CODE:
    cmd.append('--skip-baseline')
if THINK_CODE:
    cmd.append('--think')

print(f"Running: {' '.join(cmd)}\n")

with open('/content/ollama_server.log', 'a') as log:
    process = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # merge stderr so errors reach the notebook and log
        text=True,
        bufsize=1,                 # line-buffered so output streams as it is produced
        cwd='/content/MCP'
    )
    # Echo each line to the notebook and append it to the shared log file.
    for line in process.stdout:
        print(line, end='')
        log.write(line)
        log.flush()
    returncode = process.wait()

# Surface a failed run instead of ending silently.
if returncode != 0:
    print(f"\nBenchmark exited with code {returncode}")

---
## Chat UI

In [None]:
# Launch the `ui.app` module from the checkout root. Its stdout is shown in
# the cell and also appended to /content/ollama_server.log via `tee -a`.
# RF_SHARE is exported to the child process through the environment.
# NOTE(review): presumably RF_SHARE=1 enables a public share link for the
# UI — confirm against ui/app in the repository.
os.environ['RF_SHARE'] = '1'
os.chdir('/content/MCP')

!python -u -m ui.app | tee -a /content/ollama_server.log

---
## Diagnostics & cleanup

In [None]:
# Optional diagnostics (disabled): uncomment to run `ollama ps` and
# `ollama list` with OLLAMA_MODELS pointing at MODELS_PATH.
# !OLLAMA_MODELS=$MODELS_PATH ollama ps
# print()
# !OLLAMA_MODELS=$MODELS_PATH ollama list

In [None]:
# Optional cleanup (disabled): uncomment to delete the cloned repository and
# the shared log file. Both deletions are irreversible.
# !rm -rf /content/MCP
# !rm /content/ollama_server.log