# ReasonForge

Deterministic math & code tools for small language models.

1. Clone repo & install deps
2. Install Ollama & pull model
3. Sanity tests
4. MATH-500 / HumanEval benchmarks
5. Gradio chat UI

---
## Setup

In [None]:
# Clone repo & install deps
# The clone's stderr is discarded and ANY non-zero exit prints 'Already cloned',
# so a genuine clone failure (e.g. no network) is reported the same way as an
# existing checkout at /content/MCP.
!git clone https://github.com/RoyCoding8/MCP.git /content/MCP 2>/dev/null || echo 'Already cloned'
# -q keeps pip quiet; every requirement is given as a minimum version.
!pip install -q "requests>=2.31.0" "gradio>=6.0" "sympy>=1.13.0" "datasets>=4.6.1" "math-verify[antlr4_13_2]>=0.9.0"

In [None]:
# Install Ollama
# Install zstd first, with all apt output discarded.
# NOTE(review): presumably a prerequisite of the Ollama installer — confirm.
!sudo apt-get install -qq zstd > /dev/null 2>&1
# Download the install script from ollama.com and pipe it to sh. The redirect
# applies to sh only, so installer output is hidden while curl's own errors
# (-S) still reach the cell output.
!curl -fsSL https://ollama.com/install.sh | sh > /dev/null 2>&1

In [None]:
import subprocess, time, os
import requests

# Start a local Ollama server for this session and wait (up to 30 s) until it
# answers HTTP requests on port 11434.
MODELS_PATH = '/content/ollama_models'
os.makedirs(MODELS_PATH, exist_ok=True)
print(f'Models path: {MODELS_PATH}')

# Stop any server left over from an earlier run of this cell and give it a
# moment to exit before starting a new one.
subprocess.run(['pkill', '-f', 'ollama'], capture_output=True)
time.sleep(2)

# Server configuration is passed through the child's environment only; the
# notebook's own os.environ is left untouched.
env = os.environ.copy()
env['OLLAMA_MODELS'] = MODELS_PATH
env['OLLAMA_HOST'] = '0.0.0.0:11434'
env['OLLAMA_NUM_PARALLEL'] = '2'

print("Starting Ollama server...")
# The child process receives its own copy of the log file descriptor, so the
# parent's handle can be closed as soon as Popen returns instead of leaking.
with open('/content/ollama_server.log', 'w') as log_file:
    proc = subprocess.Popen(
        ['ollama', 'serve'],
        env=env,
        stdout=log_file,
        stderr=subprocess.STDOUT,
    )

# Poll the HTTP endpoint until it responds, the server process dies, or the
# 30 s budget runs out. A monotonic clock keeps the budget immune to wall-clock
# adjustments.
start = time.monotonic()
ready = False
while time.monotonic() - start < 30:
    if proc.poll() is not None:
        # `ollama serve` already exited; further polling cannot succeed.
        break
    try:
        # The timeout prevents a single stuck connection from blocking the
        # loop beyond the overall budget.
        requests.get('http://localhost:11434/', timeout=5)
    except (requests.ConnectionError, requests.Timeout):
        time.sleep(1)
    else:
        ready = True
        print(f'Ollama ready ({time.monotonic()-start:.2f}s)')
        break

if not ready:
    print('Ollama failed to start — check /content/ollama_server.log')

In [None]:
# Pull Models
MODELS = ['qwen3:8b']

# MODEL stays bound to the last entry of MODELS after the loop finishes; the
# benchmark cells further down read it for their --model argument.
for MODEL in MODELS:
    print(f'Pulling {MODEL}...')
    # {MODELS_PATH} and {MODEL} are filled in from the Python variables before
    # the shell command runs; OLLAMA_MODELS is set for this one command only.
    !OLLAMA_MODELS={MODELS_PATH} ollama pull {MODEL}

---
## Verify

In [None]:
# Run the repository's sanity test module from the checkout root so that
# `tests.sanity` resolves as a module path.
os.chdir('/content/MCP')
!python -m tests.sanity

---
## Benchmarks

A/B comparison: Baseline (no tools) vs ReasonForge (with tools).

In [None]:
# MATH-500 Benchmark
N_MATH = 50
SKIP_BASELINE = False
THINK = False

from google.colab import userdata
try: os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')
except: print("HF_TOKEN not found in Secrets.")

os.chdir('/content/MCP')
cmd = f'python -m tests.benchmark --model {MODEL} --n {N_MATH}'
if SKIP_BASELINE: cmd += ' --skip-baseline'
if THINK: cmd += ' --think'
print(f'Running: {cmd}\n')
!{cmd}

In [None]:
# HumanEval Code Benchmark
N_CODE = 20
SKIP_BASELINE_CODE = False
THINK_CODE = True

os.chdir('/content/MCP')
cmd = f'python -m tests.code_benchmark --model {MODEL} --n {N_CODE}'
if SKIP_BASELINE_CODE: cmd += ' --skip-baseline'
if THINK_CODE: cmd += ' --think'
print(f'Running: {cmd}\n')
!{cmd}

---

In [None]:
# Launch the chat UI module (ui.app) from the checkout root.
# RF_SHARE is set in the notebook's environment so the child process inherits
# it. NOTE(review): presumably ui.app reads it to enable a public Gradio share
# link — confirm against ui/app.py.
os.environ['RF_SHARE'] = '1'
os.chdir('/content/MCP')

# -u runs Python unbuffered so output appears as it is produced; tee shows it
# in the cell and appends (-a) it to the same file the Ollama server logs to.
!python -u -m ui.app | tee -a /content/ollama_server.log

---

In [None]:
# !OLLAMA_MODELS=$MODELS_PATH ollama ps
# print()
# !OLLAMA_MODELS=$MODELS_PATH ollama list

In [None]:
# !rm -rf /content/MCP
# !rm /content/ollama_server.log