<a href="https://colab.research.google.com/github/Sounakray2003/Asmadiya-tech/blob/main/CodeValidator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==============================
#  LLM + SEMGREP HYBRID AUDITOR
#  CATEGORISED SEVERITY: Critical / High / Medium / Low
#  ZERO HARDCODING | LOCAL ONLY
# ==============================

!pip install -q semgrep transformers torch accelerate bitsandbytes jinja2 > /dev/null 2>&1

import os, json, re, shutil, subprocess, html
from pathlib import Path
from datetime import datetime
from jinja2 import Environment
from IPython.display import HTML, display
from google.colab import files
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# ------------------------------------------------------------------
# 1. Load LLM (Mistral‑7B, local, no API key)
# ------------------------------------------------------------------
# Imported next to its only use; the class ships with the `transformers`
# package that the import block above already relies on.
from transformers import BitsAndBytesConfig

# Single source of truth for the checkpoint used by tokenizer and model.
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"

print("Loading Mistral‑7B‑Instruct (local)…")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype="auto",
    # 4-bit quantisation is requested through a config object; passing
    # `load_in_4bit=True` straight to `from_pretrained` is deprecated.
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    trust_remote_code=False,
)

# Greedy decoding (`do_sample=False`) never consults `temperature`, so it is
# not passed: supplying 0.0 only produced a generation-config warning.
llm = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=600,
    do_sample=False,
)

# ------------------------------------------------------------------
# 2. Upload code
# ------------------------------------------------------------------
import zipfile  # only this section needs it, to unpack .zip uploads

print("\nUpload .py file or .zip:")
uploaded = files.upload()

# Start from an empty directory so leftovers of an earlier run are not scanned.
extract_dir = "/content/uploaded_code"
if os.path.exists(extract_dir):
    shutil.rmtree(extract_dir)
os.makedirs(extract_dir, exist_ok=True)

for name, data in uploaded.items():
    dest = os.path.join(extract_dir, name)
    with open(dest, "wb") as f:
        f.write(data)
    if name.lower().endswith(".zip"):
        # Unpack into the working directory, then drop the archive itself.
        with zipfile.ZipFile(dest, "r") as z:
            z.extractall(extract_dir)
        os.remove(dest)

# rglob() gives no ordering guarantee; sorting makes the choice of the
# audited file reproducible from run to run.
py_files = sorted(Path(extract_dir).rglob("*.py"))
if not py_files:
    raise FileNotFoundError("No .py file found.")
file_path = py_files[0]
if len(py_files) > 1:
    # Only one file is audited; say so instead of silently ignoring the rest.
    print(f"Note: {len(py_files)} .py files found; auditing only {file_path.name}")
rel_path  = str(file_path.relative_to(Path(extract_dir).parent))
code      = file_path.read_text(encoding="utf-8", errors="replace")

print(f"\nScanning: {rel_path}")
print("="*70)

# ------------------------------------------------------------------
# 3. Run Semgrep (Zero hardcoding)
# ------------------------------------------------------------------
# Semgrep's legacy rule levels mapped onto the report's categories; values
# that are already CRITICAL/HIGH/MEDIUM/LOW pass through unchanged.
SEMGREP_SEVERITY = {"ERROR": "HIGH", "WARNING": "MEDIUM", "INFO": "LOW"}


def _first_tag(value, default="N/A"):
    """Return the identifier before the first ':' of a metadata tag.

    Rule metadata may hold a list of tags or a single string; a missing or
    empty value yields *default*.
    """
    if isinstance(value, (list, tuple)):
        value = value[0] if value else default
    if not value:
        return default
    return str(value).split(":")[0].strip()


def _semgrep_failure(message):
    """Build the pseudo-finding that reports a failed Semgrep run."""
    return {"line": "N/A", "cwe": "N/A", "owasp": "N/A", "severity": "ERROR",
            "message": message, "rule": "N/A", "source": "Semgrep"}


print("Running Semgrep (1000+ rules)…")
semgrep_cmd = ["semgrep", "scan", "--config=p/r2c-security-audit", "--json", str(file_path.parent)]
proc = subprocess.run(semgrep_cmd, capture_output=True, text=True)

semgrep_issues = []
if proc.returncode == 0:
    try:
        data = json.loads(proc.stdout)
    except json.JSONDecodeError as exc:
        # Surface the failure in the report instead of claiming a clean scan.
        semgrep_issues.append(_semgrep_failure(f"Semgrep error: unparseable JSON output ({exc})"))
    else:
        # The whole directory is scanned; keep only results for the audited
        # file. Comparing resolved paths avoids the false matches a substring
        # test on the file name produces (e.g. "a.py" inside "data.py").
        audited = file_path.resolve()
        results = data.get("results", []) if isinstance(data, dict) else []
        for r in results:
            # Guard each record on its own so one malformed result cannot
            # discard every other finding.
            try:
                if Path(r["path"]).resolve() != audited:
                    continue
                extra = r["extra"]
                meta = extra.get("metadata", {})
                raw_severity = extra["severity"].upper()
                semgrep_issues.append({
                    "line": r["start"]["line"],
                    "cwe": _first_tag(meta.get("cwe")),
                    "owasp": _first_tag(meta.get("owasp")),
                    "severity": SEMGREP_SEVERITY.get(raw_severity, raw_severity),
                    "message": extra["message"],
                    "rule": r["check_id"],
                    "source": "Semgrep"
                })
            except (KeyError, TypeError, AttributeError) as exc:
                print(f"Skipping malformed Semgrep result: {exc!r}")
else:
    semgrep_issues.append(_semgrep_failure(f"Semgrep error: {proc.stderr.strip()}"))

# ------------------------------------------------------------------
# 4. Elevate to CRITICAL for the most dangerous patterns
# ------------------------------------------------------------------
# CWE identifiers whose findings are always reported as CRITICAL
# (author's note: Cmd/SQL/Deser/Eval).
CRITICAL_CWE = {"CWE-78", "CWE-89", "CWE-502", "CWE-94", "CWE-95"}
# OWASP category codes treated the same way (author's note: Injection/SSRF).
CRITICAL_OWASP = {"A03", "A08", "A10"}

for finding in semgrep_issues:
    # The OWASP tag is consulted only when the CWE tag did not already match.
    must_escalate = finding["cwe"] in CRITICAL_CWE
    if not must_escalate:
        must_escalate = finding["owasp"] in CRITICAL_OWASP
    if must_escalate:
        finding["severity"] = "CRITICAL"

# ------------------------------------------------------------------
# 5. Run LLM – explain each issue
# ------------------------------------------------------------------
print("Running LLM for plain‑English explanations…")

# Only the first 15 findings are sent, each reduced to its line and message.
findings_for_llm = [
    {"line": finding["line"], "msg": finding["message"]}
    for finding in semgrep_issues[:15]
]
findings_json = json.dumps(findings_for_llm, indent=2)

# Doubled braces are literal braces in the rendered prompt; single braces
# interpolate the findings JSON and the full source of the audited file.
LLM_PROMPT = f"""You are a senior security engineer.
For every Semgrep finding below, give a **short, clear** explanation of why it is dangerous and how to fix it.

Return ONLY a JSON array of objects with:
{{
  "line": <int>,
  "explanation": "<plain English>"
}}

Findings to explain:
{findings_json}

Code (for context):
{code}

Return ONLY the JSON array.
"""

def explain_with_llm() -> list:
    """Ask the LLM to explain the Semgrep findings and parse its JSON answer.

    Sends the module-level ``LLM_PROMPT`` to the module-level ``llm``
    pipeline and extracts the JSON array from the completion.

    Returns:
        The dict entries of the JSON array produced by the model (expected
        keys: ``line`` and ``explanation``). An empty list is returned when
        generation or parsing fails, so the report can still be rendered
        without explanations.
    """
    try:
        # return_full_text=False: by default the pipeline echoes the prompt,
        # and the prompt's own brackets (findings JSON, audited code) would be
        # captured by the greedy array regex below and break JSON parsing.
        out = llm(LLM_PROMPT, return_full_text=False)[0]["generated_text"]
    except Exception as exc:  # best-effort step: report the cause and carry on
        print(f"LLM explanation step failed: {exc!r}")
        return []

    # Greedy on purpose: take everything from the first '[' to the last ']'.
    m = re.search(r"\[.*\]", out, re.DOTALL)
    if not m:
        print("LLM explanation step: no JSON array found in model output")
        return []
    try:
        parsed = json.loads(m.group(0))
    except json.JSONDecodeError as exc:
        print(f"LLM explanation step: model output is not valid JSON ({exc})")
        return []
    # Drop anything that is not an object so callers can index entries safely.
    return [entry for entry in parsed if isinstance(entry, dict)]

llm_explanations = explain_with_llm()

# Merge explanations
# The LLM output is untrusted: entries may be malformed, and the line number
# may come back as a string. Lines are therefore compared as text, and each
# explanation goes to the first finding on that line that has none yet, so
# several findings on the same line can each receive their own explanation.
for exp in llm_explanations:
    if not isinstance(exp, dict) or "line" not in exp or "explanation" not in exp:
        continue
    wanted_line = str(exp["line"]).strip()
    for issue in semgrep_issues:
        if str(issue["line"]) == wanted_line and "explanation" not in issue:
            issue["explanation"] = exp["explanation"]
            break

# ------------------------------------------------------------------
# 6. HTML Report – colour‑coded severity rows
# ------------------------------------------------------------------
from datetime import timedelta, timezone  # for the explicit IST offset below

# The label says IST, so take the time in IST (UTC+05:30) instead of
# stamping the host's local clock with an "IST" suffix.
IST = timezone(timedelta(hours=5, minutes=30), "IST")
timestamp    = datetime.now(IST).strftime("%Y-%m-%d %H:%M:%S IST")

# One pass over the findings; severities outside the four categories
# (e.g. the "ERROR" entry of a failed Semgrep run) are not counted.
counts = {"CRITICAL": 0, "HIGH": 0, "MEDIUM": 0, "LOW": 0}
for issue in semgrep_issues:
    if issue["severity"] in counts:
        counts[issue["severity"]] += 1
total_issues = len(semgrep_issues)

HTML_REPORT = """
<!DOCTYPE html>
<html><head><meta charset="UTF-8"><title>LLM + Semgrep Hybrid Audit</title>
<style>
  body {font-family:Arial;margin:2rem;background:#f9f9fb;}
  .summary {background:#d5f4e6;padding:1rem;border-radius:8px;margin-bottom:2rem;}
  .safe {padding:1rem;margin:1rem 0;border-left:5px solid #28a745;background:#d4edda;border-radius:4px;}
  pre {background:#eee;padding:8px;border-radius:4px;overflow:auto;font-size:0.9em;}
  table {width:100%;border-collapse:collapse;margin-top:1rem;}
  th,td {border:1px solid #ddd;padding:8px;text-align:left;}
  .critical {background:#c0392b;color:white;}
  .high     {background:#e74c3c;color:white;}
  .medium   {background:#e67e22;color:white;}
  .low      {background:#f39c12;color:white;}
  .error    {background:#7f8c8d;color:white;}
</style></head><body>

<div class="summary">
  <h1>LLM + Semgrep Hybrid Security Audit</h1>
  <p><strong>Semgrep:</strong> 1000+ OWASP/CWE rules (zero hardcoding)</p>
  <p><strong>LLM:</strong> Mistral‑7B (plain‑English explanations)</p>
  <p><strong>File:</strong> {{ rel_path }}</p>
  <p><strong>Time:</strong> {{ timestamp }}</p>
  <p><strong>Severity breakdown:</strong>
     <span class="critical">Critical: {{ counts.CRITICAL }}</span> |
     <span class="high">High: {{ counts.HIGH }}</span> |
     <span class="medium">Medium: {{ counts.MEDIUM }}</span> |
     <span class="low">Low: {{ counts.LOW }}</span>
  </p>
</div>

<div class="safe">
  <strong>NO API KEY</strong> | <strong>NO REGEX</strong> | <strong>CATEGORISED SEVERITY</strong>
</div>

<h2>Raw Code</h2>
<pre>{{ source_code }}</pre>

{% if semgrep_issues %}
<h2>Security Findings (Semgrep + LLM Explanation)</h2>
<table>
  <tr><th>Severity</th><th>CWE</th><th>OWASP</th><th>Line</th><th>Rule</th><th>Message</th><th>Explanation</th></tr>
{% for i in semgrep_issues %}
  <tr class="{{ i.severity.lower() }}">
    <td>{{ i.severity }}</td>
    <td>{{ i.cwe }}</td>
    <td>{{ i.owasp }}</td>
    <td>{{ i.line }}</td>
    <td>{{ i.rule }}</td>
    <td>{{ i.message }}</td>
    <td>{{ i.get("explanation", "—") }}</td>
  </tr>
{% endfor %}
</table>
{% else %}
<div class="safe"><strong>No issues detected – code is clean!</strong></div>
{% endif %}

</body></html>
"""

# autoescape=True: the uploaded file name, Semgrep messages and LLM text are
# all untrusted and must never reach the page as raw HTML. The source code is
# therefore passed unescaped and escaped once by the template engine.
env = Environment(autoescape=True)
template = env.from_string(HTML_REPORT)
report_html = template.render(
    rel_path      = rel_path,
    timestamp     = timestamp,
    counts        = counts,
    total_issues  = total_issues,
    source_code   = code,
    semgrep_issues= semgrep_issues
)

# Closing banner, then render the finished report inline in the notebook.
banner_rule = "=" * 70
print(f"\n{banner_rule}")
print("HYBRID AUDIT COMPLETE – SEVERITY CATEGORISED")
print(banner_rule)
display(HTML(report_html))