In [None]:
!pip install -q transformers accelerate gitpython pandas sentencepiece


In [None]:
from git import Repo
from pathlib import Path
import subprocess, json, re, os, pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# ==== CONFIGURATION ====
# Model id on the Hugging Face Hub, repository under analysis, and working paths.
MODEL_ID = "Qwen/Qwen2.5-Coder-0.5B-Instruct"
REPO_URL = "https://github.com/unclecode/crawl4ai"
REPO_DIR = Path("/content/crawl4ai_repo")
OUT_DIR = Path("/content/resultados_qwen")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ==== 1. CLONE THE REPOSITORY ====
# Drop any checkout left by a previous run so the cell can be re-executed safely.
if REPO_DIR.exists():
    import shutil
    shutil.rmtree(REPO_DIR)
# Shallow clone: this notebook only reads README.md and the directory layout,
# so the commit history is not needed.
Repo.clone_from(REPO_URL, REPO_DIR, depth=1)
print("‚úÖ Reposit√≥rio clonado!")

# ==== 2. COLLECT EVIDENCE ====
def ler(p):
    """Return the contents of file ``p`` as text, or ``""`` if it cannot be read.

    The file is decoded as UTF-8 and undecodable bytes are silently dropped
    (``errors="ignore"``).

    Args:
        p: Path-like location of the file.

    Returns:
        The decoded text, or an empty string when the path is missing, unreadable,
        a directory, or not a valid path value.
    """
    try:
        return Path(p).read_text(encoding="utf-8", errors="ignore")
    # Best-effort read: swallow only I/O and bad-argument errors, so that
    # KeyboardInterrupt / SystemExit and genuine bugs still surface.
    except (OSError, ValueError, TypeError):
        return ""

# README text (empty string when the file cannot be read).
readme = ler(REPO_DIR / "README.md")

# Sorted listing of directories up to three levels deep.
# `find` runs with cwd=REPO_DIR and an argv list instead of interpolating the path
# into a `bash -lc` command string, so paths containing spaces or shell
# metacharacters cannot break (or inject into) the command.
found_dirs = subprocess.check_output(
    ["find", ".", "-maxdepth", "3", "-type", "d"], cwd=REPO_DIR
)
tree = subprocess.check_output(["sort"], input=found_dirs).decode()

# Both sources are capped at 5000 characters each to bound the prompt size.
evidencia = f"""
README (trecho):
{readme[:5000]}

--- TREE ---
{tree[:5000]}
"""
print("üìÅ Evid√™ncias coletadas.")


‚úÖ Reposit√≥rio clonado!
üìÅ Evid√™ncias coletadas.


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import json, re, ast

print("üöÄ Carregando modelo Qwen (0.5B)...")
# trust_remote_code is left at its default (False): the Qwen2 architecture is
# implemented inside transformers itself, so there is no need to allow code from
# the Hub repository to execute on this machine.
tok = AutoTokenizer.from_pretrained(MODEL_ID)
mdl = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    # NOTE(review): recent transformers releases log that `torch_dtype` is deprecated
    # in favour of `dtype`; kept as-is so older releases keep working.
    torch_dtype="auto",
)
# Text-generation pipeline wrapping the loaded model and tokenizer.
gen = pipeline("text-generation", model=mdl, tokenizer=tok)

# Chat-formatted messages: this helps Qwen obey the "JSON only" instruction.

# System turn: role description plus the mandatory JSON output schema.
system_prompt = (
    "Voc√™ √© um analista de arquitetura. Responda ESTRITAMENTE em JSON, "
    "sem explica√ß√µes, sem crases, sem texto extra. "
    "Formato obrigat√≥rio:\n"
    "{\n"
    '  "patterns": [\n'
    '    {"name": "<Padr√£o>", "confidence": 0.0, "evidence": "<frase>"}\n'
    "  ],\n"
    '  "notes": "<observa√ß√µes curtas>"\n'
    "}"
)

# User turn: the task statement followed by the collected evidence text.
user_prompt = (
    "Analise as evid√™ncias e identifique PADR√ïES ARQUITETURAIS. "
    "Use nomes can√¥nicos como: Event-Driven, Plugin/Hook, Layered/MVC, "
    "Microservices, Cloud-Native/Containerized, Pipeline/Dataflow, Client-Server/API. "
    "M√ÅXIMO 5 padr√µes.\n\n"
    "EVID√äNCIAS:\n" + evidencia
)

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt},
]

# Render the conversation to a single prompt string (not token ids), ending with
# the marker that opens the assistant's turn.
prompt_chat = tok.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# Short, deterministic generation (avoids extra "chatter").
# do_sample=False alone selects greedy decoding; `temperature` is a sampling-only
# flag that transformers reports as invalid in this mode, so it is not passed.
out = gen(
    prompt_chat,
    max_new_tokens=380,
    do_sample=False,
    return_full_text=False,  # important: do not echo the prompt back
    eos_token_id=tok.eos_token_id,
    pad_token_id=tok.eos_token_id,
)[0]["generated_text"]

print("=== GERA√á√ÉO BRUTA (primeiros 800 chars) ===")
print(out[:800])

# ---- Robust parsing ----
def extrair_bloco_json(texto):
    """Return the slice of ``texto`` from its first "{" through its last "}".

    When either brace is absent, or the last "}" does not come after the first
    "{", ``texto`` is returned unchanged.
    """
    start = texto.find("{")
    end = texto.rfind("}")
    if start == -1 or end == -1 or end <= start:
        return texto
    return texto[start:end + 1]

# Reduce the raw generation to its first-"{"-to-last-"}" span and trim surrounding
# whitespace before attempting to parse it.
raw = extrair_bloco_json(out).strip()

def tentar_parse(raw_text):
    """Parse ``raw_text`` into a Python object, trying progressively looser strategies.

    Strategies, in order:
      1. strict JSON via ``json.loads``;
      2. JSON after replacing every single quote with a double quote;
      3. a Python literal via ``ast.literal_eval`` (accepts dict reprs).

    Args:
        raw_text: Candidate text, normally the "{...}" span of the model output.

    Returns:
        The value produced by the first strategy that succeeds, or ``None`` when
        all of them fail.

    Only parse/argument errors are suppressed; anything else (for example
    KeyboardInterrupt) propagates to the caller.
    """
    # json.JSONDecodeError is a ValueError; literal_eval may also raise SyntaxError,
    # and a non-string argument surfaces as TypeError / AttributeError.
    parse_errors = (
        ValueError, TypeError, AttributeError, SyntaxError, RecursionError, MemoryError,
    )
    strategies = (
        json.loads,
        lambda text: json.loads(text.replace("'", '"')),
        ast.literal_eval,
    )
    for strategy in strategies:
        try:
            return strategy(raw_text)
        except parse_errors:
            continue
    return None

data = tentar_parse(raw)

# Minimal fallback: when the model output is not a dict carrying a list under
# "patterns", build the result from simple keyword heuristics over the evidence.
parsed_ok = isinstance(data, dict) and isinstance(data.get("patterns"), list)
if not parsed_ok:
    pats = []
    ev_txt = evidencia.lower()
    if "webhook" in ev_txt or "queue" in ev_txt or "event" in ev_txt:
        pats.append({"name":"Event-Driven","confidence":0.75,"evidence":"Men√ß√µes a webhooks/filas/eventos no README/estrutura."})
    if "hook" in ev_txt or "plugin" in ev_txt:
        pats.append({"name":"Plugin/Hook","confidence":0.72,"evidence":"Sistema de hooks/plugins citado nas evid√™ncias."})
    if "docker" in ev_txt or "compose" in ev_txt:
        pats.append({"name":"Cloud-Native/Containerized","confidence":0.70,"evidence":"Arquivos Docker/compose indicam cont√™ineres."})
    data = {"patterns": pats, "notes": "Parse do modelo falhou; heur√≠stica aplicada sobre as evid√™ncias."}

# Save the full result as JSON.
json_path = OUT_DIR / "patterns_qwen.json"
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

# CSV summary: one row per pattern. Entries that are not dicts are skipped, and
# the columns are named explicitly so the header is written even with zero rows.
csv_path = OUT_DIR / "patterns_qwen_summary.csv"
rows = [
    {
        "pattern": p.get("name", ""),
        "confidence": p.get("confidence", ""),
        "evidence": p.get("evidence", ""),
    }
    for p in data["patterns"]
    if isinstance(p, dict)
]
pd.DataFrame(rows, columns=["pattern", "confidence", "evidence"]).to_csv(csv_path, index=False)

print("‚úÖ Salvos:")
print(" -", json_path)
print(" -", csv_path)


üöÄ Carregando modelo Qwen (0.5B)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

Device set to use cpu
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


=== GERA√á√ÉO BRUTA (primeiros 800 chars) ===
{
  "patterns": [
    {
      "name": "Event-Driven",
      "confidence": 0.9,
      "evidence": "Crawl4AI is designed to be event-driven, allowing it to process web content in real-time."
    },
    {
      "name": "Plugin/Hook",
      "confidence": 0.8,
      "evidence": "Crawl4AI supports plugins and hooks, enabling developers to extend its functionality."
    },
    {
      "name": "Layered/MVC",
      "confidence": 0.7,
      "evidence": "Crawl4AI uses a layered MVC architecture, allowing it to manage different parts of the application."
    },
    {
      "name": "Microservices",
      "confidence": 0.6,
      "evidence": "Crawl4AI is built on microservices architecture, enabling it to handle complex applications."
    },
    {
      "name": "Cloud-Native/Containerized",
      "confi
‚úÖ Salvos:
 - /content/resultados_qwen/patterns_qwen.json
 - /content/resultados_qwen/patterns_qwen_summary.csv


In [None]:
from pathlib import Path
# Preview the saved artifacts. They are decoded explicitly as UTF-8 (the JSON file
# is written with encoding="utf-8") rather than with the platform default.
print(Path("/content/resultados_qwen/patterns_qwen.json").read_text(encoding="utf-8")[:1200])
print(Path("/content/resultados_qwen/patterns_qwen_summary.csv").read_text(encoding="utf-8")[:400])

{
  "patterns": [
    {
      "name": "Event-Driven",
      "confidence": 0.9,
      "evidence": "Crawl4AI is designed to be event-driven, allowing it to process web content in real-time."
    },
    {
      "name": "Plugin/Hook",
      "confidence": 0.8,
      "evidence": "Crawl4AI supports plugins and hooks, enabling developers to extend its functionality."
    },
    {
      "name": "Layered/MVC",
      "confidence": 0.7,
      "evidence": "Crawl4AI uses a layered MVC architecture, allowing it to manage different parts of the application."
    },
    {
      "name": "Microservices",
      "confidence": 0.6,
      "evidence": "Crawl4AI is built on microservices architecture, enabling it to handle complex applications."
    },
    {
      "name": "Cloud-Native/Containerized",
      "confidence": 0.5,
      "evidence": "Crawl4AI is designed to run on cloud-native environments, such as Kubernetes and Docker."
    },
    {
      "name": "Pipeline/Dataflow",
      "confidence": 0.4,
     

In [None]:
from google.colab import files
# Hand each result file to Colab's download helper, JSON first and then the CSV.
for result_path in (
    "/content/resultados_qwen/patterns_qwen.json",
    "/content/resultados_qwen/patterns_qwen_summary.csv",
):
    files.download(result_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from pathlib import Path
import json, re

# Paths of the saved result files
base = Path("/content/resultados_qwen")
json_file = base / "patterns_qwen.json"
csv_file  = base / "patterns_qwen_summary.csv"

print("üìÅ Verificando arquivos do resultado...\n")

# 1) Existence and size. Every later step reads these files, so the rest of the
#    report only runs when both are present.
if not json_file.exists() or not csv_file.exists():
    print("‚ùå Arquivos n√£o encontrados. Execute novamente a c√©lula do modelo.")
else:
    print(f"‚úÖ Arquivos encontrados:\n - {json_file}\n - {csv_file}")
    print(f"üì¶ Tamanho JSON: {json_file.stat().st_size} bytes")
    print(f"üì¶ Tamanho CSV:  {csv_file.stat().st_size} bytes")

    # 2) Try to load the JSON; the top-level value must be an object.
    try:
        data = json.loads(json_file.read_text(encoding="utf-8"))
        if not isinstance(data, dict):
            raise ValueError("top-level JSON value is not an object")
    except (OSError, ValueError) as e:
        data = None
        print("‚ö†Ô∏è JSON inv√°lido ou corrompido:", e)

    if data is not None:
        # Tolerate a missing/odd "patterns" value and entries lacking keys, so a
        # sloppy model answer is reported rather than mislabelled as invalid JSON.
        pats = [p for p in (data.get("patterns") or []) if isinstance(p, dict)]
        print(f"\nüìä Padr√µes detectados: {len(pats)}")
        for p in pats:
            print(f" - {p.get('name', '?')} (confian√ßa: {p.get('confidence', '?')})")
        notes = data.get("notes", "")
        print("\nüß† Observa√ß√µes:", notes)

        # 3) Detect the heuristic fallback. Only the "notes" field is inspected,
        #    so the word "fallback" inside a pattern's evidence text cannot
        #    trigger a false positive.
        notes_lower = str(notes).lower()
        if "heur√≠stica aplicada" in notes_lower or "fallback" in notes_lower:
            print("\n‚ö†Ô∏è Aviso: Resultado veio do PARSE HEUR√çSTICO (modelo n√£o retornou JSON completo).")
        else:
            print("\n‚úÖ Resultado gerado diretamente pelo modelo (sem fallback).")

    # 4) Quick look at the CSV
    print("\nüìÑ Pr√©via do arquivo CSV:")
    print(csv_file.read_text(encoding="utf-8")[:300])


üìÅ Verificando arquivos do resultado...

‚úÖ Arquivos encontrados:
 - /content/resultados_qwen/patterns_qwen.json
 - /content/resultados_qwen/patterns_qwen_summary.csv
üì¶ Tamanho JSON: 1476 bytes
üì¶ Tamanho CSV:  823 bytes

üìä Padr√µes detectados: 7
 - Event-Driven (confian√ßa: 0.9)
 - Plugin/Hook (confian√ßa: 0.8)
 - Layered/MVC (confian√ßa: 0.7)
 - Microservices (confian√ßa: 0.6)
 - Cloud-Native/Containerized (confian√ßa: 0.5)
 - Pipeline/Dataflow (confian√ßa: 0.4)
 - Client-Server/API (confian√ßa: 0.3)

üß† Observa√ß√µes: Crawl4AI is a versatile and powerful web crawler and scraper that can be used for a wide range of tasks, including web scraping, data extraction, and automation.

‚úÖ Resultado gerado diretamente pelo modelo (sem fallback).

üìÑ Pr√©via do arquivo CSV:
pattern,confidence,evidence
Event-Driven,0.9,"Crawl4AI is designed to be event-driven, allowing it to process web content in real-time."
Plugin/Hook,0.8,"Crawl4AI supports plugins and hooks, enabling develo