In [1]:
!pip install newspaper3k
!pip install lxml_html_clean
!pip install gradio



In [2]:
from google.colab import drive

# Mount Google Drive at /content/drive so files stored there are reachable from this runtime.
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/LLM

/content/drive/.shortcut-targets-by-id/1cLAfJ7A0BfjUVEeXnxfsTHDYfqxMVwrk/LLM


In [3]:
import torch
import gradio as gr
from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoTokenizer
from newspaper import Article

# Model configuration. Every "model_dir" is an absolute path under MODEL_ROOT,
# so loading a model does not depend on the notebook's current working directory.
MODEL_ROOT = "/content/drive/MyDrive/LLM"

MODEL_CONFIGS = {
    "BART Large + LoRA 20K": {
        "model_dir": f"{MODEL_ROOT}/lora_fb_bart_large_adapter_20K",
        "tokenizer": "facebook/bart-large",
        "type": "seq2seq"
    },
    "Qwen + LoRA": {
        "model_dir": f"{MODEL_ROOT}/qwen_3_4b_ins",
        "tokenizer": "Qwen/Qwen3-4B-Instruct-2507",
        "type": "causal"
    },
}

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Cache of already-loaded models: display name -> (model, tokenizer, type).
_loaded_models = {}

def preload_all_models():
    """Load every model in MODEL_CONFIGS once and cache it in _loaded_models.

    Each entry is loaded with the auto-class matching its "type"
    ("seq2seq" -> AutoModelForSeq2SeqLM, anything else -> AutoModelForCausalLM),
    moved to ``device``, switched to eval mode and stored as
    ``(model, tokenizer, type)`` under its display name. Names that are already
    cached are skipped, so calling this function again does not reload them.

    A failure while loading one model is printed and does not stop the
    remaining models from loading; the failed name is left out of the cache.
    """
    # Half precision is only used on the GPU. On a CPU-only runtime fall back to
    # float32, because fp16 inference on CPU is slow and not supported by every op.
    load_dtype = torch.float16 if device.type == "cuda" else torch.float32

    for name, cfg in MODEL_CONFIGS.items():
        if name in _loaded_models:
            continue

        print(f"--- Đang nạp sẵn {name} vào GPU... ---")
        model_cls = AutoModelForSeq2SeqLM if cfg["type"] == "seq2seq" else AutoModelForCausalLM
        try:
            # NOTE: recent transformers releases warn that `torch_dtype` is deprecated
            # in favour of `dtype`; it is kept here so older releases keep working.
            model = model_cls.from_pretrained(
                cfg["model_dir"],
                torch_dtype=load_dtype,
                low_cpu_mem_usage=True,
            ).to(device)

            tokenizer = AutoTokenizer.from_pretrained(cfg["tokenizer"])
            model.eval()
            _loaded_models[name] = (model, tokenizer, cfg["type"])
        except Exception as e:
            # Best-effort: report the failure and carry on with the other models.
            print(f"Lỗi khi nạp {name}: {e}")

# Preload every configured model as soon as this cell runs.
preload_all_models()

def summarize_text(text, model_name, max_output_length=200, num_beams=4):
    """Summarize ``text`` with one of the preloaded models.

    Args:
        text: The text to summarize.
        model_name: Key into the preloaded-model cache (``_loaded_models``).
        max_output_length: Output budget in tokens. Passed as ``max_new_tokens``
            for causal models and as ``max_length`` for seq2seq models.
        num_beams: Beam width passed to ``generate``.

    Returns:
        The decoded summary, or a fixed message when ``model_name`` is not in
        the cache.
    """
    # Read straight from the cache: nothing is loaded or released here.
    if model_name not in _loaded_models:
        return "Model chưa được nạp thành công."

    model, tokenizer, model_type = _loaded_models[model_name]
    is_causal = model_type == "causal"

    # Values coming from UI sliders may be floats; generate() expects integers.
    max_output_length = int(max_output_length)
    num_beams = int(num_beams)

    if is_causal:
        prompt = f"<|im_start|>system\nBạn là chuyên gia tóm tắt văn bản.<|im_end|>\n<|im_start|>user\nHãy tóm tắt văn bản sau: {text}<|im_end|>\n<|im_start|>assistant\n"
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        # Remember the prompt length so it can be sliced off the generated sequence.
        input_length = inputs.input_ids.shape[1]
    else:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024).to(device)

    # Pass only the options that apply instead of explicit None placeholders:
    # causal models get a budget for new tokens, seq2seq models a total length.
    generation_kwargs = {"num_beams": num_beams}
    if is_causal:
        generation_kwargs["max_new_tokens"] = max_output_length
    else:
        generation_kwargs["max_length"] = max_output_length
    if num_beams > 1:
        # early_stopping only has an effect in beam search; sending it with a
        # single beam just triggers a configuration warning.
        generation_kwargs["early_stopping"] = True

    with torch.no_grad():
        # Use a low num_beams (1-2) if Qwen still runs out of memory.
        summary_ids = model.generate(**inputs, **generation_kwargs)

    if is_causal:
        # The causal output starts with the prompt tokens; keep only what was generated.
        generated_ids = summary_ids[0][input_length:]
        return tokenizer.decode(generated_ids, skip_special_tokens=True)

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# --- UI helper: pick the text to summarize (URL first, then the pasted text) ---
def summarize_input(text, url, model_name, max_len, beams):
    """Resolve the user's input to plain text and summarize it.

    When ``url`` is non-blank the article behind it is downloaded and parsed
    and its text is summarized; otherwise ``text`` is used. ``None`` is treated
    like an empty string for both fields.

    Args:
        text: Text pasted by the user (may be empty or None).
        url: Article URL (may be empty or None); takes precedence over ``text``.
        model_name: Model key forwarded to ``summarize_text``.
        max_len: Output length forwarded to ``summarize_text``.
        beams: Beam width forwarded to ``summarize_text``.

    Returns:
        The summary, a prompt asking for input when there is nothing to
        summarize, or an error message prefixed with "Lỗi: " if anything raised.
    """
    try:
        # Normalize once so None and surrounding whitespace never reach Article().
        url = (url or "").strip()
        if url:
            article = Article(url, language="vi")
            article.download()
            article.parse()
            text_to_summarize = article.text or ""
        else:
            text_to_summarize = text or ""

        if not text_to_summarize.strip():
            return "Vui lòng nhập nội dung."
        return summarize_text(text_to_summarize, model_name, max_len, beams)
    except Exception as e:
        # Show the failure in the output box instead of crashing the UI callback.
        return f"Lỗi: {str(e)}"

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Tóm tắt đa mô hình (Nạp sẵn vào GPU)")
    # Model picker: one choice per MODEL_CONFIGS key, defaulting to the first key.
    model_selector = gr.Dropdown(choices=list(MODEL_CONFIGS.keys()), value=list(MODEL_CONFIGS.keys())[0], label="Chọn mô hình")
    # Two input sources; summarize_input uses the URL when it is non-blank, otherwise the text box.
    text_input = gr.Textbox(lines=6, label="Văn bản")
    url_input = gr.Textbox(label="URL")
    with gr.Row():
        # Generation controls, forwarded to summarize_input as max_len and beams.
        max_len = gr.Slider(50, 500, value=200, label="Độ dài")
        beams = gr.Slider(1, 4, value=2, step=1, label="Beam size (Giảm để tránh OOM)")
    output = gr.Textbox(label="Kết quả", lines=8)
    btn = gr.Button("Tóm tắt")
    # The inputs list must stay in summarize_input's parameter order:
    # (text, url, model_name, max_len, beams).
    btn.click(fn=summarize_input, inputs=[text_input, url_input, model_selector, max_len, beams], outputs=output)

# debug=True keeps this cell running so errors and logs stay visible in the notebook.
demo.launch(debug=True)

--- Đang nạp sẵn BART Large + LoRA 20K vào GPU... ---


`torch_dtype` is deprecated! Use `dtype` instead!


--- Đang nạp sẵn Qwen + LoRA vào GPU... ---


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://41d1153a41f82badcd.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://41d1153a41f82badcd.gradio.live




In [None]:
# Launch the same `demo` app again, explicitly requesting a public share link.
demo.launch(share=True)