CRAWL DATA

In [None]:
import json
import os
import pandas as pd
from Libraries.Crawler import CategoryValidator, UrlCollector, ArticleCrawler

In [None]:
# === HELPER FUNCTIONS ===

def save_json(data, file_path):
    """Serialize ``data`` to ``file_path`` as pretty-printed UTF-8 JSON.

    The file is opened in write mode, so existing content is replaced.
    Non-ASCII characters are written verbatim (``ensure_ascii=False``) and
    nested structures are indented by four spaces.
    """
    with open(file_path, mode="w", encoding="utf-8") as json_file:
        json.dump(data, fp=json_file, indent=4, ensure_ascii=False)

def load_json(file_path):
    """Return the parsed JSON content of ``file_path``.

    A path that does not exist yields an empty list instead of raising.
    """
    if os.path.exists(file_path):
        with open(file_path, mode="r", encoding="utf-8") as json_file:
            return json.load(json_file)
    return []

def append_to_jsonl(data, file_path):
    """Append each item of ``data`` to ``file_path`` as one JSON document per line.

    The file is opened in append mode (created if absent); non-ASCII
    characters are written verbatim.
    """
    with open(file_path, mode="a", encoding="utf-8") as jsonl_file:
        jsonl_file.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in data
        )

def get_existing_article_urls(file_path):
    """Collect the ``url`` value of every record stored in a JSONL file.

    Args:
        file_path: Path to a JSON Lines file with one article record per line.

    Returns:
        A set of the ``url`` values found. The set is empty when the file
        does not exist. Lines that cannot be used are skipped silently.
    """
    urls = set()
    if not os.path.exists(file_path):
        return urls

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                urls.add(json.loads(line)['url'])
            except (json.JSONDecodeError, KeyError, TypeError):
                # JSONDecodeError: blank or malformed line.
                # KeyError: JSON object without a "url" key.
                # TypeError: valid JSON that is not an object (list, string,
                # number, null) or a "url" value that cannot be hashed.
                continue
    return urls

def convert_to_xlsx(jsonl_path, xlsx_path):
    """Export a JSONL file to an XLSX workbook.

    Only the columns listed in ``column_order`` that are present in the data
    are written, in that order. The conversion is best-effort: a missing or
    unparsable input file does not raise, but the reason is printed so the
    skipped export is visible in the notebook output.

    Args:
        jsonl_path: Path of the JSON Lines file to read.
        xlsx_path: Path of the workbook to write (via the openpyxl engine).
    """
    column_order = ["category", "sub_category", "url", "title", "description", "content", "date", "words"]
    try:
        df = pd.read_json(jsonl_path, lines=True)
        df = df[[col for col in column_order if col in df.columns]]
        df.to_excel(xlsx_path, index=False, engine='openpyxl')
    except (FileNotFoundError, ValueError) as exc:
        # Stay best-effort, but report why no workbook was produced.
        print(f"Bỏ qua chuyển đổi XLSX cho {jsonl_path}: {exc}")

In [None]:
# Folder layout: intermediate artefacts under Resource/, crawl output under Database/.
resource_dir = "Resource"
database_dir = "Database"
pageName = "VNExpress"

DICT_FILE = f"{resource_dir}/{pageName}_DICT.json"
URLS_FILE = f"{resource_dir}/{pageName}_URLS.json"
JSON_FILE = f"{database_dir}/JSON/{pageName}.jsonl"
XLSX_FILE = f"{database_dir}/XLSX/{pageName}.xlsx"

# Make sure every output folder exists before anything is written to it.
os.makedirs(resource_dir, exist_ok=True)
os.makedirs(os.path.join(database_dir, "JSON"), exist_ok=True)
os.makedirs(os.path.join(database_dir, "XLSX"), exist_ok=True)

# Category definitions; load_json returns [] when the file is absent.
type_dict = load_json(f"{resource_dir}/categories.json")

# Settings passed to the Libraries.Crawler classes through their `config` argument.
# The exact meaning of each key is defined there; the groupings below follow the
# key names only — confirm against Libraries.Crawler.
my_config = {
    # Site to crawl.
    "BASE_URL": "https://vnexpress.net",

    # Article filters (presumably publication year and word-count bounds).
    "MIN_YEAR": 2020,
    "MIN_WORDS": 200,
    "MAX_WORDS": 1000,

    # Crawl volume and concurrency.
    "TARGET_ARTICLES_PER_SUBTYPE": 30,
    "MAX_CONCURRENT_WORKERS": 6,
    "VALIDATION_ARTICLES_COUNT": 5,

    "TYPE_DICT": type_dict,
}

In [None]:
# === RUN THE PIPELINE ===

# --- Stage 1: determine the valid categories ---
validator = CategoryValidator(config=my_config)
valid_categories = validator.run()
save_json(valid_categories, DICT_FILE)

# --- Stage 2: collect article URLs ---
# Re-read from disk so later stages use exactly what was persisted.
validated_dict = load_json(DICT_FILE)
if not validated_dict:
    # Really stop here: Stage 3 is nested in the else-branch so it cannot
    # crawl a stale URLS_FILE left behind by an earlier run.
    print("Không có chuyên mục hợp lệ nào, dừng quy trình.")
else:
    url_collector = UrlCollector(config=my_config)
    all_urls = url_collector.run(valid_subcategories=validated_dict)
    save_json(all_urls, URLS_FILE)

    # --- Stage 3: crawl article content ---
    urls_to_crawl = load_json(URLS_FILE)
    if urls_to_crawl:
        # URLs already stored in the JSONL file are handed to the crawler
        # (presumably so it skips them — confirm in ArticleCrawler.run).
        existing_urls = get_existing_article_urls(JSON_FILE)

        article_crawler = ArticleCrawler(config=my_config)
        new_articles = article_crawler.run(
            urls_to_crawl=urls_to_crawl,
            existing_article_urls=existing_urls
        )

        if new_articles:
            append_to_jsonl(new_articles, JSON_FILE)

        # Regenerate the workbook from the full JSONL file, even when this
        # run added nothing new.
        convert_to_xlsx(JSON_FILE, XLSX_FILE)
    else:
        print("Không có URL nào để crawl.")

TRAIN AND TEST

In [None]:
import pandas as pd
from transformers import pipeline
from Libraries import Trainer

# Configuration handed to Trainer.SummarizationTrainer and reused by the test cell.
training_config = {
    # --- Paths and model name ---
    # Must match the JSONL file the crawl section writes
    # (Database/JSON/<pageName>.jsonl with pageName = "VNExpress").
    "DATA_JSONL_FILE": "Database/JSON/VNExpress.jsonl",
    "MODEL_CHECKPOINT": "vinai/bartpho-syllable",
    "OUTPUT_MODEL_DIR": "Models/bartpho-summarizer",

    # --- Hyperparameters ---
    "MAX_INPUT_LENGTH": 1024,
    "MAX_TARGET_LENGTH": 256,
    "BATCH_SIZE": 4,
    "NUM_TRAIN_EPOCHS": 3,
    "LEARNING_RATE": 3e-5,
    "WEIGHT_DECAY": 0.01,
}

In [None]:
# === TRAIN ===
# Build the project's SummarizationTrainer from training_config and call its
# run() entry point. What run() does is defined in Libraries.Trainer —
# presumably it fine-tunes MODEL_CHECKPOINT and saves to OUTPUT_MODEL_DIR; confirm there.
summarizer_trainer = Trainer.SummarizationTrainer(config=training_config)
summarizer_trainer.run()

In [None]:
# === TEST ===
# Load the model saved under OUTPUT_MODEL_DIR into a summarization pipeline.
fine_tuned_model_path = training_config["OUTPUT_MODEL_DIR"]
summarizer_pipeline = pipeline("summarization", model=fine_tuned_model_path)

# Take one article from the dataset as a smoke-test input.
df = pd.read_json(training_config["DATA_JSONL_FILE"], lines=True)
if df.empty:
    raise ValueError(f"Không có bài viết nào trong {training_config['DATA_JSONL_FILE']}.")

# Prefer row 50, but fall back to the last row on smaller datasets instead of
# failing with an IndexError.
sample_index = min(50, len(df) - 1)
sample_text = df.iloc[sample_index]["content"]

print("--- VĂN BẢN GỐC ---")
print(sample_text)
print("\n" + "="*50 + "\n")

print("--- BẢN TÓM TẮT TỪ MODEL ---")
summary = summarizer_pipeline(
    sample_text,
    # Keep the generation limit in sync with the training configuration.
    max_length=training_config["MAX_TARGET_LENGTH"],
    min_length=50,
    do_sample=False,
    # Cut over-long articles to the model's input limit instead of erroring.
    truncation=True,
)
print(summary[0]['summary_text'])