CRAWL DATA

In [1]:
import json
import os
import pandas as pd
import re
import itertools
from Libraries.Crawler import CategoryValidator, UrlCollector, ArticleCrawler

In [2]:
# === CÁC HÀM XỬ LÝ FILE ===

def load_json(file_path):
    """Read and return the JSON document stored at ``file_path``.

    Returns an empty list when no file exists at that path.
    """
    if os.path.exists(file_path):
        with open(file_path, mode='r', encoding='utf-8') as handle:
            return json.load(handle)
    return []
    
def replace_json(data, file_path):
    """Overwrite ``file_path`` with ``data`` serialized as 4-space indented JSON.

    Non-ASCII characters are written as-is (UTF-8), not escaped.
    """
    with open(file_path, mode='w', encoding='utf-8') as out:
        out.write(json.dumps(data, ensure_ascii=False, indent=4))

def save_json(data, file_path):
    """Add ``data`` to the JSON document at ``file_path``, keeping the file valid JSON.

    Appending a second ``json.dump`` to an existing file produces two
    concatenated documents that ``json.load`` can no longer parse. Instead,
    the current content is read, merged with ``data`` and the file rewritten.

    Merge rules:
      * file missing or empty -> ``data`` is written as-is;
      * both are lists        -> existing items followed by the new ones;
      * both are dicts        -> existing mapping updated with ``data``.

    Raises:
        TypeError: if the stored document and ``data`` cannot be merged.
        json.JSONDecodeError: if the existing file does not hold valid JSON.
    """
    merged = data
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            existing_text = f.read()
        # A blank file carries no document to merge with.
        if existing_text.strip():
            existing = json.loads(existing_text)
            if isinstance(existing, list) and isinstance(data, list):
                merged = existing + data
            elif isinstance(existing, dict) and isinstance(data, dict):
                merged = {**existing, **data}
            else:
                raise TypeError(
                    f"Cannot merge {type(data).__name__} into existing "
                    f"{type(existing).__name__} stored in {file_path!r}"
                )
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(merged, f, ensure_ascii=False, indent=4)

def load_jsonl(file_path):
    """Load a JSON Lines file into a list, one parsed value per non-blank line.

    Blank lines (for example a trailing empty line) are skipped instead of
    raising ``json.JSONDecodeError``. A missing file yields an empty list.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    data = []
    if not os.path.exists(file_path):
        return data
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            data.append(json.loads(line))
    return data

def replace_jsonl(data, file_path):
    """Overwrite ``file_path`` with one JSON document per line, one for each item of ``data``."""
    with open(file_path, mode='w', encoding='utf-8') as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in data
        )

def save_jsonl(data, file_path):
    """Append each item of ``data`` to ``file_path`` as its own JSON line.

    The file is created when it does not exist yet; existing lines are kept.
    """
    with open(file_path, mode='a', encoding='utf-8') as out:
        for record in data:
            # print() terminates every record with the single '\n' the format needs.
            print(json.dumps(record, ensure_ascii=False), file=out)

# === CÁC HÀM HỖ TRỢ ===

def get_existing_article_urls(file_path):
    """Return the set of article URLs already stored in a JSONL file.

    Lines that are not valid JSON, or whose object has no ``'url'`` key, are
    skipped. A missing file yields an empty set.
    """
    seen = set()
    if not os.path.exists(file_path):
        return seen
    with open(file_path, mode='r', encoding='utf-8') as handle:
        for raw_line in handle:
            try:
                record = json.loads(raw_line)
                seen.add(record['url'])
            except (json.JSONDecodeError, KeyError):
                pass
    return seen

def convert_to_xlsx(jsonl_path, xlsx_path):
    """Convert a JSON Lines file into an XLSX workbook.

    Only the known article columns are exported, in a fixed order; columns
    missing from the data are left out.

    The conversion stays best-effort: a missing or unparsable input does not
    raise. The reason is printed so that a skipped export is visible rather
    than passing silently.

    Args:
        jsonl_path: Path of the JSONL file to read.
        xlsx_path: Path of the workbook to write (via the openpyxl engine).
    """
    column_order = ["category", "sub_category", "url", "title", "description", "content", "date", "words"]
    try:
        df = pd.read_json(jsonl_path, lines=True)
        df = df[[col for col in column_order if col in df.columns]]
        df.to_excel(xlsx_path, index=False, engine='openpyxl')
    except (FileNotFoundError, ValueError) as exc:
        # Same exception types as before are tolerated; they are now reported.
        print(f"Không thể chuyển '{jsonl_path}' sang XLSX: {exc}")

def get_url_key(item):
    """Return the integer found in ``item['url']`` as ``-<digits>.html``, or 0 if there is none.

    Only the first such occurrence in the URL is used.
    """
    found = re.search(r'-(\d+)\.html', item['url'])
    if found is None:
        return 0
    return int(found.group(1))

def heapify(arr, n, i, key_func):
    """Sift ``arr[i]`` down within the first ``n`` items of ``arr``.

    At each step the item is compared, via ``key_func``, with its children at
    ``2*pos + 1`` and ``2*pos + 2``. It is swapped with the larger child
    until neither child has a larger key. Modifies ``arr`` in place and
    returns None.
    """
    pos = i
    while True:
        biggest = pos
        left = 2 * pos + 1
        right = left + 1
        if left < n and key_func(arr[left]) > key_func(arr[biggest]):
            biggest = left
        if right < n and key_func(arr[right]) > key_func(arr[biggest]):
            biggest = right
        if biggest == pos:
            return
        arr[pos], arr[biggest] = arr[biggest], arr[pos]
        pos = biggest

def heapSort(arr, key_func):
    """Sort ``arr`` in place in ascending ``key_func`` order and return it.

    The name is kept for existing callers, but the ordering is now delegated
    to the built-in ``sorted``. That evaluates ``key_func`` exactly once per
    item instead of on every comparison, and keeps items with equal keys in
    their original relative order.

    Args:
        arr: Mutable sequence to reorder (items are written back by index).
        key_func: Callable mapping an item to its sort key.

    Returns:
        The same ``arr`` object, now sorted.
    """
    ordered = sorted(arr, key=key_func)
    # Write back by index so the caller's object is reordered in place,
    # exactly as the previous swap-based implementation did.
    for idx, item in enumerate(ordered):
        arr[idx] = item
    return arr

In [3]:
# Category data handed to the crawler classes as TYPE_DICT below.
# load_json returns [] when the file does not exist.
type_dict = load_json("Resource/categories.json")

# Settings passed as `config` to CategoryValidator, UrlCollector and ArticleCrawler.
my_config = {
    "BASE_URL": "https://vnexpress.net",

    "MIN_YEAR": 2020,
    "MIN_WORDS": 200,
    "MAX_WORDS": 1000,

    "TARGET_ARTICLES_PER_SUBTYPE": 30,
    "MAX_CONCURRENT_WORKERS": 6,
    "VALIDATION_ARTICLES_COUNT": 5,
    "PROGRESS_TIMEOUT": 10,
    # This key was previously listed twice (10, then 5). A dict literal keeps
    # the last value of a repeated key, so 5 was always the effective setting.
    "ARTICLE_TIMEOUT": 5,
    "MAX_CONSECUTIVE_FAILURES": 3,
    "URL_MAX_SUBCATEGORY_FAILURES": 3,

    "TYPE_DICT": type_dict
}

# Directory names and the site label every path below is built from.
resource_dir = "Resource"
database_dir = "Database"
pageName = "VNExpress"

# Intermediate crawler artefacts, all named "<resource_dir>/<pageName>_<KIND>.json".
CATE_FILE, DICT_FILE, URLS_FILE = (
    f"{resource_dir}/{pageName}_{kind}.json" for kind in ("CATE", "DICT", "URLS")
)
# Final dataset in its two export formats.
JSON_FILE = f"{database_dir}/JSON/{pageName}.jsonl"
XLSX_FILE = f"{database_dir}/XLSX/{pageName}.xlsx"
TEST_FILE = f"{resource_dir}/test.json"

# Create the output folders up front so later writes cannot fail on a missing directory.
for _required_dir in (
    resource_dir,
    os.path.join(database_dir, "JSON"),
    os.path.join(database_dir, "XLSX"),
):
    os.makedirs(_required_dir, exist_ok=True)

In [4]:
# --- Giai đoạn 1: Lấy danh sách chuyên mục hợp lệ ---
def getCategories():
    """Stage 1: run the category validator and store its result in DICT_FILE.

    Whatever ``CategoryValidator.run()`` returns replaces the previous
    content of DICT_FILE.
    """
    replace_json(CategoryValidator(config=my_config).run(), DICT_FILE)

# getCategories()

In [5]:
# --- Giai đoạn 2: Thu thập URL ---
def getURLDict():
    """Stage 2: collect article URLs for the validated categories.

    Reads the validated categories from DICT_FILE. When that is empty (or the
    file is missing) a message is printed and nothing else happens. Otherwise
    the result of the URL collector is passed to ``save_json`` for URLS_FILE.
    """
    validated_dict = load_json(DICT_FILE)
    if not validated_dict:
        print("Không có chuyên mục hợp lệ nào, dừng quy trình.")
        return
    collected_urls = UrlCollector(config=my_config).run(valid_subcategories=validated_dict)
    save_json(collected_urls, URLS_FILE)

# getURLDict()

In [None]:
# --- Giai đoạn 3: Crawl nội dung bài viết ---
def finalCrawl(urlindex=1):
    """Stage 3: crawl article content for one batch of collected URLs.

    The batch is read from ``{resource_dir}/{pageName}_URLS_{urlindex}.json``
    and ordered by the numeric id in each URL (``get_url_key``). It is then
    passed to ``ArticleCrawler`` together with the URLs already present in
    JSON_FILE. Newly crawled articles are appended to JSON_FILE.

    Args:
        urlindex: Number of the URL batch file to process. Defaults to 1,
            the batch that used to be hard-coded.
    """
    # A distinct local name keeps the module-level URLS_FILE constant unshadowed.
    batch_file = f"{resource_dir}/{pageName}_URLS_{urlindex}.json"

    urls_to_crawl = load_json(batch_file)
    if not urls_to_crawl:
        print("Không có URL nào để crawl.")
        return

    print(f"\nĐang sắp xếp {len(urls_to_crawl)} URLs bằng Heapsort...")
    sorted_urls = heapSort(urls_to_crawl, key_func=get_url_key)
    print("Sắp xếp hoàn tất.")
    existing_urls = get_existing_article_urls(JSON_FILE)

    article_crawler = ArticleCrawler(config=my_config)
    new_articles, crawled_urls_list = article_crawler.run(
        urls_to_crawl=sorted_urls,
        existing_article_urls=existing_urls
    )

    # Only touch the database file when the crawler produced something.
    if new_articles:
        save_jsonl(new_articles, JSON_FILE)

    print(f"\nDanh sách các URL đã crawl thành công ({len(crawled_urls_list)}):")

# finalCrawl()

In [13]:
# Post-processing cell: rewrite the crawled dataset and refresh the XLSX export.
# NOTE(review): this import sits mid-notebook; consider moving it to the first import cell.
from Libraries.Sorter import ArticleSorter
# Read every record currently stored in the JSONL database file.
all_articles = load_jsonl(JSON_FILE)
sorter = ArticleSorter(categories_file_path=CATE_FILE)
# presumably orders the records using the categories file and drops duplicates —
# confirm against Libraries.Sorter
sorted_articles = sorter.sort_and_deduplicate(all_articles)
# Overwrites JSON_FILE in place with the processed records.
replace_jsonl(sorted_articles, JSON_FILE)
# Regenerate the spreadsheet from the rewritten JSONL file.
convert_to_xlsx(JSON_FILE, XLSX_FILE)

Dữ liệu gốc có 966 bài báo.
Bắt đầu sắp xếp 966 bài báo...
Sắp xếp hoàn tất.


TRAIN and TEST

In [8]:
import pandas as pd
from transformers import pipeline
from Libraries import Trainer

# Settings handed to Trainer.SummarizationTrainer and read again by modelTest().
# How the trainer interprets each key is defined in Libraries.Trainer (not shown here).
training_config = {
    # --- Paths and model name ---
    # NOTE(review): the crawl section writes its dataset to JSON_FILE
    # ("Database/JSON/VNExpress.jsonl"), which differs from this path —
    # confirm this file exists or point it at JSON_FILE.
    "DATA_JSONL_FILE": "Database/JSON/vnexpress_articles.jsonl",
    "MODEL_CHECKPOINT": "vinai/bartpho-syllable",
    # modelTest() loads the fine-tuned model from this directory.
    "OUTPUT_MODEL_DIR": "Models/bartpho-summarizer",
    
    # --- Hyperparameters ---
    "MAX_INPUT_LENGTH": 1024,
    "MAX_TARGET_LENGTH": 256,
    "BATCH_SIZE": 4,
    "NUM_TRAIN_EPOCHS": 3,
    "LEARNING_RATE": 3e-5,
    "WEIGHT_DECAY": 0.01,
}

In [9]:
# === TRAIN ===
# Build the trainer from training_config. Only the object is created here;
# the run() call below is left commented out.
summarizer_trainer = Trainer.SummarizationTrainer(config=training_config)
# summarizer_trainer.run()

In [10]:
# === TEST ===
def modelTest(sample_index=50):
    """Summarize one stored article with the fine-tuned model and print the result.

    The article text is taken from the ``content`` column of the dataset at
    ``training_config["DATA_JSONL_FILE"]``. The model is loaded from
    ``training_config["OUTPUT_MODEL_DIR"]``.

    Args:
        sample_index: Positional row of the article to summarize. Defaults to
            50, the row that used to be hard-coded.

    Raises:
        IndexError: if ``sample_index`` is outside the dataset.
    """
    # Read the data first so a bad index fails before the model is loaded.
    df = pd.read_json(training_config["DATA_JSONL_FILE"], lines=True)
    if not -len(df) <= sample_index < len(df):
        raise IndexError(
            f"sample_index {sample_index} is out of range for a dataset of {len(df)} articles"
        )
    sample_text = df.iloc[sample_index]["content"]

    fine_tuned_model_path = training_config["OUTPUT_MODEL_DIR"]
    summarizer_pipeline = pipeline("summarization", model=fine_tuned_model_path)

    print("--- VĂN BẢN GỐC ---")
    print(sample_text)
    print("\n" + "="*50 + "\n")

    print("--- BẢN TÓM TẮT TỪ MODEL ---")
    summary = summarizer_pipeline(
        sample_text,
        # Same summary length limit as training; 256 was the earlier literal.
        max_length=training_config.get("MAX_TARGET_LENGTH", 256),
        min_length=50,
        do_sample=False,
    )
    print(summary[0]['summary_text'])

# modelTest()