In [68]:
from dotenv import load_dotenv
import pdfplumber
import re
import pandas as pd
import os
import trafilatura
import spacy


In [61]:
# Load the spreadsheet listing the heat-related sources and print its column
# summary (the recorded output shows two columns: source_name and url).
heat = pd.read_excel('corpus/heat.xlsx')
heat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   source_name  6 non-null      object
 1   url          6 non-null      object
dtypes: object(2)
memory usage: 228.0+ bytes


In [62]:
# Preview up to the first 10 rows of the source list.
heat.head(10)

Unnamed: 0,source_name,url
0,id1,https://heathealth.info/heat-and-health/?utm_s...
1,id2,https://www.nature.com/articles/s43247-024-019...
2,id3,https://apnews.com/article/deadly-heat-humidit...
3,id4,https://www.who.int/news-room/fact-sheets/deta...
4,id5,https://heat.gov/
5,id6,https://pmc.ncbi.nlm.nih.gov/articles/PMC10231...


In [63]:
# Make sure the destination folder for the extracted text exists.
output_dir = "corpus/heat"
os.makedirs(output_dir, exist_ok=True)

# Download every listed page and store its extracted text as <source_name>.txt.
# Failures are reported and skipped so one bad URL does not stop the run.
for _, record in heat.iterrows():
    source_id = record["source_name"]   # e.g. id1, id2, ...
    url = record["url"]

    try:
        downloaded = trafilatura.fetch_url(url)
        if not downloaded:
            print(f"❌ Không fetch được URL: {url}")
            continue

        text = trafilatura.extract(downloaded)
        if not text:
            print(f"⚠️ Không extract được text từ {url}")
            continue

        file_path = os.path.join(output_dir, f"{source_id}.txt")
        with open(file_path, "w", encoding="utf-8") as out_file:
            out_file.write(text)
        print(f"✅ Saved: {file_path}")
    except Exception as exc:
        print(f"⚠️ Lỗi với {url}: {exc}")

✅ Saved: corpus/heat/id1.txt
❌ Không fetch được URL: https://www.nature.com/articles/s43247-024-01930-6?utm_source=chatgpt.com
✅ Saved: corpus/heat/id3.txt
✅ Saved: corpus/heat/id4.txt
❌ Không fetch được URL: https://heat.gov/
✅ Saved: corpus/heat/id6.txt


In [64]:
def clean_text(text: str, preserve_lines: bool = False) -> str:
    """
    Normalize whitespace in ``text``.

    By default every run of whitespace, newlines included, is collapsed into
    a single space, so the result is always one line. Bullet markers such as
    "- " survive as characters, but bullets and headings no longer sit on
    their own lines.

    Args:
        text: Raw text to clean.
        preserve_lines: When True, whitespace is collapsed within each line
            only and every non-empty line stays on its own line, so bullets
            and headings remain separated. Blank lines are dropped.

    Returns:
        The cleaned text, without leading or trailing whitespace.
    """
    if preserve_lines:
        # Clean each line on its own so the line breaks between bullets and
        # headings are not swallowed by the whitespace collapse.
        cleaned_lines = (
            re.sub(r"\s+", " ", line).strip() for line in text.splitlines()
        )
        return "\n".join(line for line in cleaned_lines if line)

    # Default: flatten the whole text into a single line.
    return re.sub(r"\s+", " ", text.strip())


In [65]:
# Folder holding the extracted .txt files
folder = "corpus/heat"

# Normalize every .txt file in the folder and write the result back in place.
txt_names = [name for name in os.listdir(folder) if name.endswith(".txt")]
for filename in txt_names:
    filepath = os.path.join(folder, filename)

    with open(filepath, "r", encoding="utf-8") as src:
        content = src.read()
    cleaned = clean_text(content)

    # Overwrite the original file with the cleaned text
    with open(filepath, "w", encoding="utf-8") as dst:
        dst.write(cleaned)

print("Hoàn tất xử lý toàn bộ file trong corpus/heat/")

Hoàn tất xử lý toàn bộ file trong corpus/heat/


---
# Chunking

In [103]:
# Folder whose .txt files are split into chunks
input_dir = "corpus/heat"
# Spreadsheet read to build the source_name -> url mapping
excel_file = "corpus/heat.xlsx"
# CSV file that receives the generated chunks
output_csv = "corpus/heat_chunks.csv"

In [104]:
# Build the source_name -> url lookup from the spreadsheet
df_map = pd.read_excel(excel_file)
mapping = {name: link for name, link in zip(df_map["source_name"], df_map["url"])}

# English spaCy pipeline used for sentence splitting
nlp = spacy.load("en_core_web_sm")

def split_into_sentences(text: str):
    """Split text into a list of trimmed, non-empty sentences using the module-level spaCy pipeline ``nlp``."""
    trimmed = (span.text.strip() for span in nlp(text).sents)
    return [sentence for sentence in trimmed if sentence]

def chunk_sentences(sentences, chunk_size=5, overlap=2):
    """
    Group sentences into space-joined chunks with a sliding window.

    Each chunk holds up to ``chunk_size`` consecutive sentences, and
    consecutive chunks share ``overlap`` sentences. The trailing remainder is
    kept, so the last chunk may hold fewer than ``chunk_size`` sentences.

    Args:
        sentences: Sequence of sentence strings.
        chunk_size: Maximum number of sentences per chunk; must be >= 1.
        overlap: Number of sentences shared between consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings; empty when ``sentences`` is empty.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        # overlap >= chunk_size would make the window step zero or negative
        # (the loop would never finish); a negative overlap would silently
        # skip sentences between chunks.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
        )

    step = chunk_size - overlap
    total = len(sentences)
    chunks = []
    for start in range(0, total, step):
        chunks.append(" ".join(sentences[start:start + chunk_size]))
        # Stop once this window reaches the end, otherwise the next window
        # would only repeat sentences that are already covered.
        if start + chunk_size >= total:
            break
    return chunks

In [105]:
all_data = []
# Walk the .txt files in sorted order: os.listdir returns names in arbitrary
# order, and sorting keeps the row order of the output CSV reproducible.
for fname in sorted(os.listdir(input_dir)):
    if not fname.endswith(".txt"):
        continue

    # The file name without its extension is the source_name key of the mapping.
    source_name = os.path.splitext(fname)[0]
    url = mapping.get(source_name)
    if url is None:
        print(f"⚠️ Không tìm thấy URL mapping cho {fname}, bỏ qua.")
        continue

    try:
        # NOTE(review): errors="ignore" silently drops undecodable bytes;
        # confirm that this best-effort read is intended.
        with open(os.path.join(input_dir, fname), "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()
    except Exception as e:
        print(f"❌ Lỗi đọc file {fname}: {e}")
        continue

    sentences = split_into_sentences(text)
    if not sentences:
        print(f"⚠️ Không tách được câu trong {fname}")
        continue

    chunks = chunk_sentences(sentences, chunk_size=5, overlap=2)

    # One output row per chunk, tagged with the URL it came from.
    all_data.extend({"url": url, "chunk": c} for c in chunks)

# Export to CSV. The columns are named explicitly so the file still has a
# "url,chunk" header row when no chunk was produced.
df_out = pd.DataFrame(all_data, columns=["url", "chunk"])
df_out.to_csv(output_csv, index=False, encoding="utf-8")
print(f"✅ Saved {len(df_out)} chunks to {output_csv}")

✅ Saved 114 chunks to corpus/heat_chunks.csv


---
# Checking

In [None]:
# Written as bare identifiers these names raise NameError when the cell runs,
# so they are kept as a list of strings instead.
# NOTE(review): presumably the topic corpora whose chunk files should be
# checked -- confirm.
CORPORA_TO_CHECK = ["respiratory", "allergy", "heat", "cardiovascular"]


In [91]:
# Load the respiratory chunk CSV and print its column summary.
# NOTE(review): presumably produced by the same chunking steps run on the
# respiratory corpus -- confirm.
check = pd.read_csv('corpus/respiratory_chunks.csv')
check.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 274 entries, 0 to 273
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   url     274 non-null    object
 1   chunk   274 non-null    object
dtypes: object(2)
memory usage: 4.4+ KB
