In [1]:
!pip install pytorch-crf

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting pytorch-crf
  Downloading pytorch_crf-0.7.2-py3-none-any.whl.metadata (2.4 kB)
Downloading pytorch_crf-0.7.2-py3-none-any.whl (9.5 kB)
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2


In [1]:
import torch
import torch.nn as nn
from torchcrf import CRF

class MaskedAttention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(hidden_dim, 1)

    def forward(self, lstm_out, mask):
        # lstm_out: (batch, seq_len, hidden_dim)
        scores = self.attn(lstm_out).squeeze(-1)
        scores = scores.masked_fill(mask == 0, -1e9)
        attn_weights = torch.softmax(scores, dim=1).unsqueeze(-1)
        context = torch.sum(lstm_out * attn_weights, dim=1, keepdim=True)
        return lstm_out + context.expand_as(lstm_out)


class BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_dim=128, hidden_dim=256, num_layers=2, dropout=0.25, pad_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.embedding_dropout = nn.Dropout(dropout)

        self.bilstm = nn.LSTM(
            embedding_dim,
            hidden_dim // 2,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )

        self.layer_norm = nn.LayerNorm(hidden_dim)
        self.attention = MaskedAttention(hidden_dim)

        # optional hidden projection before CRF
        self.hidden_fc = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout)
        )

        self.fc = nn.Linear(hidden_dim, tagset_size)
        self.crf = CRF(tagset_size, batch_first=True)

    def forward(self, x, tags=None, mask=None):
        if mask is None:
            mask = (x != self.embedding.padding_idx).type(torch.bool)
        mask[:, 0] = 1

        embeddings = self.embedding(x)
        embeddings = self.embedding_dropout(embeddings)

        lstm_out, _ = self.bilstm(embeddings)
        lstm_out = self.layer_norm(lstm_out)
        lstm_out = self.attention(lstm_out, mask)
        lstm_out = self.hidden_fc(lstm_out)

        emissions = self.fc(lstm_out)

        if tags is not None:
            log_likelihood = self.crf(emissions, tags, mask=mask)
            return -log_likelihood.mean()
        else:
            return self.crf.decode(emissions, mask=mask)

In [None]:
## Test NER model
import torch
import json
from nltk import sent_tokenize
import unicodedata

with open("D:/Study/Education/Projects/Group_Project/source/data/ner/model/token2idx.json", "r", encoding="utf-8") as f:
    token2idx = json.load(f)

with open("D:/Study/Education/Projects/Group_Project/source/data/ner/model/label2idx.json", "r", encoding="utf-8") as f:
    label2idx = json.load(f)

#Reverse lookup dict
idx2label = {v: k for k, v in label2idx.items()}

#Recreate model architecture
model = BiLSTM_CRF(
    vocab_size=len(token2idx),
    tagset_size=len(label2idx),
    embedding_dim=128,
    hidden_dim=256,
    num_layers=2,
    dropout=0.3,
    pad_idx=token2idx["<PAD>"]
)

# Load trained weights 
model.load_state_dict(torch.load("D:/Study/Education/Projects/Group_Project/source/data/ner/model/model_bilstm_crf.pt", map_location=torch.device("cpu")))
model.eval()

print("✅ Model and dictionaries loaded successfully!")

✅ Model and dictionaries loaded successfully!


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cpu


In [8]:
import re

def prepare_text_for_model(text, token2idx, device="cpu"):
    """
    Prepare a raw text string for BiLSTM-CRF inference.
    Returns: (X_tensor, mask_tensor, tokens)
    """
    # Normalize and tokenize
    text = text.lower().strip()
    tokens = re.findall(r"\w+|[^\w\s]", text, re.UNICODE)

    # Convert tokens to indices
    X = [token2idx.get(tok, token2idx["<UNK>"]) for tok in tokens]

    # Convert to PyTorch tensors
    X_tensor = torch.tensor([X], dtype=torch.long).to(device)
    mask = (X_tensor != token2idx["<PAD>"]).to(torch.bool)

    return X_tensor, mask, tokens

# text = sent_tokenize(try_thong_tu)[0]
text = "Luật Thuế Giá trị gia tăng số 12/23/Nđ-CP được ban hành bởi Chính phủ và được kí kết bởi Chủ tịch QUốc hội Trần Thanh Mẫn Hà Nội ngày 23 tháng 8 năm 2021"

X_tensor, mask, tokens = prepare_text_for_model(text, token2idx, device)

with torch.no_grad():
    pred = model(X_tensor, mask=mask)

pred_labels = [idx2label[i] for i in pred[0]]
list(zip(tokens, pred_labels))

[('luật', 'B-TIT'),
 ('thuế', 'I-TIT'),
 ('giá', 'I-TIT'),
 ('trị', 'I-TIT'),
 ('gia', 'I-TIT'),
 ('tăng', 'I-TIT'),
 ('số', 'O'),
 ('12', 'B-DOCID'),
 ('/', 'I-DOCID'),
 ('23', 'I-DOCID'),
 ('/', 'I-DOCID'),
 ('nđ', 'I-DOCID'),
 ('-', 'O'),
 ('cp', 'O'),
 ('được', 'O'),
 ('ban', 'O'),
 ('hành', 'O'),
 ('bởi', 'O'),
 ('chính', 'B-DEP'),
 ('phủ', 'I-DEP'),
 ('và', 'O'),
 ('được', 'O'),
 ('kí', 'O'),
 ('kết', 'O'),
 ('bởi', 'O'),
 ('chủ', 'O'),
 ('tịch', 'O'),
 ('quốc', 'B-LOC'),
 ('hội', 'I-LOC'),
 ('trần', 'I-LOC'),
 ('thanh', 'I-LOC'),
 ('mẫn', 'I-LOC'),
 ('hà', 'I-LOC'),
 ('nội', 'I-LOC'),
 ('ngày', 'B-DAT'),
 ('23', 'I-DAT'),
 ('tháng', 'I-DAT'),
 ('8', 'I-DAT'),
 ('năm', 'I-DAT'),
 ('2021', 'I-DAT')]

In [9]:
from vncorenlp import VnCoreNLP

ner_annotator = VnCoreNLP("D:/Study/Education/Projects/Group_Project/VnCoreNLP/VnCoreNLP-1.1.1.jar", annotators="wseg,pos,ner", max_heap_size='-Xmx2g') 

In [59]:
ner_annotator.annotate(sent_tokenize(try_text)[-1])

{'sentences': [[{'index': 1,
    'form': 'CHỦ_TỊCH',
    'posTag': 'Ny',
    'nerLabel': 'O',
    'head': -1},
   {'index': 2,
    'form': 'QUỐC_HỘI',
    'posTag': 'Ny',
    'nerLabel': 'O',
    'head': -1},
   {'index': 3,
    'form': 'Trần_Thanh_Mẫn',
    'posTag': 'Np',
    'nerLabel': 'B-PER',
    'head': -1}]]}

#### Try Data Extraction framework

In [10]:
from nltk import sent_tokenize

#for module import
import sys, os
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)

from shared_functions.global_functions import *

In [59]:
def tokenize_like_conll(text):
    """
    Split tokens like Label Studio:
    - Separate punctuation (/,:; etc.)
    - Split numbers, letters, symbols
    """
    return re.findall(r"\w+|[^\w\s]", text, re.UNICODE)

def run_ner_on_sentence(sentence, model, token2idx, idx2label, device="cpu"):
    """
    Tokenize a sentence, convert to indices (alreay had indices dictionary), run NER model, and return (tokens, labels).
    """
    tokens = tokenize_like_conll(sentence.lower())

    # Convert tokens to indices
    X = [token2idx.get(tok, token2idx["<UNK>"]) for tok in tokens]
    X_tensor = torch.tensor([X], dtype=torch.long).to(device)

    # Create attention mask
    mask = (X_tensor != token2idx["<PAD>"]).to(torch.bool)

    # Predictx label from model
    model.eval()
    with torch.no_grad():
        preds = model(X_tensor, mask=mask)[0]  # assume model returns label IDs
        preds = preds[:len(tokens)]

    labels = [idx2label[p] for p in preds]
    
    return tokens, labels


def merge_entities(tokens, labels):
    """
    Merge consecutive tokens with the same entity type (B-I-O format).
    Returns list of (text, entity_type)
    Example: Hà - B-LOC, Nội - I-LOC  => ("Hà Nội", "LOC")
    """
    entities = []
    current_tokens = []
    current_type = None

    for tok, lbl in zip(tokens, labels):
        if lbl.startswith("B-"):
            if current_tokens:
                entities.append((" ".join(current_tokens), current_type))
            current_tokens = [tok]
            current_type = lbl[2:]
        elif lbl.startswith("I-") and current_tokens:
            current_tokens.append(tok)
        else:
            if current_tokens:
                entities.append((" ".join(current_tokens), current_type))
                current_tokens = []
                current_type = None

    if current_tokens:
        entities.append((" ".join(current_tokens), current_type))
    return entities


# def smart_fix_loc(entities):
#     """
#     Merge common location patterns like:
#     'Thành phố' + next LOC, or 'Tỉnh' + next LOC.
#     """
#     fixed = []
#     skip_next = False
#     for i, (text, etype) in enumerate(entities):
#         if skip_next:
#             skip_next = False
#             continue

#         if etype == "LOC" and i + 1 < len(entities):
#             next_text, next_type = entities[i + 1]
#             if next_type == "LOC" and text.lower() in ["thành phố", "tỉnh", "quận", "huyện"]:
#                 fixed.append((f"{text} {next_text}", "LOC"))
#                 skip_next = True
#                 continue

#         fixed.append((text, etype))
#     return fixed

def normalize_text(text):
    if not text:
        return None
    text = re.sub(r"\s+", " ", text.strip())
    text = text.replace("–", "-").replace("_", " ")
    text = re.sub(r"[-–—,:;.\s]+$", "", text)
    return text

def merge_fragmented(text):
    """
    Merge the token starts with a vowel with the token before it
    EXCEPT for certain Vietnamese words
    """
    if not isinstance(text, str):
        return text

    vowels = "aeiouyàáảãạăắằẳẵặâấầẩẫậèéẻẽẹêếềểễệìíỉĩịòóỏõọôốồổỗộơớờởỡợùúủũụưứừửữựýỳỷỹỵ"
    vowel_start = re.compile(rf"^[{vowels}]", re.IGNORECASE)

    # Words that should *not* be merged (valid Vietnamese words that varies based on use cases)
    exclude_words = {"án", 'anh'}

    parts = text.split()
    if len(parts) < 2:
        return text

    merged = [parts[0]]
    for token in parts[1:]:
        token_lower = token.lower()
        if vowel_start.match(token_lower) and token_lower not in exclude_words:
            merged[-1] += token
        else:
            merged.append(token)

    return " ".join(merged)

def normalize_date(text):
    """
    Convert various Vietnamese date formats to 'dd/mm/yyyy'.
    Examples:
        'ngày 12 tháng 6 năm 2021'  -> '12/06/2021'
        '17/6/2025'                 -> '17/06/2025'
        '17/6'                      -> '17/06/'
        'ngày 5 tháng 12 năm 24'    -> '05/12/2024'
    """
    if not text:
        return None

    text = text.strip().lower()
    text = text.replace("–", "-").replace("_", " ")

    # Try to match "ngày 12 tháng 6 năm 2021"
    match_vn = re.search(
        r"ngày\s*(\d{1,2})\D+tháng\s*(\d{1,2})\D+năm\s*(\d{2,4})",
        text,
        re.IGNORECASE
    )
    if match_vn:
        d, m, y = match_vn.groups()
        if len(y) == 2:
            y = "20" + y
        return f"{d.zfill(2)}/{m.zfill(2)}/{y}"
    
    # Try to match numeric formats like "17/6/2025" or "17-6-25" or "17.6.2025"
    match_num = re.search(r"(\d{1,2})[^\d]+(\d{1,2})(?:[^\d]+(\d{2,4}))?", text)
    if match_num:
        d, m, y = match_num.groups()
        if y:
            if len(y) == 2:
                y = "20" + y
            return f"{d.zfill(2)}/{m.zfill(2)}/{y}"
        else:
            return f"{d.zfill(2)}/{m.zfill(2)}"
        

    # If no recognizable format found, return cleaned text
    return re.sub(r"\s+", " ", text.strip())

def extract_abbreviation(text):
    def remove_accents(s):
        return ''.join(
            c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn'
            )

    # Normalize and split
    parts = remove_accents(text).strip().split()
    abbreviation = ''.join(p[0].upper() for p in parts if p)
    
    return abbreviation

def clean_document_id(id_str):
    if not isinstance(id_str, str):
        return id_str
    id_str = id_str.lower().strip()
    # Common noise patterns
    replacements = {
        r'cpu\b': 'cp',
        r'qhu\b': 'qh',
        r'bnvu\b': 'bnv',
        r'bkhu\b': 'bkh',
        r'ttu\b': 'tt',
        r'nhnnvn':'nhnn',
        r'bkhvcn':'bkhcn'
    }
    for pattern, repl in replacements.items():
        id_str = re.sub(pattern, repl, id_str)
    return id_str

def to_upper_alnum(text):
    '''
    Uppercase all words in a string
    '''
    if not isinstance(text, str):
        return text
    return re.sub(r'[a-zA-ZÀ-ỹ]', lambda m: m.group(0).upper(), text)

In [60]:
def extract_document_metadata(text, model, token2idx, idx2label, device="cpu"):
    sentences = sent_tokenize(text)
    if not sentences:
        return pd.DataFrame([{}])

    first_sentence = sentences[0]
    last_sentence = sentences[-1]

    tokens, labels = run_ner_on_sentence(first_sentence, model, token2idx, idx2label, device)
    entities = merge_entities(tokens, labels)
    # entities = smart_fix_loc(entities)

    last_tokens, last_labels = run_ner_on_sentence(last_sentence, model, token2idx, idx2label, device)
    last_entities = merge_entities(last_tokens, last_labels)

    metadata = {
        "issuer_department": None,
        "issue_date": None,
        "title": None,
        "location": None,
        "document_id": None,
        "issuer": None,
        "document_type": None
    }

    #Extract entities from first sentence
    seen_loc = 0
    for text_, etype in entities:
        if etype == "DEP" and metadata["issuer_department"] is None:
            metadata["issuer_department"] = normalize_text(text_)
            
        elif etype == "DAT" and metadata["issue_date"] is None:
            metadata["issue_date"] = normalize_date(text_)
            
        elif etype == "LOC":
            seen_loc += 1
            if seen_loc == 2 and metadata["location"] is None:
                metadata["location"] = normalize_text(text_)
                
        elif etype == "DOCID" and metadata["document_id"] is None:
            if text_.endswith("-"):
                text_ += "cp"
            elif len(text_.split('/')[-1]) <= 2:
                text_ += f'-{extract_abbreviation(metadata["issuer_department"])}'
                
            #Normalize then remove all spaces between characters
            metadata["document_id"] = re.sub(r"\s+", "", normalize_text(text_))

    #Auto-fill missing year in issue_date
    if metadata.get("issue_date"):
        date_str = metadata["issue_date"]
        if date_str.count('/') == 1:
            date_str = f'{date_str}/{metadata["document_id"].split("/")[1]}'
            metadata["issue_date"] = date_str

    #Default location to Hanoi
    if not metadata["location"]:
        metadata["location"] = "Hà Nội"

    #Title & Document Type Extraction 
    title_keywords = ["luật", "nghị quyết", "nghị định", "thông tư", "quyết định", "chỉ thị", "hướng dẫn"]
    title_finding =['luật', 'nghị', 'thông', 'quyết', 'chỉ', 'hướng']

    # Reassign labels for title-start tokens
    for i, tok in enumerate(tokens):
        if tok.lower() in title_finding:
            labels[i] = "B-TIT"

    # Find first B-TIT
    first_b_tit_idx = next((i for i, lbl in enumerate(labels) if lbl == "B-TIT"), None)

    if first_b_tit_idx is not None:
        start_idx = first_b_tit_idx
        end_idx = start_idx

        for k in range(start_idx + 1, len(tokens)):
            if labels[k].endswith("TIT"):
                end_idx = k
            else:
                if (end_idx - start_idx + 1) < 5:
                    end_idx = min(start_idx + 4, len(tokens) - 1)
                break

        candidate_title = " ".join(tokens[start_idx:end_idx + 1])
        candidate_title = normalize_text(candidate_title)
        metadata["title"] = candidate_title

        # Extract document type from title keyword
        for kw in title_keywords:
            if candidate_title.lower().startswith(kw):
                metadata["document_type"] = kw.capitalize()
                break
    else:
        metadata["title"] = "UNKNOWN"
        metadata["document_type"] = "UNKNOWN"

    # Second “Luật” occurrence logic
    if metadata["document_type"] == "Luật":
        luat_indices = [i for i, t in enumerate(tokens) if t.lower() == "luật"]
        if len(luat_indices) >= 2:
            start_idx = luat_indices[1]
            end_idx = start_idx
            for k in range(start_idx + 1, len(tokens)):
                if labels[k].endswith("TIT"):
                    end_idx = k
                else:
                    if (end_idx - start_idx + 1) < 5:
                        end_idx = min(start_idx + 4, len(tokens) - 1)
                    break
            metadata["title"] = normalize_text(" ".join(tokens[start_idx:end_idx + 1]))

    #Ensure consistency between title and document_type 
    if metadata["title"] and metadata["document_type"]:
        title_lower = metadata["title"].lower()
        doc_type_lower = metadata["document_type"].lower()
        if not title_lower.startswith(doc_type_lower):
            metadata["title"] = f"{metadata['document_type']} {metadata['title']}"
    elif metadata["title"]:
        for kw in title_keywords:
            if metadata["title"].lower().startswith(kw):
                metadata["document_type"] = kw.capitalize()
                break

    ## Extract Issuer
    try:
        annotated = ner_annotator.annotate(last_sentence)
        persons = []
        current_name = []

        for sent in annotated["sentences"]:
            for word_info in sent:
                label = word_info.get("nerLabel")
                token = word_info.get("form", "").replace("_", " ")
                if label == "B-PER":
                    if current_name:
                        persons.append(" ".join(current_name))
                    current_name = [token]
                elif label == "I-PER" and current_name:
                    current_name.append(token)
                else:
                    if current_name:
                        persons.append(" ".join(current_name))
                        current_name = []
        if current_name:
            persons.append(" ".join(current_name))
        
        # Use the last complete PER entity
        if persons:
            metadata["issuer"] = normalize_text(persons[-1])
        else:
            metadata["issuer"] = "UNKNOWN"

        if len(metadata["issuer"].split(' ')) == 1:
            metadata["issuer"] = "UNKNOWN"
        
    except Exception as e:
        print(f"[WARN] VnCoreNLP extraction failed: {e}")
        metadata["issuer"] = "UNKNOWN"

    # Capitalization Fix and final check
    df = pd.DataFrame([metadata])
    
    #Fix missing document id
    mask = df['document_id'].str.split('/').str[-1].apply(lambda x: len(str(x).strip()) <= 2)

    df.loc[mask, 'document_id'] = (
        df.loc[mask, 'document_id'] + '-' +
        df.loc[mask, 'issuer_department'].map(str.strip).apply(extract_abbreviation).str.lower()
    )
    
    df['document_id'] = df['document_id'].apply(clean_document_id)
    
    #Fix some fragmented features
    df = df.applymap(merge_fragmented)
    
    cols_to_capitalize = ['issuer_department', 'title', 'location', 'issuer', 'document_type']
    df[cols_to_capitalize] = df[cols_to_capitalize].apply(
        lambda col: col.apply(lambda x: x.title() if isinstance(x, str) else x)
    )
    
    #Capitalize all letters in document_id
    df['document_id'] = df['document_id'].apply(to_upper_alnum)
    
    return df

In [22]:
file_path = 'D:/Study/Education/Projects/Group_Project/source/document'
files = os.listdir(file_path)

list_doc_content = []
for file in files:
    if not file.lower().endswith(('.pdf', '.docx', '.doc')):
        continue
    list_doc_content.append(get_text_from_s3(f'legaldocstorage/{file}'))

In [61]:
# import botocore

# for file in files:
#     try:
#         try_text = get_text_from_s3(f'legaldocstorage/{file}')
#         df_meta = extract_document_metadata(try_text, model, token2idx, idx2label, device)
#         df = pd.concat([df, df_meta], axis=0)
#     except botocore.exceptions.ClientError as e:
#         if e.response['Error']['Code'] == 'NoSuchKey':
#             print(f"❌ NoSuchKey error: File not found in S3 → {file}")
#         else:
#             print(f"⚠️ Other S3 error for file {file}: {e}")
#     except Exception as e:
#         print(f"⚠️ Unexpected error with file {file}: {e}")

df = pd.DataFrame(columns = ['issuer_department', 'issue_date', 'title', 'location', 'document_id', 'issuer', 'document_type'])

for doc_content in list_doc_content:
    df_meta = extract_document_metadata(doc_content, model, token2idx, idx2label, device)
    df = pd.concat([df, df_meta], axis=0)
df = df.reset_index(drop=True)

  df = df.applymap(merge_fragmented)
  df = df.applymap(merge_fragmented)
  df = df.applymap(merge_fragmented)
  df = df.applymap(merge_fragmented)
  df = df.applymap(merge_fragmented)
  df = df.applymap(merge_fragmented)
  df = df.applymap(merge_fragmented)
  df = df.applymap(merge_fragmented)
  df = df.applymap(merge_fragmented)
  df = df.applymap(merge_fragmented)
  df = df.applymap(merge_fragmented)
  df = df.applymap(merge_fragmented)
  df = df.applymap(merge_fragmented)
  df = df.applymap(merge_fragmented)
  df = df.applymap(merge_fragmented)
  df = df.applymap(merge_fragmented)
  df = df.applymap(merge_fragmented)
  df = df.applymap(merge_fragmented)
  df = df.applymap(merge_fragmented)
  df = df.applymap(merge_fragmented)
  df = df.applymap(merge_fragmented)


In [62]:
df

Unnamed: 0,issuer_department,issue_date,title,location,document_id,issuer,document_type
0,Quốc Hội,17/06/2020,Luật Doanh Nghiệp Căn Cứ,Hà Nội,59/2020/QH14,Nguyễn Thị Kim Ngân,Luật
1,Quốc Hội,17/06/2025,"Luật Sửa Đổi , Bổ",Hà Nội,76/2025/QH15,Trần Thanh Mẫn,Luật
2,Quốc Hội,26/11/2024,Luật Thuế Giá Trị Gia Tăng,Hà Nội,48/2024/QH15,Trần Thanh Mẫn,Luật
3,Quốc Hội,06/04/2016,"Luật Sửa Đổi , Bổ Sung Một Số Điều",Hà Nội,106/2016/QH13,Nguyễn Thị Kim Ngân,Luật
4,Quốc Hội,14/06/2025,Luật Thuế Thu Nhập Doanh,Hà Nội,67/2025/QH15,Trần Thanh Mẫn,Luật
5,Quốc Hội,14/06/2025,Luật Thuế Tiêu Thụ Đặc Biệt,Hà Nội,66/2025/QH15,Trần Thanh Mẫn,Luật
6,Quốc Hội,06/04/2016,"Luật Thuế Xuất Khẩu , Thuế Nhập Khẩu",Hà Nội,107/2016/QH13,Unknown,Luật
7,Chính Phủ,03/10/2025,Nghị Định Quy Định Chi Tiết Về Việc Thực Hiện ...,Hà Nội,256/2025/NĐ-CP,Unknown,Nghị Định
8,Chính Phủ,02/04/2025,Nghị Định Gia Hạn Thời Hạn Nộp Thuế Giá Trị Gi...,Hà Nội,82/2025/NĐ-CP,Unknown,Nghị Định
9,Chính Phủ,01/07/2025,Nghị Định Quy Định Chi Tiết Thi Hành Một Số Điều,Hà Nội,181/2025/NĐ-CP,Unknown,Nghị Định


In [83]:
# try_text = get_text_from_s3('legaldocstorage/luat_thue_tndn_2025.pdf')

df_meta = extract_document_metadata(try_text, model, token2idx, idx2label, device)

In [84]:
df_meta

Unnamed: 0,issuer_department,issue_date,title,location,document_id,issuer,document_type
0,Quốc Hội,14/06/2025,Luật Thuế Thu Nhập Doanh,Hà Nội,67/2025/qh15,Trần Thanh Mẫn,Luật


#### Hierarchical Processing

In [51]:
from collections import defaultdict
import re

def parse_legal_text(text: str):
    # Regex patterns
    chapter_pattern = r"^\s*Chương\s+([IVXLCDM\d]+)\.?"
    clause_pattern = r"^\s*Điều\s+(\d+)\.?"
    point_pattern = r"^\s*\+?\s*(\d+)\."
    subpoint_pattern = r"^\s*(?:\*?\s*)?([a-z])\)"

    # Storage
    structure = defaultdict(list)
    current_chapter = None
    current_clause = None
    current_point = None

    for line in text.splitlines():
        line = line.strip()
        if not line:
            continue

        # Match Chapter
        match_chapter = re.match(chapter_pattern, line, flags=re.IGNORECASE)
        if match_chapter:
            current_chapter = f"chapter {match_chapter.group(1)}"
            structure[current_chapter] = []
            current_clause = None
            current_point = None
            continue

        # Match Clause
        match_clause = re.match(clause_pattern, line)
        if match_clause:
            if not current_chapter:
                # If no chapter seen yet → use default top level
                current_chapter = "no_chapter"
                structure[current_chapter] = []
            current_clause = {
                "clause": match_clause.group(1),
                "text": line,
                "points": []
            }
            structure[current_chapter].append(current_clause)
            current_point = None
            continue

        # Match Point
        match_point = re.match(point_pattern, line)
        if match_point and current_clause:
            current_point = {
                "point": match_point.group(1),
                "text": line,
                "subpoints": []
            }
            current_clause["points"].append(current_point)
            continue

        # Match Subpoint
        match_subpoint = re.match(subpoint_pattern, line, flags=re.IGNORECASE)
        if match_subpoint and current_point:
            sub_val = match_subpoint.group(1)
            current_point["subpoints"].append({
                "subpoint": sub_val,
                "text": line
            })
            continue

    return dict(structure)


In [53]:
try_dict = parse_legal_text(try_text)

print(try_dict)

{'no_chapter': [{'clause': '1', 'text': 'Điều 1. Phạm vi điều chỉnh', 'points': []}, {'clause': '2', 'text': 'Điều 2. Đối tượng áp dụng', 'points': []}, {'clause': '3', 'text': 'Điều 3. Đối tượng được gia hạn', 'points': [{'point': '1', 'text': '1. Doanh nghiệp, tổ chức, hộ gia đình, hộ kinh doanh, cá nhân hoạt động sản xuất trong các ngành kinh', 'subpoints': [{'subpoint': 'a', 'text': 'a) Nông nghiệp, lâm nghiệp và thủy sản;'}, {'subpoint': 'b', 'text': 'b) Sản xuất, chế biến thực phẩm; dệt; sản xuất trang phục; sản xuất da và các sản phẩm có liên quan;'}, {'subpoint': 'c', 'text': 'c) Xây dựng;'}, {'subpoint': 'd', 'text': 'd) Hoạt động xuất bản; hoạt động điện ảnh, sản xuất chương trình truyền hình, ghi âm và xuất bản âm'}, {'subpoint': 'e', 'text': 'e) Sản xuất đồ uống; in, sao chép bản ghi các loại; sản xuất than cốc, sản phẩm dầu mỏ tinh chế; sản'}, {'subpoint': 'g', 'text': 'g) Thoát nước và xử lý nước thải.'}]}, {'point': '2', 'text': '2. Doanh nghiệp, tổ chức, hộ gia đình, hộ

In [54]:
df.columns

Index(['issuer_department', 'issue_date', 'title', 'location', 'document_id',
       'issuer', 'document_type'],
      dtype='object')

In [None]:
def saving_to_neo4j(text, namespace="Test"):
    """
    Save extracted legal document structure and metadata into Neo4j,
    under a given namespace label (e.g., :Test1, :Test2)
    """
    # === Extract document metadata ===
    df_meta = extract_document_metadata(text, model, token2idx, idx2label, device)
    meta_row = df_meta.iloc[0]

    metadata = {
        "law_id": meta_row["document_id"],
        "title": meta_row["title"],
        "issuer": meta_row["issuer"],
        "issue_date": meta_row["issue_date"],
        "location": meta_row["location"],
        "issuer_department": meta_row["issuer_department"],
        "document_type": meta_row["document_type"],
        "amendment_history": "None",
        "jurisdiction_scope": "National"
    }

    # === Central Node ===
    doc_type_label = metadata["document_type"].replace(" ", "_").capitalize()
    ns_label = re.sub(r"\W+", "_", namespace)  # sanitize label (only letters/numbers/_)

    dml_ddl_neo4j(f"""
        MERGE (l:{doc_type_label}:{ns_label} {{law_id: $law_id}})
        SET l += $meta
    """, law_id=metadata["law_id"], meta=metadata)

    # === Parse document structure ===
    parsed = parse_legal_text(text)

    ## If document has no chapter
    
    if not any(k.lower().startswith("chapter") for k in parsed.keys()):
        for clause in parsed.values():
            if isinstance(clause, list):
                for cl in clause:
                    clause_id = f"{metadata['law_id']}_C{cl['clause']}"
                    dml_ddl_neo4j(f"""
                        MERGE (c:Clause:{ns_label} {{id: $id}})
                        SET c.text = $text
                        WITH c
                        MATCH (l:{ns_label} {{law_id: $law_id}})
                        MERGE (l)-[:HAS_CLAUSE]->(c)
                    """, id=clause_id, text=cl["text"], law_id=metadata["law_id"])

                    for point in cl["points"]:
                        point_id = f"{clause_id}_P{point['point']}"
                        dml_ddl_neo4j(f"""
                            MERGE (p:Point:{ns_label} {{id: $id}})
                            SET p.text = $text
                            WITH p
                            MATCH (c:Clause:{ns_label} {{id: $clause_id}})
                            MERGE (c)-[:HAS_POINT]->(p)
                        """, id=point_id, text=point["text"], clause_id=clause_id)

                        for subpoint in point["subpoints"]:
                            subpoint_id = f"{point_id}_SP{subpoint['subpoint']}"
                            dml_ddl_neo4j(f"""
                                MERGE (sp:Subpoint:{ns_label} {{id: $id}})
                                SET sp.text = $text
                                WITH sp
                                MATCH (p:Point:{ns_label} {{id: $point_id}})
                                MERGE (p)-[:HAS_SUBPOINT]->(sp)
                            """, id=subpoint_id, text=subpoint["text"], point_id=point_id)

    # If document has chapters
    else:
        for chapter, clauses in parsed.items():
            chapter_id = f"{metadata['law_id']}_{chapter.replace(' ', '_')}"
            dml_ddl_neo4j(f"""
                MERGE (ch:Chapter:{ns_label} {{id: $id}})
                SET ch.title = $chapter
                WITH ch
                MATCH (l:{ns_label} {{law_id: $law_id}})
                MERGE (l)-[:HAS_CHAPTER]->(ch)
            """, id=chapter_id, chapter=chapter, law_id=metadata["law_id"])

            for clause in clauses:
                clause_id = f"{chapter_id}_C{clause['clause']}"
                dml_ddl_neo4j(f"""
                    MERGE (c:Clause:{ns_label} {{id: $id}})
                    SET c.text = $text
                    WITH c
                    MATCH (ch:Chapter:{ns_label} {{id: $chapter_id}})
                    MERGE (ch)-[:HAS_CLAUSE]->(c)
                """, id=clause_id, text=clause["text"], chapter_id=chapter_id)

                for point in clause["points"]:
                    point_id = f"{clause_id}_P{point['point']}"
                    dml_ddl_neo4j(f"""
                        MERGE (p:Point:{ns_label} {{id: $id}})
                        SET p.text = $text
                        WITH p
                        MATCH (c:Clause:{ns_label} {{id: $clause_id}})
                        MERGE (c)-[:HAS_POINT]->(p)
                    """, id=point_id, text=point["text"], clause_id=clause_id)

                    for subpoint in point["subpoints"]:
                        subpoint_id = f"{point_id}_SP{subpoint['subpoint']}"
                        dml_ddl_neo4j(f"""
                            MERGE (sp:Subpoint:{ns_label} {{id: $id}})
                            SET sp.text = $text
                            WITH sp
                            MATCH (p:Point:{ns_label} {{id: $point_id}})
                            MERGE (p)-[:HAS_SUBPOINT]->(sp)
                        """, id=subpoint_id, text=subpoint["text"], point_id=point_id)


In [65]:
truncated_doc = list_doc_content[2:]

In [66]:
len(truncated_doc)

19

In [None]:
for doc in truncated_doc:
    saving_to_neo4j(doc, namespace = "Test1")