In [1]:
# !pip install vncorenlp nbimporter
# !pip install pandas transformers torch underthesea vncorenlp tqdm hf_xet

data_process -> word_segment -> (emb_different + complex) -> merged -> training -> check

In [2]:
import importlib
import json, re
import math
import os

import nbimporter  # notebook import hook; kept ahead of the project imports below
import numpy as np
import pandas as pd
import torch
from sklearn.decomposition import TruncatedSVD
from tqdm import tqdm
# NLP models
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
from underthesea import word_tokenize, pos_tag
from vncorenlp import VnCoreNLP

# project modules
from data_process import run_data_processed, clean_question, clean_word
from Embedding.word_segment import word_segment, ws_question, ori_question
from Embedding.emb_different import get_diff
from Embedding.sen_complex import run_sen_comlex

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def data_process():
    """Load the per-subject question JSON files and run the cleaning stage.

    Each file listed below is parsed with ``json.load`` and stored under its
    subject key. A missing file only produces a warning and is skipped; any
    other error (for example malformed JSON) propagates to the caller.
    The loaded data is then handed to ``clean_question``, ``clean_word`` and
    ``run_data_processed``.
    """
    sources = (
        ("Văn_học", "output/văn.json"),
        ("sử", "output/sử.json"),
        ("địa", "output/địa.json"),
        ("anh", "output/anh_văn.json"),
    )

    loaded = {}
    for name, path in sources:
        try:
            handle = open(path, "r", encoding="utf-8")
        except FileNotFoundError:
            print(f"Warning: File not found - {path}")
            continue
        with handle:
            questions = json.load(handle)
        loaded[name] = questions
        print(f"Loaded {len(questions)} questions from {name}")

    raw_pairs, cleaned_pairs = [], []
    clean_question(raw_pairs, loaded)
    clean_word(cleaned_pairs, raw_pairs)
    # NOTE(review): cleaned_pairs is handed to clean_word but never used
    # afterwards; run_data_processed receives raw_pairs. Confirm this is intended.
    run_data_processed(raw_pairs)

In [4]:
def run_embedding():
    """Run word segmentation and derive the two question JSON files.

    Calls, in order:
      1. ``word_segment`` on the processed text file,
      2. ``ws_question`` on the word-segmented output,
      3. ``ori_question`` on the original (non-segmented) text file.

    Paths are assembled with ``os.path.join`` so they resolve on POSIX as
    well as Windows; on Windows the resulting strings are identical to the
    previous backslash literals.
    """
    ws_dir = os.path.join("Embedding", "Output_ws")
    input_file = os.path.join("data_processed", "Văn_học_processed.txt")
    output_file = os.path.join(ws_dir, "qa_processed_ws.txt")

    print("---------Đang xử lý Word Segment--------------\n")
    word_segment(input_file, output_file)
    print("---------Đã xử lý xong Word Segment----------\n")

    # ------------- post-process the word-segmented data -------------
    print("---------Đang xử lý ws_question--------------\n")
    output_file_ws = os.path.join(ws_dir, "questions.json")
    ws_question(output_file, output_file_ws)
    print("---------Đã xử lý xong ws_question----------\n")

    # ------------- clean the original (non-segmented) data -------------
    print("---------Đang xử lý ori_question--------------\n")
    output_file_ori = os.path.join(ws_dir, "ori_questions.json")
    ori_question(input_file, output_file_ori)
    print("---------Đã xử lý xong ori_question----------\n")


In [5]:
def create_different_feartures():
    """Load PhoBERT and run ``get_diff`` over the word-segmented questions.

    The ``vinai/phobert-base`` tokenizer (slow tokenizer, ``use_fast=False``)
    and model are loaded and passed to ``get_diff`` together with the input
    JSON path and the output features directory.

    Paths are assembled with ``os.path.join`` so they resolve on POSIX as
    well as Windows, and the locals no longer shadow the ``input`` builtin.
    """
    questions_path = os.path.join("Embedding", "Output_ws", "questions.json")
    features_dir = os.path.join("Embedding", "Output_features")

    tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
    model = AutoModel.from_pretrained("vinai/phobert-base")
    print("---------Đang tính toán độ lệch embedding giữa các options-------------\n")
    get_diff(questions_path, features_dir, tokenizer, model)


In [6]:
def load_lit_terms(txt_file="lit_terms.txt"):
    """Read a term list (one term per line) from a UTF-8 text file into a set.

    Every line is stripped of surrounding whitespace and lower-cased; lines
    that are empty after stripping are skipped.

    Args:
        txt_file: Path of the text file to read.

    Returns:
        A ``set`` of the normalized terms.
    """
    collected = set()
    with open(txt_file, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                collected.add(cleaned.lower())
    print(f"📖 Đã load {len(collected)} thuật ngữ từ {txt_file}")
    return collected

def create_sentence_comlex_feartures():
    """Load the language models and run ``run_sen_comlex`` on the original questions.

    Loads PhoBERT (``vinai/phobert-base``) and a Vietnamese GPT-2
    (``NlpHUST/gpt2-vietnamese``), moves both to CUDA when available and puts
    them in eval mode, then calls ``run_sen_comlex`` with the connector set,
    the terms loaded from ``lit_terms.txt`` and a 128-component
    ``TruncatedSVD``.

    The SVD is given a fixed ``random_state``: its default randomized solver
    is otherwise seeded differently on every run, so the derived features
    would not be reproducible. Paths are assembled with ``os.path.join`` so
    they resolve on POSIX as well as Windows.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("⚡ Using device:", device)
    # PhoBERT for embeddings
    phobert_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
    phobert_model = AutoModel.from_pretrained("vinai/phobert-base").to(device)
    phobert_model.eval()
    # Vietnamese GPT-2 for perplexity
    gpt2_tokenizer = AutoTokenizer.from_pretrained("NlpHUST/gpt2-vietnamese")
    gpt2_model = AutoModelForCausalLM.from_pretrained("NlpHUST/gpt2-vietnamese").to(device)
    gpt2_model.eval()

    # NOTE(review): "sau_khi" uses an underscore while every other multi-word
    # connector uses a space (e.g. "trước khi") — confirm which form
    # run_sen_comlex expects.
    connectors = {"và","hoặc","nhưng","tuy nhiên","cũng như là","nên","mà",
              "nếu","thì","bởi vì","vì","khi","mặc dù","để","sau","sau_khi",
              "trước","trước khi","hay","do","do đó","hễ","lẫn","cùng",
              "ngoài ra","vậy","trừ phi","hơn","bằng","như","dù cho",
              "nhằm","vì vậy","ngược lại","chỉ","chỉ trừ"}
    lit_terms = load_lit_terms("lit_terms.txt")
    # Fixed seed so repeated runs produce the same reduced features.
    svd = TruncatedSVD(n_components=128, random_state=42)
    input_file = os.path.join("Embedding", "Output_ws", "ori_questions.json")
    output_file = os.path.join("Embedding", "Output_features", "question_features.csv")
    # NOTE(review): this progress message reads like the options-difference
    # stage's message — confirm the wording is intended for this stage.
    print("------------------------Đang tính toán độ lệch giữa các options------------------------")
    run_sen_comlex(input_file, output_file, connectors, phobert_tokenizer,
                   phobert_model, gpt2_tokenizer, gpt2_model, device, lit_terms, svd)


In [10]:
def merger_data():
    """Inner-join the three feature CSVs on ``id`` and write ``merged.csv``.

    Reads ``noise_features.csv`` and ``question_features.csv`` from
    ``Embedding/Output_features`` and ``van_with_bloom_out.csv`` from
    ``training/data_for_training``, merges them on ``id`` (inner join, so
    only ids present in all three files survive), drops the text/label
    columns listed below and writes the result without the index.

    Paths are assembled with ``os.path.join`` so they resolve on POSIX as
    well as Windows.

    Raises:
        FileNotFoundError: if any input CSV is missing.
        KeyError: if ``id`` or one of the dropped columns is absent.
    """
    print("--------------------------------------Đang kết hợp data--------------------------------------")
    features_dir = os.path.join("Embedding", "Output_features")
    noise_df = pd.read_csv(os.path.join(features_dir, "noise_features.csv"))
    question_df = pd.read_csv(os.path.join(features_dir, "question_features.csv"))
    bloom_df = pd.read_csv(
        os.path.join("training", "data_for_training", "van_with_bloom_out.csv")
    )

    merged = pd.merge(noise_df, question_df, on="id", how="inner")
    merged = pd.merge(merged, bloom_df, on="id", how="inner")
    # "question_x"/"question_y" are the suffixed duplicates produced by the
    # first merge; "question" comes from the third file.
    result = merged.drop(
        columns=["question_x", "answer", "subject", "question_y", "question"]
    )
    result.to_csv("merged.csv", index=False)
    print("Done.........")

In [8]:
def create_lit_terms():
    """Build ``lit_terms.txt`` from the ``Bold Words`` column of two CSV files.

    Reads ``bold_words.csv`` and ``bold_words_2.csv`` (UTF-8 with optional
    BOM), normalizes every entry (strip surrounding whitespace, lower-case),
    removes duplicates while keeping first-seen order, and writes one term
    per line.

    Missing cells (NaN) and entries that are blank after stripping are
    skipped. Previously a NaN cell crashed the write loop because a float has
    no ``strip``. De-duplication now happens after stripping, so ``"abc"``
    and ``"abc "`` no longer both reach the output file.

    Raises:
        FileNotFoundError: if either CSV is missing.
        KeyError: if either CSV has no ``Bold Words`` column.
    """
    source_files = ("bold_words.csv", "bold_words_2.csv")
    # Select the column per file so a missing column still raises KeyError
    # instead of being silently filled with NaN by concat.
    columns = [
        pd.read_csv(path, encoding="utf-8-sig")["Bold Words"] for path in source_files
    ]
    words = pd.concat(columns, ignore_index=True)
    words = words.dropna().astype(str).str.strip().str.lower()
    unique_words = words[words != ""].drop_duplicates(keep="first")

    # Export as plain text, one term per line
    output_file = "lit_terms.txt"
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(f"{word}\n" for word in unique_words)

    print(f"✅ Đã tạo file {output_file}")
    print("🔢 Số lượng từ duy nhất:", len(unique_words))

In [None]:
# Run every pipeline stage once, in dependency order: raw data cleaning,
# word segmentation, term-list creation, the two feature extractors, and
# finally the merge of all feature files.
pipeline_stages = (
    data_process,
    run_embedding,
    create_lit_terms,
    create_different_feartures,
    create_sentence_comlex_feartures,
    merger_data,
)
for stage in pipeline_stages:
    stage()


--------------------------------------Đang kết hợp data--------------------------------------
Done.........
