In [None]:
!pip install PyMuPDF -q

import tensorflow as tf
import numpy as np
import pandas as pd
import fitz
import pickle
import re
import os
from tensorflow.keras.preprocessing.sequence import pad_sequences
from google.colab import files

In [None]:
class Attention(tf.keras.layers.Layer):
    """Attention pooling layer that collapses axis 1 of its input.

    A single projection vector ``W`` scores every position along axis 1,
    a per-position bias ``b`` is added, the scores are squashed with tanh
    and normalised with a softmax over axis 1. The output is the
    softmax-weighted sum of the input along that axis.

    NOTE(review): presumably the input is (batch, steps, features) —
    ``build`` reads ``input_shape[1]`` and ``input_shape[-1]``; confirm.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the projection weight and the per-position bias."""
        n_positions = input_shape[1]
        n_features = input_shape[-1]
        # Weight names and creation order are kept as-is so that weights
        # stored in a previously saved model file still line up.
        self.W = self.add_weight(
            name="att_weight",
            shape=(n_features, 1),
            initializer="normal",
        )
        self.b = self.add_weight(
            name="att_bias",
            shape=(n_positions, 1),
            initializer="zeros",
        )
        super().build(input_shape)

    def call(self, x):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        K = tf.keras.backend
        scores = K.tanh(K.dot(x, self.W) + self.b)
        weights = K.softmax(scores, axis=1)
        weighted_input = weights * x
        return K.sum(weighted_input, axis=1)

In [None]:
print("üìÇ Vui l√≤ng upload c√°c file sau:")
print("1. word_difficulty_bilstm.h5")
print("2. char_vocab.pkl")
print("3. word_tokenizer.pkl")
print("4. scaler.pkl")
print("5. thresholds.pkl")
print("6. WordDifficulty.csv")

uploaded = files.upload()

üìÇ Vui l√≤ng upload c√°c file sau:
1. word_difficulty_final.h5
2. char_vocab.pkl
3. word_tokenizer.pkl
4. scaler.pkl
5. thresholds.pkl
6. WordDifficulty.csv


Saving WordDifficulty.csv to WordDifficulty.csv
Saving char_vocab.pkl to char_vocab.pkl
Saving scaler.pkl to scaler.pkl
Saving thresholds.pkl to thresholds.pkl
Saving word_difficulty_final.h5 to word_difficulty_final.h5
Saving word_tokenizer.pkl to word_tokenizer.pkl


In [None]:
# 1. Load the trained model and its preprocessing artefacts
print("‚è≥ Loading Model & Configs...")


def _read_pickle(path):
    """Deserialize and return the object pickled at ``path``."""
    # NOTE(review): pickle.load can execute arbitrary code; these files come
    # from a manual upload, so only load artefacts from a trusted source.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# compile=False: the model is only used for inference here.
model = tf.keras.models.load_model(
    "word_difficulty_bilstm.h5",
    custom_objects={"Attention": Attention},
    compile=False,
)
char_vocab = _read_pickle("char_vocab.pkl")
word_tokenizer = _read_pickle("word_tokenizer.pkl")
scaler = _read_pickle("scaler.pkl")
threshold_data = _read_pickle("thresholds.pkl")
th1 = threshold_data["th1"]
th2 = threshold_data["th2"]

# --- Build the word -> frequency dictionary from the CSV ---
print("‚è≥ ƒêang t·∫°o t·ª´ ƒëi·ªÉn t·∫ßn su·∫•t t·ª´ WordDifficulty.csv...")
try:
    df_ref = pd.read_csv("WordDifficulty.csv")
except FileNotFoundError:
    # Best-effort fallback: keep the notebook running with an empty lookup.
    print("‚ùå L·ªñI: Kh√¥ng t√¨m th·∫•y file 'WordDifficulty.csv'. Vui l√≤ng upload file n√†y!")
    freq_dict = {}
    avg_freq = 0
else:
    # Normalise dictionary words: lowercase, apostrophes removed.
    df_ref["clean_word"] = df_ref["Word"].str.lower().str.replace("'", "")

    # Mapping {cleaned word: Log_Freq_HAL value}.
    freq_dict = dict(zip(df_ref["clean_word"], df_ref["Log_Freq_HAL"]))

    # Mean frequency, used later for words missing from the dictionary.
    avg_freq = df_ref["Log_Freq_HAL"].mean()
    print(f"‚úÖ ƒê√£ load {len(freq_dict)} t·ª´ v√†o t·ª´ ƒëi·ªÉn t·∫ßn su·∫•t.")
    print(f"üëâ Gi√° tr·ªã t·∫ßn su·∫•t trung b√¨nh (d√πng cho t·ª´ l·∫°): {avg_freq:.4f}")

‚è≥ Loading Model & Configs...
‚è≥ ƒêang t·∫°o t·ª´ ƒëi·ªÉn t·∫ßn su·∫•t t·ª´ WordDifficulty.csv...
‚úÖ ƒê√£ load 40004 t·ª´ v√†o t·ª´ ƒëi·ªÉn t·∫ßn su·∫•t.
üëâ Gi√° tr·ªã t·∫ßn su·∫•t trung b√¨nh (d√πng cho t·ª´ l·∫°): 6.1632


In [None]:
# --- 1. Text processing ---
def clean_token(w):
    """Return ``w`` as a lowercase string with every apostrophe removed.

    Non-string inputs are converted with ``str()`` first.
    """
    lowered = str(w).lower()
    return "".join(ch for ch in lowered if ch != "'")

# --- 2. Syllable counting ---
def count_syllables(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Each run of consecutive vowels (``a e i o u y``) counts once, a trailing
    "e" subtracts one, and a non-empty word never scores below 1.
    Returns 0 for an empty string.
    """
    text = str(word).lower()
    if not text:
        return 0

    vowel_chars = "aeiouy"
    # Pair every character with its predecessor; the leading space stands in
    # for "no previous character" so a vowel in first position is counted.
    total = sum(
        1
        for previous, current in zip(" " + text, text)
        if current in vowel_chars and previous not in vowel_chars
    )

    # A final "e" is treated as silent.
    if text.endswith("e"):
        total -= 1

    if total == 0:
        total = 1
    return total

# --- 3. Feature extraction ---
def extract_manual_features(words):
    """Build the hand-crafted feature matrix for a list of words.

    Each word is cleaned with ``clean_token`` and mapped to five values:
    ``[length, vowel count, syllable count, consonant ratio, frequency]``.
    The frequency is looked up in the module-level ``freq_dict``; words that
    are missing from it, or whose stored value is NaN, get ``avg_freq``.

    Args:
        words: iterable of words (any value accepted by ``clean_token``).

    Returns:
        A float ``numpy`` array of shape ``(len(words), 5)``. An empty input
        yields shape ``(0, 5)`` rather than ``(0,)``.
    """
    # NOTE(review): "y" is not counted as a vowel here although
    # count_syllables treats it as one — presumably this mirrors the
    # features used at training time; confirm before changing.
    vowels = set("aeiou")
    features = []

    for w in words:
        w_clean = clean_token(w)

        length = len(w_clean)
        num_vowels = sum(1 for c in w_clean if c in vowels)
        syllables = count_syllables(w_clean)
        consonant_ratio = (length - num_vowels) / length if length > 0 else 0

        # Frequency lookup: unknown words fall back to the dataset mean.
        freq = freq_dict.get(w_clean, avg_freq)
        # A missing value in the CSV would otherwise turn the model score
        # into NaN, so it is treated like an unknown word.
        if pd.isna(freq):
            freq = avg_freq

        # 5-dim vector: [length, vowels, syllables, consonant ratio, frequency]
        features.append([length, num_vowels, syllables, consonant_ratio, freq])

    # reshape keeps the matrix 2-D even when no words were given.
    return np.array(features, dtype=float).reshape(-1, 5)

# --- 4. Character encoding ---
def words_to_char_seq(words, vocab, maxlen=21):
    """Encode each word as a fixed-length sequence of character ids.

    Characters missing from ``vocab`` are encoded as 0. Sequences are passed
    to ``pad_sequences`` with ``maxlen`` and ``padding="post"``.

    NOTE(review): ``truncating`` is left at the pad_sequences default
    ('pre' per the Keras docs), so words longer than ``maxlen`` would lose
    their leading characters — confirm this matches training.
    """
    encoded_words = []
    for word in words:
        encoded_words.append([vocab.get(ch, 0) for ch in word])
    return pad_sequences(encoded_words, maxlen=maxlen, padding="post")

# --- 5. Classification ---
def classify_score(score, t1, t2):
    """Map a score to a level using two thresholds.

    Returns 0 when ``score < t1``, 1 when ``score < t2`` (and not below
    ``t1``), and 2 in every other case.
    """
    if score < t1:
        return 0
    return 1 if score < t2 else 2

In [None]:
def predict_word_difficulty(words_list):
    """Score a list of words with the loaded model and label each one.

    Words are cleaned with ``clean_token``; words that become empty are
    dropped. The remaining words are turned into the three model inputs
    (character sequence, word index, scaled manual features), scored with
    ``model.predict`` and bucketed with ``classify_score`` using the
    module-level thresholds ``th1`` and ``th2``.

    Args:
        words_list: iterable of raw words.

    Returns:
        A ``pandas.DataFrame`` with columns ``Word``, ``Score``, ``Level``
        (0, 1 or 2) and ``Label`` (the Vietnamese label for that level).
        When no usable word remains, an empty DataFrame with the same
        columns is returned, so callers always get the same type.
    """
    columns = ["Word", "Score", "Level", "Label"]
    level_labels = ["D·ªÖ", "Trung b√¨nh", "Kh√≥"]

    # 1. Clean every word once and drop the ones that end up empty.
    cleaned = (clean_token(w) for w in words_list)
    clean_words = [w for w in cleaned if len(w) > 0]

    if len(clean_words) == 0:
        return pd.DataFrame(columns=columns)

    # 2. Input 1: character sequences
    X_char = words_to_char_seq(clean_words, char_vocab, maxlen=21)

    # 3. Input 2: word indices (0 when the tokenizer yields nothing;
    #    only the first index is used if it yields several)
    X_word_seq = word_tokenizer.texts_to_sequences(clean_words)
    X_word = np.array([seq[0] if len(seq) > 0 else 0 for seq in X_word_seq])

    # 4. Input 3: manual features, scaled with the loaded scaler
    X_man = extract_manual_features(clean_words)
    X_man = scaler.transform(X_man)

    # 5. Predict
    print("ü§ñ ƒêang d·ª± ƒëo√°n...")
    pred_scores = model.predict([X_char, X_word, X_man], verbose=0).flatten()

    # 6. Post-process: one record per word
    results = []
    for w, score in zip(clean_words, pred_scores):
        level = classify_score(score, th1, th2)
        results.append({
            "Word": w,
            "Score": round(float(score), 5),
            "Level": level,
            "Label": level_labels[level],
        })

    return pd.DataFrame(results, columns=columns)

In [None]:
# Quick check on a handful of words, listed from short everyday words
# to longer technical ones.
_short_words = ["cat", "dog", "run"]
_common_long_words = ["beautiful", "university"]
_academic_words = ["photosynthesis", "phenomenon", "extraordinary"]
_technical_words = ["algorithm", "cryptography"]

test_words = _short_words + _common_long_words + _academic_words + _technical_words

df_res = predict_word_difficulty(test_words)
print("\nK·∫æT QU·∫¢ D·ª∞ ƒêO√ÅN:")
print(df_res)

ü§ñ ƒêang d·ª± ƒëo√°n...

K·∫æT QU·∫¢ D·ª∞ ƒêO√ÅN:
             Word    Score  Level       Label
0             cat  0.10511      0          D·ªÖ
1             dog  0.00154      0          D·ªÖ
2             run -0.00443      0          D·ªÖ
3       beautiful  0.51009      1  Trung b√¨nh
4      university  0.51420      1  Trung b√¨nh
5  photosynthesis  1.59437      2         Kh√≥
6      phenomenon  0.89195      1  Trung b√¨nh
7   extraordinary  1.03244      1  Trung b√¨nh
8       algorithm  0.50298      1  Trung b√¨nh
9    cryptography  1.16305      1  Trung b√¨nh


In [None]:
# Upload a PDF; the first uploaded file is the one analysed.
uploaded_pdf = files.upload()
if not uploaded_pdf:
    # A cancelled upload used to surface as a bare IndexError.
    raise ValueError("No PDF file was uploaded.")
pdf_path = next(iter(uploaded_pdf))

# Read the text of every page
print(f"üìÑ ƒêang ƒë·ªçc file: {pdf_path}")
with fitz.open(pdf_path) as doc:
    # join() avoids rebuilding the string once per page.
    text = "".join(page.get_text() for page in doc)

# Tokenize: keep runs of letters (with inner apostrophes) so punctuation,
# digits and symbols attached to a word do not reach the model or the
# frequency lookup.
all_words = re.findall(r"[^\W\d_]+(?:'[^\W\d_]+)*", text)
print(f"T√¨m th·∫•y {len(all_words)} t·ª´. ƒêang x·ª≠ l√Ω...")

# Predict
df_final = predict_word_difficulty(all_words)
if len(df_final) == 0:
    # Nothing to summarise or save (e.g. a PDF without a text layer).
    raise ValueError("No words could be extracted from the uploaded PDF.")

# Show label statistics
print("\n--- TH·ªêNG K√ä ---")
print(df_final["Label"].value_counts())

# Save the results as CSV and offer the file for download
csv_name = "pdf_difficulty_results.csv"
df_final.to_csv(csv_name, index=False)
print(f"\nüíæ ƒê√£ l∆∞u k·∫øt qu·∫£ v√†o: {csv_name}")
files.download(csv_name)

# Display the first 20 rows
df_final.head(20)

Saving srs_template.pdf to srs_template.pdf
üìÑ ƒêang ƒë·ªçc file: srs_template.pdf
T√¨m th·∫•y 2024 t·ª´. ƒêang x·ª≠ l√Ω...
ü§ñ ƒêang d·ª± ƒëo√°n...

--- TH·ªêNG K√ä ---
Label
D·ªÖ            1544
Trung b√¨nh     312
Kh√≥             34
Name: count, dtype: int64

üíæ ƒê√£ l∆∞u k·∫øt qu·∫£ v√†o: pdf_difficulty_results.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,Word,Score,Level,Label
0,copyright,0.415,0,D·ªÖ
1,by,0.0725,0,D·ªÖ
2,karl,0.59751,1,Trung b√¨nh
3,e,1.66703,2,Kh√≥
4,wiegers,0.59803,1,Trung b√¨nh
5,permission,0.27713,0,D·ªÖ
6,is,-0.1876,0,D·ªÖ
7,granted,0.02877,0,D·ªÖ
8,to,-0.1211,0,D·ªÖ
9,use,0.02912,0,D·ªÖ
