In [1]:
!pip install fasttext==0.9.2 PyMuPDF -q

from google.colab import files
import tensorflow as tf
import fasttext
import numpy as np
import pandas as pd
import fitz
import pickle

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/68.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m119.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for fasttext (setup.py) ... [?25l[?25hdone


In [7]:
class Attention(tf.keras.layers.Layer):
    """Attention pooling that collapses axis 1 of its input.

    Every position along axis 1 is scored with ``tanh(x . W + b)``, the scores
    are normalised with a softmax over axis 1, and the input is summed over
    axis 1 using those weights.
    Presumably the input is (batch, steps, features) - TODO confirm.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the scoring kernel ``W`` and the bias ``b``.

        ``input_shape[-1]`` sizes the kernel and ``input_shape[1]`` sizes the
        bias, so both dimensions must be known when the layer is built.
        """
        feature_dim = input_shape[-1]
        num_steps = input_shape[1]
        # Weight names and creation order are kept exactly as before, in case
        # previously saved weights are matched against them.
        self.W = self.add_weight(
            name="att_weight",
            shape=(feature_dim, 1),
            initializer="normal",
        )
        self.b = self.add_weight(
            name="att_bias",
            shape=(num_steps, 1),
            initializer="zeros",
        )
        super().build(input_shape)

    def call(self, x):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        K = tf.keras.backend
        scores = K.tanh(K.dot(x, self.W) + self.b)
        weights = K.softmax(scores, axis=1)
        return K.sum(weights * x, axis=1)

def _pick_uploaded(uploaded_files, suffix, default):
    """Return the last uploaded file name ending in ``suffix``, else ``default``."""
    matches = [name for name in uploaded_files if name.endswith(suffix)]
    return matches[-1] if matches else default


# Prompt for the trained model (.h5) and the character vocabulary (.pkl).
uploaded = files.upload()

# Colab stores a re-uploaded file under a new name (e.g. "name (1).ext"), so
# loading by a fixed name would silently keep using the stale copy. Prefer the
# names reported by this upload and fall back to the original fixed names when
# nothing matching was uploaded (files already present in the runtime).
model_path = _pick_uploaded(uploaded, ".h5", "word_difficulty_hybrid_fasttext.h5")
vocab_path = _pick_uploaded(uploaded, ".pkl", "char_vocab.pkl")

# The custom Attention layer must be registered for deserialisation.
model = tf.keras.models.load_model(
    model_path,
    compile=False,
    custom_objects={"Attention": Attention}
)

# Loaded with compile=False above, so attach optimizer and loss explicitly.
model.compile(optimizer=tf.keras.optimizers.Adam(1e-4), loss="mse")

# SECURITY: pickle.load can execute arbitrary code while unpickling.
# Only upload a vocabulary file that comes from a trusted source.
with open(vocab_path, "rb") as f:
    char_vocab = pickle.load(f)

# Character sequence length fed to the model; must match how it was trained.
maxlen = 21
# +1 presumably reserves index 0, which words_to_char_seq uses for characters
# that are missing from the vocabulary - TODO confirm against training code.
vocab_size = len(char_vocab) + 1

print("Model & Vocabulary loaded successfully!")

Saving char_vocab.pkl to char_vocab.pkl
Saving word_difficulty_hybrid_fasttext.h5 to word_difficulty_hybrid_fasttext.h5
✅ Model & Vocabulary loaded successfully!


In [8]:
# @title NAM chạy cell này, skip cell dưới

!wget -c https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
!gunzip -f cc.en.300.bin.gz

ft_model = fasttext.load_model("cc.en.300.bin")
print("Loaded FastText successfully!")

--2025-10-25 08:33:10--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 18.154.144.74, 18.154.144.87, 18.154.144.102, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|18.154.144.74|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4503593528 (4.2G) [application/octet-stream]
Saving to: ‘cc.en.300.bin.gz’


2025-10-25 08:33:37 (162 MB/s) - ‘cc.en.300.bin.gz’ saved [4503593528/4503593528]

Loaded FastText successfully!




In [12]:
import re, string
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# --- CLEAN WORD ---
def clean_token(w):
    """Lower-case ``w`` and keep only the characters ``a``-``z``.

    Everything else (digits, punctuation, apostrophes, accented letters,
    whitespace) is removed, so the result may be an empty string.
    """
    lowered = w.lower()
    # Apostrophes are outside a-z, so this one substitution strips them too.
    return re.sub(r"[^a-z]", "", lowered)

# --- CHAR ENCODING ---
def words_to_char_seq(words, vocab, maxlen=21):
    """Encode each word as a row of character ids of length ``maxlen``.

    Characters that are missing from ``vocab`` are encoded as 0. Rows are
    padded at the end (``padding="post"``); over-long words are truncated
    according to the ``pad_sequences`` default.
    """
    encoded = []
    for word in words:
        encoded.append([vocab.get(ch, 0) for ch in word])
    return pad_sequences(encoded, maxlen=maxlen, padding="post")

# --- FASTTEXT ENCODING ---
def words_to_vectors(words, ft_model):
    """Stack ``ft_model.get_word_vector(word)`` for every word into one array.

    The rows follow the order of ``words``.
    """
    vectors = []
    for word in words:
        vectors.append(ft_model.get_word_vector(word))
    return np.array(vectors)

# --- OPTIONAL: extract text from PDF ---
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, joined in page order."""
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

# --- ORDINAL DECODE (VERY IMPORTANT) ---
def ordinal_decode(p, t1=0.5, t2=1.5):
    """Map a continuous score ``p`` to an ordinal level.

    Returns 0 when ``p < t1``, 1 when ``t1 <= p < t2``, and 2 otherwise.
    """
    if p < t1:
        return 0
    return 1 if p < t2 else 2

# --- PREDICT DIFFICULTY ---
def predict_word_difficulty(words, t1=0.5, t2=1.5):
    """Predict a difficulty level for every usable word in ``words``.

    Each word is normalised with ``clean_token``; words that become empty are
    dropped. The remaining words are encoded as character ids and FastText
    vectors and passed to the module-level ``model``.

    Args:
        words: iterable of raw word strings.
        t1: scores below this threshold decode to level 0.
        t2: scores below this threshold (and at least ``t1``) decode to level 1;
            anything else decodes to level 2.

    Returns:
        A list of ``(clean_word, level, score)`` tuples in input order, where
        ``score`` is the raw model output rounded to 3 decimals. The list is
        empty when no word survives cleaning.
    """
    # Clean each token exactly once, then drop the ones that end up empty.
    clean_words = [cleaned for cleaned in map(clean_token, words) if cleaned]

    if not clean_words:
        return []

    X_char = words_to_char_seq(clean_words, char_vocab, maxlen=maxlen)
    X_ft = words_to_vectors(clean_words, ft_model)

    # One score per word; flatten() turns the model output into a 1-D array.
    preds = model.predict([X_char, X_ft], verbose=0).flatten()

    return [
        (w, ordinal_decode(p, t1, t2), round(float(p), 3))
        for w, p in zip(clean_words, preds)
    ]

In [13]:
# Cell 6: quick sanity check on a handful of sample words
test_words = [
    "cat", "university", "philosophical", "run",
    "beautiful", "extraordinary", "photosynthesis", "algorithm",
]

results = predict_word_difficulty(test_words)

# Each result is (word, level, score). The value printed as "Prob" is the
# rounded raw model output, so it is not limited to the [0, 1] range.
for word, level, prob in results:
    print(f"{word:15s} → Level {level} → Prob {prob}")

cat             → Level 0 → Prob -0.067
university      → Level 1 → Prob 0.502
philosophical   → Level 1 → Prob 1.362
run             → Level 0 → Prob -0.077
beautiful       → Level 0 → Prob 0.107
extraordinary   → Level 1 → Prob 1.22
photosynthesis  → Level 2 → Prob 1.522
algorithm       → Level 1 → Prob 1.132


In [14]:
uploaded_pdf = files.upload()

# Nothing uploaded (e.g. the dialog was cancelled): stop with a clear message
# instead of the IndexError that indexing an empty key list would raise.
if not uploaded_pdf:
    raise ValueError("No PDF was uploaded; run this cell again and choose a file.")

# Only the first uploaded file is processed.
pdf_path = next(iter(uploaded_pdf))

print("📄 Reading file:", pdf_path)
pdf_text = extract_text_from_pdf(pdf_path)
# Whitespace tokenisation; predict_word_difficulty cleans each token itself.
words = pdf_text.split()

predictions = predict_word_difficulty(words)
# The "Probabilities" column holds the rounded raw model output for each word.
df = pd.DataFrame(predictions, columns=["Word", "Predicted_Level", "Probabilities"])

print("Finish predicting! Listing first 20 rows:")
df.head(20)

Saving srs_template.pdf to srs_template (1).pdf
📄 Reading file: srs_template (1).pdf
Finish predicting! Listing first 20 rows:


Unnamed: 0,Word,Predicted_Level,Probabilities
0,copyright,1,0.532
1,by,0,0.047
2,karl,0,0.274
3,e,1,1.047
4,wiegers,2,1.652
5,permission,1,0.655
6,is,0,0.088
7,granted,0,0.29
8,to,0,0.14
9,use,0,-0.239


In [15]:
# Write the predictions to a CSV file (row index omitted) and hand it to the
# browser for download.
csv_name = "pdf_word_difficulty.csv"
df.to_csv(csv_name, index=False)
files.download(csv_name)
print("Saved successfully!")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Saved successfully!
