In [23]:
import nltk
from nltk.tag import HiddenMarkovModelTrainer
from nltk.corpus import treebank

# If first time:
nltk.download('treebank')
nltk.download('universal_tagset')

[nltk_data] Downloading package treebank to
[nltk_data]     /home/pityudhistira28/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/pityudhistira28/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [24]:
import os
from dotenv import load_dotenv

load_dotenv()

os.environ["HF_TOKEN"] = os.environ.get('HF_TOKEN')

In [36]:
# HMM POS tagging with NLTK
# ---------------------------------

# Prepare data: use Treebank (or any tagged corpus). Convert to universal tags for simplicity
tagged_sents = treebank.tagged_sents(tagset='universal')  # list of list of (word, tag)
# Use small subset for quick experiment
train_sents = tagged_sents[:3000]
test_sents  = tagged_sents[3000:3200]

# Convert to format expected by trainer: list of (list_of_words, list_of_tags)
trainer = HiddenMarkovModelTrainer()

# NLTK HMM expects sequences as list of (token, tag) pairs; we can give trainer the list directly
hmm_tagger = trainer.train_supervised(train_sents)

# Evaluate
accuracy = hmm_tagger.accuracy(test_sents)
print("HMM tagger accuracy (universal tagset):", accuracy)

# Tagging example
result = hmm_tagger.tag("The quick brown fox jumps over the lazy dog .".split())

HMM tagger accuracy (universal tagset): 0.5524850484636007


In [40]:
for r in result:
    print(f"{r[0]} {r[1]}")

The DET
quick ADJ
brown NOUN
fox NOUN
jumps NOUN
over NOUN
the NOUN
lazy NOUN
dog NOUN
. NOUN


In [58]:
buast = hmm_tagger.tag("The young researcher presented her findings at the international conference. Later, she answered questions confidently from the audience. ".split())
for r in buast:
    print(f"{r[1]}")

DET
ADJ
NOUN
VERB
PRON
NOUN
ADP
DET
ADJ
NOUN
NOUN
NOUN
NOUN
NOUN
NOUN
NOUN
NOUN
NOUN


In [5]:
print(hmm_tagger.tag(
    "They took the upward path, through the still silence, steep and dark, shadowy with dense fog, drawing near to the threshold of the upper world. Afraid she was no longer there, and eager to see her, the lover turned his eyes. In an instant she dropped back, and he, unhappy man, stretching out his arms to hold her and be held, clutched at nothing but the receding air. Dying a second time, now, there was no complaint to her husband (what, then, could she complain of, except that she had been loved?). She spoke a last ‘farewell’ that, now, scarcely reached his ears, and turned again towards that same place."
.split()))

[('They', 'PRON'), ('took', 'VERB'), ('the', 'DET'), ('upward', 'ADJ'), ('path,', 'NOUN'), ('through', 'NOUN'), ('the', 'NOUN'), ('still', 'NOUN'), ('silence,', 'NOUN'), ('steep', 'NOUN'), ('and', 'NOUN'), ('dark,', 'NOUN'), ('shadowy', 'NOUN'), ('with', 'NOUN'), ('dense', 'NOUN'), ('fog,', 'NOUN'), ('drawing', 'NOUN'), ('near', 'NOUN'), ('to', 'NOUN'), ('the', 'NOUN'), ('threshold', 'NOUN'), ('of', 'NOUN'), ('the', 'NOUN'), ('upper', 'NOUN'), ('world.', 'NOUN'), ('Afraid', 'NOUN'), ('she', 'NOUN'), ('was', 'NOUN'), ('no', 'NOUN'), ('longer', 'NOUN'), ('there,', 'NOUN'), ('and', 'NOUN'), ('eager', 'NOUN'), ('to', 'NOUN'), ('see', 'NOUN'), ('her,', 'NOUN'), ('the', 'NOUN'), ('lover', 'NOUN'), ('turned', 'NOUN'), ('his', 'NOUN'), ('eyes.', 'NOUN'), ('In', 'NOUN'), ('an', 'NOUN'), ('instant', 'NOUN'), ('she', 'NOUN'), ('dropped', 'NOUN'), ('back,', 'NOUN'), ('and', 'NOUN'), ('he,', 'NOUN'), ('unhappy', 'NOUN'), ('man,', 'NOUN'), ('stretching', 'NOUN'), ('out', 'NOUN'), ('his', 'NOUN'), ('

In [42]:
# CRF POS tagging example
# ---------------------------------
import sklearn
import numpy as np
from sklearn_crfsuite import CRF, metrics
from nltk.corpus import treebank
from sklearn.model_selection import train_test_split

# Prepare data (universal tagset)
tagged_sents = treebank.tagged_sents(tagset='universal')
# Use moderate subset
train_sents = tagged_sents[:3000]
test_sents  = tagged_sents[3000:3200]

# Feature extractor for a sentence
def word2features(sent, i):
    word = sent[i][0]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'suffix(3)': word[-3:],
        'suffix(2)': word[-2:],
        'prefix(1)': word[:1],
    }
    if i > 0:
        prev_word = sent[i-1][0]
        features.update({
            '-1:word.lower()': prev_word.lower(),
            '-1:word.istitle()': prev_word.istitle(),
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        next_word = sent[i+1][0]
        features.update({
            '+1:word.lower()': next_word.lower(),
            '+1:word.istitle()': next_word.istitle(),
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [tag for _, tag in sent]

X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]
X_test  = [sent2features(s) for s in test_sents]
y_test  = [sent2labels(s) for s in test_sents]

crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

y_pred = crf.predict(X_test)
labels = list(crf.classes_)
print("CRF token-level F1 (micro):", metrics.flat_f1_score(y_test, y_pred, average='micro', labels=labels))
print(metrics.flat_classification_report(y_test, y_pred, labels=labels))

CRF token-level F1 (micro): 0.9684471024953598
              precision    recall  f1-score   support

        NOUN       0.96      0.97      0.96      1472
           .       1.00      1.00      1.00       538
         NUM       1.00      0.97      0.98       303
         ADJ       0.86      0.84      0.85       328
        VERB       0.96      0.96      0.96       603
         DET       0.99      1.00      1.00       373
         ADP       0.98      0.99      0.99       514
        CONJ       1.00      1.00      1.00        97
           X       1.00      1.00      1.00       264
         ADV       0.93      0.90      0.92       126
         PRT       0.99      0.99      0.99       168
        PRON       1.00      1.00      1.00        63

    accuracy                           0.97      4849
   macro avg       0.97      0.97      0.97      4849
weighted avg       0.97      0.97      0.97      4849



In [52]:
# Suppose you already have a word2features function defined, like:
def word2features(sent, i):
    word = sent[i]
    features = {
        'word.lower()': word.lower(),
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'bias': 1.0,
    }
    if i > 0:
        features.update({
            '-1:word.lower()': sent[i-1].lower(),
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        features.update({
            '+1:word.lower()': sent[i+1].lower(),
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

In [53]:
# Example sentence
sentence = "The young researcher presented her findings at the international conference. Later, she answered questions confidently from the audience.".split()

# Convert to CRF features
X_new = [sent2features(sentence)]

# Predict
y_pred = crf.predict(X_new)
print(list(zip(sentence, y_pred[0])))

[('The', 'DET'), ('young', 'ADJ'), ('researcher', 'NOUN'), ('presented', 'VERB'), ('her', 'PRON'), ('findings', 'NOUN'), ('at', 'ADP'), ('the', 'ADJ'), ('international', 'NOUN'), ('conference.', 'NOUN'), ('Later,', 'NOUN'), ('she', 'NOUN'), ('answered', 'VERB'), ('questions', 'ADJ'), ('confidently', 'NOUN'), ('from', 'ADP'), ('the', 'DET'), ('audience.', 'NOUN')]


In [59]:
result = list(zip(sentence, y_pred[0]))

for r in result:
    print(f"{r[1]}")

DET
ADJ
NOUN
VERB
PRON
NOUN
ADP
ADJ
NOUN
NOUN
NOUN
NOUN
VERB
ADJ
NOUN
ADP
DET
NOUN


In [76]:
sentence = """
They took the upward path, through the still silence, steep and dark, shadowy with dense fog, drawing near to the threshold of the upper world. 
Afraid she was no longer there, and eager to see her, the lover turned his eyes. 
In an instant she dropped back, and he, unhappy man, stretching out his arms to hold her and be held, clutched at nothing but the receding air.
Dying a second time, now, there was no complaint to her husband (what, then, could she complain of, except that she had been loved?). 
She spoke a last ‘farewell’ that, now, scarcely reached his ears, and turned again towards that same place.
""".split()

# Convert to CRF features
X_new = [sent2features(sentence)]

# Predict
y_pred = crf.predict(X_new)
result = list(zip(sentence, y_pred[0]))

for r in result:
    print(f"{r[1]}")

PRON
VERB
NOUN
NOUN
NOUN
ADP
NOUN
ADV
ADJ
NOUN
CONJ
ADJ
NOUN
ADP
ADJ
NOUN
NOUN
NOUN
PRT
DET
NOUN
ADP
DET
NOUN
NOUN
NOUN
NOUN
VERB
ADV
ADJ
NOUN
CONJ
ADJ
NOUN
VERB
ADJ
NOUN
NOUN
VERB
PRON
NOUN
ADP
DET
NOUN
NOUN
VERB
NOUN
CONJ
ADJ
NOUN
NOUN
NOUN
VERB
ADJ
NOUN
PRT
VERB
PRON
CONJ
VERB
ADJ
NOUN
ADP
NOUN
CONJ
DET
NOUN
NOUN
NOUN
DET
ADJ
NOUN
NOUN
NOUN
VERB
ADJ
NOUN
PRT
PRON
NOUN
NOUN
NOUN
NOUN
VERB
ADJ
NOUN
NOUN
ADP
PRON
VERB
VERB
ADJ
NOUN
VERB
DET
ADJ
NOUN
NOUN
NOUN
NOUN
VERB
PRON
NOUN
CONJ
ADJ
NOUN
NOUN
ADP
ADJ
NOUN


In [79]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load a pretrained POS tagging model
model_name = "AndyLiang12/bert-finetuned-pos"  # already fine-tuned on POS tagging
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Create pipeline
nlp = pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Example text
text = "The young researcher presented her findings at the international conference. Later, she answered questions confidently from the audience."

# Run POS tagging
results = nlp(text)

for r in results:
    print(f"{r['word']:15} -> {r['entity_group']} (score={r['score']:.2f})")

Device set to use cuda:0


the             -> DET (score=1.00)
young           -> ADJ (score=0.99)
researcher      -> NOUN (score=1.00)
presented       -> VERB (score=1.00)
her             -> PRON (score=1.00)
findings        -> NOUN (score=1.00)
at              -> ADP (score=1.00)
the             -> DET (score=1.00)
international   -> PROPN (score=0.69)
conference      -> NOUN (score=0.54)
.               -> PUNCT (score=1.00)
later           -> ADV (score=1.00)
,               -> PUNCT (score=1.00)
she             -> PRON (score=1.00)
answered        -> VERB (score=1.00)
questions       -> NOUN (score=1.00)
confidently     -> ADV (score=1.00)
from            -> ADP (score=1.00)
the             -> DET (score=1.00)
audience        -> NOUN (score=1.00)
.               -> PUNCT (score=1.00)


In [49]:
text = "The quick brown fox jumps over the lazy dog."

# Run POS tagging
results = nlp(text)

for r in results:
    print(f"{r['word']:15} -> {r['entity_group']} (score={r['score']:.2f})")
    # print(r['entity_group'])

the             -> DET (score=1.00)
quick brown     -> ADJ (score=0.99)
fox             -> NOUN (score=0.99)
jumps           -> VERB (score=0.99)
over            -> ADP (score=1.00)
the             -> DET (score=1.00)
lazy            -> ADJ (score=1.00)
dog             -> NOUN (score=1.00)
.               -> PUNCT (score=1.00)


In [80]:
sentence = """
They took the upward path, through the still silence, steep and dark, shadowy with dense fog, drawing near to the threshold of the upper world. 
Afraid she was no longer there, and eager to see her, the lover turned his eyes. 
In an instant she dropped back, and he, unhappy man, stretching out his arms to hold her and be held, clutched at nothing but the receding air.
Dying a second time, now, there was no complaint to her husband (what, then, could she complain of, except that she had been loved?). 
She spoke a last ‘farewell’ that, now, scarcely reached his ears, and turned again towards that same place.
"""

# Run POS tagging
results = nlp(sentence)

for r in results:
    print(f"{r['word']:15} -> {r['entity_group']} (score={r['score']:.2f})")

they            -> PRON (score=1.00)
took            -> VERB (score=1.00)
the             -> DET (score=1.00)
upward          -> ADJ (score=0.92)
path            -> NOUN (score=1.00)
,               -> PUNCT (score=1.00)
through         -> ADP (score=1.00)
the             -> DET (score=1.00)
still           -> ADV (score=0.99)
silence         -> NOUN (score=0.89)
,               -> PUNCT (score=1.00)
steep           -> ADJ (score=1.00)
and             -> CCONJ (score=1.00)
dark            -> ADJ (score=1.00)
,               -> PUNCT (score=1.00)
shadowy         -> ADJ (score=1.00)
with            -> ADP (score=1.00)
dense           -> ADJ (score=0.99)
fog             -> NOUN (score=1.00)
,               -> PUNCT (score=1.00)
drawing         -> VERB (score=1.00)
near            -> ADV (score=0.97)
to              -> ADP (score=1.00)
the             -> DET (score=1.00)
threshold       -> NOUN (score=1.00)
of              -> ADP (score=1.00)
the             -> DET (score=1.00)
upper      

In [77]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load a pretrained POS tagging model
model_name = "wietsedv/xlm-roberta-base-ft-udpos28-en"  # already fine-tuned on POS tagging
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Create pipeline
nlp = pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Example text
text = "The young researcher presented her findings at the international conference. Later, she answered questions confidently from the audience."

# Run POS tagging
results = nlp(text)

for r in results:
    print(f"{r['word']:15} -> {r['entity_group']} (score={r['score']:.2f})")

Device set to use cuda:0


The             -> DET (score=1.00)
young           -> ADJ (score=0.99)
researcher      -> NOUN (score=1.00)
presented       -> VERB (score=1.00)
her             -> PRON (score=0.97)
findings        -> NOUN (score=0.84)
at              -> ADP (score=1.00)
the             -> DET (score=1.00)
international   -> ADJ (score=1.00)
conference      -> NOUN (score=1.00)
.               -> PUNCT (score=0.88)
Later           -> ADV (score=0.99)
,               -> PUNCT (score=0.91)
she             -> PRON (score=1.00)
answered        -> VERB (score=1.00)
questions       -> NOUN (score=1.00)
confidently     -> ADV (score=0.94)
from            -> ADP (score=1.00)
the             -> DET (score=1.00)
audience        -> NOUN (score=1.00)
.               -> PUNCT (score=0.98)


In [65]:
for r in results:
    # print(f"{r['word']:15} -> {r['entity_group']} (score={r['score']:.2f})")
    print(r['entity_group'])

DET
ADJ
NOUN
VERB
PRON
NOUN
ADP
DET
PROPN
NOUN
PUNCT
ADV
PUNCT
PRON
VERB
NOUN
ADV
ADP
DET
NOUN
PUNCT


In [51]:
text = "The quick brown fox jumps over the lazy dog."

# Run POS tagging
results = nlp(text)

for r in results:
    print(f"{r['word']:15} -> {r['entity_group']} (score={r['score']:.2f})")

The             -> DET (score=1.00)
quick brown     -> ADJ (score=0.99)
fox             -> NOUN (score=1.00)
jumps           -> VERB (score=0.99)
over            -> ADP (score=0.98)
the             -> DET (score=1.00)
lazy            -> ADJ (score=0.99)
dog             -> NOUN (score=1.00)
.               -> PUNCT (score=0.97)


In [78]:
sentence = """
They took the upward path, through the still silence, steep and dark, shadowy with dense fog, drawing near to the threshold of the upper world. 
Afraid she was no longer there, and eager to see her, the lover turned his eyes. 
In an instant she dropped back, and he, unhappy man, stretching out his arms to hold her and be held, clutched at nothing but the receding air.
Dying a second time, now, there was no complaint to her husband (what, then, could she complain of, except that she had been loved?). 
She spoke a last ‘farewell’ that, now, scarcely reached his ears, and turned again towards that same place.
"""

# Run POS tagging
results = nlp(sentence)

for r in results:
    print(f"{r['word']:15} -> {r['entity_group']}")

They            -> PRON
took            -> VERB
the             -> DET
upward          -> ADJ
path            -> NOUN
,               -> PUNCT
through         -> ADP
the             -> DET
still           -> ADJ
silence         -> NOUN
,               -> PUNCT
steep           -> ADJ
and             -> CCONJ
dark            -> ADJ
,               -> CCONJ
shadowy         -> ADJ
with            -> ADP
dense           -> ADJ
fog             -> NOUN
,               -> PUNCT
drawing         -> VERB
near            -> ADV
to              -> ADP
the             -> DET
threshold       -> NOUN
of              -> ADP
the             -> DET
upper           -> ADJ
world           -> NOUN
.               -> PUNCT
Afraid          -> ADJ
she             -> PRON
was             -> AUX
no longer there -> ADV
,               -> PUNCT
and             -> CCONJ
eager           -> ADJ
to              -> PART
see             -> VERB
her             -> PRON
,               -> PUNCT
the             -> DET
love