## Cargar el corpus

In [10]:
import nltk.chunk
import nltk
from nltk.corpus import conll2000

# Cargar conjunto de entrenamiento y pruebas, en formato conlleval (árbol de etiquetas)
conll_train = conll2000.chunked_sents('train.txt')
conll_test = conll2000.chunked_sents('test.txt')
print(conll_train[0])

# Convertir los arboles de etiquetas a lista de listas con etiquetas
train_chunks = [nltk.chunk.tree2conlltags(tree) for tree in conll_train]
test_chunks = [nltk.chunk.tree2conlltags(tree) for tree in conll_test]

# Datos de entrenamiento y pruebas para los chunkers en forma de lista con etiquitas IOB
train=[[(t, c) for (w, t, c) in chunk_tags] for chunk_tags in train_chunks]
test= [[(t, c) for (w, t, c) in chunk_tags] for chunk_tags in test_chunks]


print(test_chunks[0])

(S
  (NP Confidence/NN)
  (PP in/IN)
  (NP the/DT pound/NN)
  (VP is/VBZ widely/RB expected/VBN to/TO take/VB)
  (NP another/DT sharp/JJ dive/NN)
  if/IN
  (NP trade/NN figures/NNS)
  (PP for/IN)
  (NP September/NNP)
  ,/,
  due/JJ
  (PP for/IN)
  (NP release/NN)
  (NP tomorrow/NN)
  ,/,
  (VP fail/VB to/TO show/VB)
  (NP a/DT substantial/JJ improvement/NN)
  (PP from/IN)
  (NP July/NNP and/CC August/NNP)
  (NP 's/POS near-record/JJ deficits/NNS)
  ./.)
[('Rockwell', 'NNP', 'B-NP'), ('International', 'NNP', 'I-NP'), ('Corp.', 'NNP', 'I-NP'), ("'s", 'POS', 'B-NP'), ('Tulsa', 'NNP', 'I-NP'), ('unit', 'NN', 'I-NP'), ('said', 'VBD', 'B-VP'), ('it', 'PRP', 'B-NP'), ('signed', 'VBD', 'B-VP'), ('a', 'DT', 'B-NP'), ('tentative', 'JJ', 'I-NP'), ('agreement', 'NN', 'I-NP'), ('extending', 'VBG', 'B-VP'), ('its', 'PRP$', 'B-NP'), ('contract', 'NN', 'I-NP'), ('with', 'IN', 'B-PP'), ('Boeing', 'NNP', 'B-NP'), ('Co.', 'NNP', 'I-NP'), ('to', 'TO', 'B-VP'), ('provide', 'VB', 'I-VP'), ('structural', 'JJ

## Cargar etiquetador de la Pr2

In [2]:
# Cargar frases para entrenar el tagger
tagger_train_sentences = train_chunks
for i in range(len(tagger_train_sentences)):
    tagger_train_sentences[i] = [(w, t) for (w, t, c) in tagger_train_sentences[i]]

print(tagger_train_sentences[0])

[('Confidence', 'NN'), ('in', 'IN'), ('the', 'DT'), ('pound', 'NN'), ('is', 'VBZ'), ('widely', 'RB'), ('expected', 'VBN'), ('to', 'TO'), ('take', 'VB'), ('another', 'DT'), ('sharp', 'JJ'), ('dive', 'NN'), ('if', 'IN'), ('trade', 'NN'), ('figures', 'NNS'), ('for', 'IN'), ('September', 'NNP'), (',', ','), ('due', 'JJ'), ('for', 'IN'), ('release', 'NN'), ('tomorrow', 'NN'), (',', ','), ('fail', 'VB'), ('to', 'TO'), ('show', 'VB'), ('a', 'DT'), ('substantial', 'JJ'), ('improvement', 'NN'), ('from', 'IN'), ('July', 'NNP'), ('and', 'CC'), ('August', 'NNP'), ("'s", 'POS'), ('near-record', 'JJ'), ('deficits', 'NNS'), ('.', '.')]


In [3]:
tagger_train_sentences_simplified = []
for sentence in tagger_train_sentences:
    new_sentence = []
    
    for taged_word in sentence:
        word, tag = taged_word
        main_category = tag[0]

        if word == '*0*':
            continue

        match main_category:
            case 'v':   # verbo
                new_tag = tag[:3]
            case 'F':   # signo de puntuación
                new_tag = tag[:3]
            case _:     # otro
                new_tag = tag[:2]

        new_sentence.append((word, new_tag))

    tagger_train_sentences_simplified.append(new_sentence)

print(tagger_train_sentences[0])
print(tagger_train_sentences_simplified[0])

[('Confidence', 'NN'), ('in', 'IN'), ('the', 'DT'), ('pound', 'NN'), ('is', 'VBZ'), ('widely', 'RB'), ('expected', 'VBN'), ('to', 'TO'), ('take', 'VB'), ('another', 'DT'), ('sharp', 'JJ'), ('dive', 'NN'), ('if', 'IN'), ('trade', 'NN'), ('figures', 'NNS'), ('for', 'IN'), ('September', 'NNP'), (',', ','), ('due', 'JJ'), ('for', 'IN'), ('release', 'NN'), ('tomorrow', 'NN'), (',', ','), ('fail', 'VB'), ('to', 'TO'), ('show', 'VB'), ('a', 'DT'), ('substantial', 'JJ'), ('improvement', 'NN'), ('from', 'IN'), ('July', 'NNP'), ('and', 'CC'), ('August', 'NNP'), ("'s", 'POS'), ('near-record', 'JJ'), ('deficits', 'NNS'), ('.', '.')]
[('Confidence', 'NN'), ('in', 'IN'), ('the', 'DT'), ('pound', 'NN'), ('is', 'VB'), ('widely', 'RB'), ('expected', 'VB'), ('to', 'TO'), ('take', 'VB'), ('another', 'DT'), ('sharp', 'JJ'), ('dive', 'NN'), ('if', 'IN'), ('trade', 'NN'), ('figures', 'NN'), ('for', 'IN'), ('September', 'NN'), (',', ','), ('due', 'JJ'), ('for', 'IN'), ('release', 'NN'), ('tomorrow', 'NN'), (

In [4]:
from nltk.tag import hmm
tagger_hmm = hmm.HiddenMarkovModelTagger.train(tagger_train_sentences_simplified)

## Etiquetar y formatear la salida

In [5]:
# Etiquetar el test
test_sentences = [[(w, t) for (w, t, c) in sentence] for sentence in test_chunks]
tagged_test_sentences = [tagger_hmm.tag(sentence) for sentence in test_sentences]

In [22]:
chunker = nltk.tag.UnigramTagger(train)
chunker.tag(test[1])

[(('NNP', 'B-NP'), None),
 (('VBD', 'B-VP'), None),
 (('DT', 'B-NP'), None),
 (('NN', 'I-NP'), None),
 (('VBZ', 'B-VP'), None),
 (('IN', 'O'), None),
 (('PRP', 'B-NP'), None),
 (('TO', 'B-VP'), None),
 (('VB', 'I-VP'), None),
 (('CD', 'B-NP'), None),
 (('JJ', 'I-NP'), None),
 (('JJ', 'I-NP'), None),
 (('NNS', 'I-NP'), None),
 (('IN', 'B-PP'), None),
 (('DT', 'B-NP'), None),
 (('NNS', 'I-NP'), None),
 (('.', 'O'), None)]

In [6]:
# Formatear la salida del test
output_lines = []
for i, sentence in enumerate(test_chunks): # para cada frase
    for j, (word, pos, gold_chunk) in enumerate(sentence): # para cada palabra
        pred_chunk = tagged_test_sentences[i][j][1]
        output_lines.append(f"{word} {pos} {gold_chunk} {pred_chunk}")
    output_lines.append("")

In [7]:
# Guardar la salida en un archivo
with open("test_output.txt", "w") as f:
    f.write("\n".join(output_lines))

In [8]:
!python conlleval.py < test_output.txt

Traceback (most recent call last):
  File "c:\Users\FX607\Desktop\Master\LC\Pr3\conlleval.py", line 235, in <module>
    evaluate_conll_file(sys.stdin)
  File "c:\Users\FX607\Desktop\Master\LC\Pr3\conlleval.py", line 229, in evaluate_conll_file
    return evaluate(true_seqs, pred_seqs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\FX607\Desktop\Master\LC\Pr3\conlleval.py", line 209, in evaluate
    correct_counts, true_counts, pred_counts) = count_chunks(true_seqs, pred_seqs)
                                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\FX607\Desktop\Master\LC\Pr3\conlleval.py", line 131, in count_chunks
    _, pred_type = split_tag(pred_tag)
    ^^^^^^^^^^^^
ValueError: not enough values to unpack (expected 2, got 1)
