## Download the parsed Treebank sample (only about 10% of the full corpus, which is not free); it contains 100675 tagged words.
## The Brown Corpus, which is annotated with part-of-speech tags, is downloaded as well.

In [29]:
import nltk

# Fetch the Treebank sample and the Brown corpus into the local nltk_data directory.
# In the recorded run both packages were already present and reported as up-to-date.
nltk.download('treebank')
# Last expression of the cell, so its return value (True in the recorded run) is displayed.
nltk.download('brown')


[nltk_data] Downloading package treebank to /home/nlevi/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package brown to /home/nlevi/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [109]:
from nltk.corpus import treebank

# Quick sanity check: show the first 60 tokens of the Treebank sample.
print(treebank.words()[:60])

['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.', 'Mr.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N.V.', ',', 'the', 'Dutch', 'publishing', 'group', '.', 'Rudolph', 'Agnew', ',', '55', 'years', 'old', 'and', 'former', 'chairman', 'of', 'Consolidated', 'Gold', 'Fields', 'PLC', ',', 'was', 'named', '*-1', 'a', 'nonexecutive', 'director', 'of', 'this', 'British', 'industrial', 'conglomerate', '.', 'A', 'form']


In [110]:
from nltk.corpus import treebank

# List the first few file ids, then look at one file three ways:
# plain tokens, (token, tag) pairs, and the parse tree of its first sentence.
sample_fileid = 'wsj_0003.mrg'

print(treebank.fileids()[:10])
print(treebank.words(sample_fileid))
print(treebank.tagged_words(sample_fileid))
print(treebank.parsed_sents(sample_fileid)[0])

['wsj_0001.mrg', 'wsj_0002.mrg', 'wsj_0003.mrg', 'wsj_0004.mrg', 'wsj_0005.mrg', 'wsj_0006.mrg', 'wsj_0007.mrg', 'wsj_0008.mrg', 'wsj_0009.mrg', 'wsj_0010.mrg']
['A', 'form', 'of', 'asbestos', 'once', 'used', '*', ...]
[('A', 'DT'), ('form', 'NN'), ('of', 'IN'), ...]
(S
  (S-TPC-1
    (NP-SBJ
      (NP (NP (DT A) (NN form)) (PP (IN of) (NP (NN asbestos))))
      (RRC
        (ADVP-TMP (RB once))
        (VP
          (VBN used)
          (NP (-NONE- *))
          (S-CLR
            (NP-SBJ (-NONE- *))
            (VP
              (TO to)
              (VP
                (VB make)
                (NP (NNP Kent) (NN cigarette) (NNS filters))))))))
    (VP
      (VBZ has)
      (VP
        (VBN caused)
        (NP
          (NP (DT a) (JJ high) (NN percentage))
          (PP (IN of) (NP (NN cancer) (NNS deaths)))
          (PP-LOC
            (IN among)
            (NP
              (NP (DT a) (NN group))
              (PP
                (IN of)
                (NP
                  (N

### Remove unwanted `*` trace markers (tokens that start or end with an asterisk) together with their tags

In [138]:
# All (word, tag) pairs of the sample except the very last one.
# NOTE(review): the `[:-1]` slice drops the final token; it is kept as in the
# original cell so downstream counts stay unchanged -- confirm it is intended.
words = treebank.tagged_words()[:-1]

# Step 1: Record the positions of trace tokens such as '*' or '*-1', i.e. any
# token that starts or ends with an asterisk.
# NOTE(review): tokens like '0' (visible later as "reported 0 .") are not
# matched by this test -- confirm whether they should be removed as well.
indices_to_remove = [
    i
    for i, (word, _tag) in enumerate(words)
    if word.startswith('*') or word.endswith('*')
]

# Membership tests against a list are linear, which made every filtering pass
# below quadratic in the corpus size; a set makes each lookup constant time.
_removed_positions = set(indices_to_remove)

# Step 2: Keep only the (word, tag) pairs whose position was not flagged.
filtered_tagged_words = [
    pair for i, pair in enumerate(words) if i not in _removed_positions
]

# Step 3: Unfiltered words and tags, kept as separate parallel lists.
single_words = [word for word, _tag in words]
tags = [tag for _word, tag in words]

# Step 4: Derive the filtered words and tags from the already-filtered pairs,
# so both lists are aligned by construction and no further index scan is needed.
filtered_words = [word for word, _tag in filtered_tagged_words]
filtered_tags = [tag for _word, tag in filtered_tagged_words]

# Print the results
print("Filtered words:", filtered_words[:10])
print("Filtered tags:", filtered_tags[:10])

# Both lists must have the same length; a non-zero value signals misalignment.
print('Length check: %f' % (len(filtered_words) - len(filtered_tags)))

Filtered words: ['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the']
Filtered tags: ['NNP', 'NNP', ',', 'CD', 'NNS', 'JJ', ',', 'MD', 'VB', 'DT']
Length check: 0.000000


In [145]:
# Render the first 50 filtered tokens and their tags as two space-separated
# lines so that words and tags can be compared by eye.
sentence, tag_sentence = (
    ' '.join(items[:50]) for items in (filtered_words, filtered_tags)
)
print(sentence)
print(tag_sentence)

Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 . Mr. Vinken is chairman of Elsevier N.V. , the Dutch publishing group . Rudolph Agnew , 55 years old and former chairman of Consolidated Gold Fields PLC , was named a nonexecutive
NNP NNP , CD NNS JJ , MD VB DT NN IN DT JJ NN NNP CD . NNP NNP VBZ NN IN NNP NNP , DT NNP VBG NN . NNP NNP , CD NNS JJ CC JJ NN IN NNP NNP NNP NNP , VBD VBN DT JJ


## The filtered words can now be used as a dataset, split in whatever way we like.

In [146]:
# Number of tokens that remain after filtering (95182 in the recorded run).
len(filtered_words)

95182

In [149]:
def split_into_sentences(words, chunk_size=100):
    """Split ``words`` into consecutive chunks of at most ``chunk_size`` items.

    Despite the name, the chunks are fixed-size windows over the sequence, not
    linguistic sentences; the final chunk is shorter when ``len(words)`` is not
    a multiple of ``chunk_size``.

    Args:
        words: A sliceable sequence (for example a list of tokens).
        chunk_size: Maximum number of items per chunk; must be at least 1.

    Returns:
        A list of slices of ``words`` in their original order. An empty input
        yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1. A negative step would
            otherwise produce an empty result and silently drop all the data.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [
        words[start:start + chunk_size]
        for start in range(0, len(words), chunk_size)
    ]

# Split the filtered words into fixed-size chunks (the function's default is 100 tokens each)
sentences = split_into_sentences(filtered_words)

# Preview the first two chunks. The loop variables are deliberately not named
# `sentence` or `i`: loop variables stay in the notebook namespace, and reusing
# `sentence` would overwrite the preview string built in an earlier cell with a list.
for chunk_number, chunk in enumerate(sentences[:2], start=1):
    print(f"Sentence {chunk_number}: {' '.join(chunk)}")

Sentence 1: Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 . Mr. Vinken is chairman of Elsevier N.V. , the Dutch publishing group . Rudolph Agnew , 55 years old and former chairman of Consolidated Gold Fields PLC , was named a nonexecutive director of this British industrial conglomerate . A form of asbestos once used to make Kent cigarette filters has caused a high percentage of cancer deaths among a group of workers exposed to it more than 30 years ago , researchers reported 0 . The asbestos fiber , crocidolite ,
Sentence 2: is unusually resilient once it enters the lungs , with even brief exposures to it causing symptoms that show up decades later , researchers said 0 . Lorillard Inc. , the unit of New York-based Loews Corp. that makes Kent cigarettes , stopped using crocidolite in its Micronite cigarette filters in 1956 . Although preliminary findings were reported more than a year ago , the latest results appear in today 's New England Journa

In [153]:
# Re-join the first chunk into a single space-separated string for inspection.
' '.join(sentences[0])

'Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 . Mr. Vinken is chairman of Elsevier N.V. , the Dutch publishing group . Rudolph Agnew , 55 years old and former chairman of Consolidated Gold Fields PLC , was named a nonexecutive director of this British industrial conglomerate . A form of asbestos once used to make Kent cigarette filters has caused a high percentage of cancer deaths among a group of workers exposed to it more than 30 years ago , researchers reported 0 . The asbestos fiber , crocidolite ,'