In [20]:
import nltk
from nltk.corpus import brown
import re

brown_words = brown.words()

prefixes = ['un', 'non', 'dis', 'in']

In [21]:
from nltk.corpus import treebank as tb
tb_tagged_sents = tb.tagged_sents()
tb_tagged_words = tb.tagged_words()

# Step 1: Get all tokens that have prefixes of interest
pattern = r'\bun\w+|\bnon\w+|\bdis\w+|\bin\w+'
pwords = list(set([w for w in brown_words if re.match(pattern, w) and len(w) > 4]))
 
# Tag words that have prefixes of interest
pwords_t = nltk.pos_tag(p_words)
pwords_t

[('inducing', 'VBG'),
 ('dissimilar', 'JJ'),
 ('dishes', 'NNS'),
 ('insinuating', 'VBG'),
 ('incomprehensible', 'JJ'),
 ('distresses', 'NNS'),
 ('unmethodical', 'JJ'),
 ('inclinations', 'NNS'),
 ('investment', 'NN'),
 ('discrepancy', 'NN'),
 ('nonsense', 'JJ'),
 ('indicator', 'NN'),
 ('insuperable', 'JJ'),
 ('innermost', 'NN'),
 ('disreputable', 'JJ'),
 ('inspirational', 'JJ'),
 ('insight', 'NN'),
 ('interviewed', 'VBD'),
 ('inhibitions', 'NNS'),
 ('disappointed', 'JJ'),
 ('distributing', 'VBG'),
 ('inconsistent', 'JJ'),
 ('nondrying', 'JJ'),
 ('underwriter', 'NN'),
 ('disliking', 'VBG'),
 ('involutions', 'NNS'),
 ('unspectacular', 'JJ'),
 ('indenture', 'NN'),
 ('incorporates', 'VBZ'),
 ('inventor', 'NN'),
 ('unconquerable', 'JJ'),
 ('unresolved', 'JJ'),
 ('informs', 'NNS'),
 ('undergraduate', 'VBP'),
 ('unintelligible', 'JJ'),
 ('unself-conscious', 'JJ'),
 ('ungainly', 'RB'),
 ('unworthy', 'JJ'),
 ('disciple', 'NN'),
 ('intelligible', 'JJ'),
 ('inventive', 'JJ'),
 ('disallowed', 'VBN'

In [22]:
# Step 2: Get rid of erroneous words of interest
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# brown_t = nltk.corpus.brown.tagged_sents()

# Function to replace treebank tag with a wordnet tag to use with the Lemmatizer
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        # Unfortunately, wordnet only supports adjective, verb, noun and adverb parts of speech
        # So, any word whose POS is not covered by wordnet will default to NOUN
        return wordnet.NOUN
    
prefixes = "un|non|dis|in"
words_of_interest = []
for w in pwords_t:
    # Separate prefix from stem
    word = w[0]
    tag = w[1]
    prefix = re.match(prefixes, word).group(0)
    stem = re.sub(prefix, '', word, count=1)
    lem_stem = wordnet_lemmatizer.lemmatize(stem, pos=(get_wordnet_pos(tag)))
    t = (word, tag, prefix, stem, lem_stem)
    print(t)
    words_of_interest.append(t)

('inducing', 'VBG', 'in', 'ducing', 'ducing')
('dissimilar', 'JJ', 'dis', 'similar', 'similar')
('dishes', 'NNS', 'dis', 'hes', 'he')
('insinuating', 'VBG', 'in', 'sinuating', 'sinuating')
('incomprehensible', 'JJ', 'in', 'comprehensible', 'comprehensible')
('distresses', 'NNS', 'dis', 'tresses', 'tress')
('unmethodical', 'JJ', 'un', 'methodical', 'methodical')
('inclinations', 'NNS', 'in', 'clinations', 'clinations')
('investment', 'NN', 'in', 'vestment', 'vestment')
('discrepancy', 'NN', 'dis', 'crepancy', 'crepancy')
('nonsense', 'JJ', 'non', 'sense', 'sense')
('indicator', 'NN', 'in', 'dicator', 'dicator')
('insuperable', 'JJ', 'in', 'superable', 'superable')
('innermost', 'NN', 'in', 'nermost', 'nermost')
('disreputable', 'JJ', 'dis', 'reputable', 'reputable')
('inspirational', 'JJ', 'in', 'spirational', 'spirational')
('insight', 'NN', 'in', 'sight', 'sight')
('interviewed', 'VBD', 'in', 'terviewed', 'terviewed')
('inhibitions', 'NNS', 'in', 'hibitions', 'hibitions')
('disappoint

In [23]:
# Import lexicon
from nltk.corpus import words
lex = words.words('en')

# Check if stem is in the lexicon
# If so, the prefix is negating and keep the word
# If not, discard the word
filtered = []
for w in words_of_interest:
    word = w[0]
    tag = w[1]
    prefix = w[2]
    stem = w[3]
    lem_stem = w[4]
    if word in lex:
        print(w)
        filtered.append(w)

('dissimilar', 'JJ', 'dis', 'similar', 'similar')
('insinuating', 'VBG', 'in', 'sinuating', 'sinuating')
('incomprehensible', 'JJ', 'in', 'comprehensible', 'comprehensible')
('unmethodical', 'JJ', 'un', 'methodical', 'methodical')
('investment', 'NN', 'in', 'vestment', 'vestment')
('discrepancy', 'NN', 'dis', 'crepancy', 'crepancy')
('nonsense', 'JJ', 'non', 'sense', 'sense')
('indicator', 'NN', 'in', 'dicator', 'dicator')
('insuperable', 'JJ', 'in', 'superable', 'superable')
('innermost', 'NN', 'in', 'nermost', 'nermost')
('disreputable', 'JJ', 'dis', 'reputable', 'reputable')
('inspirational', 'JJ', 'in', 'spirational', 'spirational')
('insight', 'NN', 'in', 'sight', 'sight')
('disappointed', 'JJ', 'dis', 'appointed', 'appointed')
('inconsistent', 'JJ', 'in', 'consistent', 'consistent')
('nondrying', 'JJ', 'non', 'drying', 'drying')
('underwriter', 'NN', 'un', 'derwriter', 'derwriter')
('disliking', 'VBG', 'dis', 'liking', 'like')
('unspectacular', 'JJ', 'un', 'spectacular', 'spectac