<a href="https://colab.research.google.com/github/Paul-mwaura/Natural-Language-Processing/blob/main/Replace_words_with_synonyms_using_nltk_and_wordnet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Replace words with synonyms using NLTK and WordNet

In [4]:
# Read the whole source document. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open until
# garbage collection.
with open("food.txt") as source_file:
    text = source_file.read()

# Preview the first 100 characters as the cell's displayed output.
text[:100]

'The Global Report on Food Crises (GRFC) 2020 is the result of\na joint, consensus-based assessment of'

In [5]:
import pandas as pd
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from random import randint
import nltk.data

# Fetch the NLTK resources used below, in order: the Punkt tokenizer
# models, the averaged-perceptron POS tagger, and the WordNet corpus.
for resource_name in ('punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(resource_name)

# Penn Treebank tags whose words are never replaced: proper nouns
# (singular and plural) and determiners.
_SKIP_TAGS = frozenset({"NNP", "NNPS", "DT"})

# Penn Treebank tag prefix -> WordNet POS codes that count as the same part
# of speech. WordNet labels adjectives "a" (and satellite adjectives "s"),
# so the first letter of the Penn tag ("j" for JJ) cannot be used directly.
_PENN_PREFIX_TO_WORDNET_POS = {
    "NN": ("n",),
    "VB": ("v",),
    "JJ": ("a", "s"),
    "RB": ("r",),
}


def _wordnet_pos_codes(penn_tag):
    """Return the WordNet POS codes matching `penn_tag`, or () if none apply."""
    for prefix, pos_codes in _PENN_PREFIX_TO_WORDNET_POS.items():
        if penn_tag.startswith(prefix):
            return pos_codes
    return ()


def _synonym_candidates(word, penn_tag):
    """Return the head lemmas of WordNet synsets that share `word`'s part of speech.

    Args:
        word: the token to look up in WordNet.
        penn_tag: the Penn Treebank tag assigned to `word` by the POS tagger.

    Returns:
        A list of candidate replacement strings (possibly empty, possibly
        containing `word` itself or duplicates). Empty when the tag is in
        `_SKIP_TAGS` or has no WordNet counterpart.
    """
    if penn_tag in _SKIP_TAGS:
        return []

    pos_codes = _wordnet_pos_codes(penn_tag)
    if not pos_codes:
        return []

    # Only replace nouns with nouns, verbs with verbs, etc. A synset name
    # looks like "site.n.02"; the text before the first "." is its head lemma.
    return [
        syn.name().split(".")[0]
        for syn in wordnet.synsets(word)
        if syn.pos() in pos_codes
    ]


# Load the pretrained Punkt sentence tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Split the text with the Punkt tokenizer. The result is not used by the
# substitution below; it is kept in case other cells rely on it.
tokenized = tokenizer.tokenize(text)

# Get the list of words from the entire text
words = word_tokenize(text)

# Identify the parts of speech: a list of (word, tag) pairs
tagged = nltk.pos_tag(words)

# Collect the output tokens in a list and join once at the end, rather than
# growing a string inside the loop.
output_tokens = []
for word, tag in tagged:
    replacements = _synonym_candidates(word, tag)

    if replacements:
        # Choose a random replacement. No seed is set in this cell, so the
        # result can differ from run to run.
        output_tokens.append(replacements[randint(0, len(replacements) - 1)])
    else:
        # If no replacement could be found, then just use the original word
        output_tokens.append(word)

# Every token is preceded by a single space, so a non-empty result starts
# with a leading space (same format as before).
output = "".join(" " + token for token in output_tokens)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [8]:
# Preview the first 150 characters of the synonym-substituted text.
output[:150]

' The Global Report on Food Crises ( GRFC ) 2020 be the result of a joint , consensus-based assessment of acute food insecurity site around the univers'

In [7]:
# Show the same 150-character window of the original text for comparison.
text[:150]

'The Global Report on Food Crises (GRFC) 2020 is the result of\na joint, consensus-based assessment of acute food insecurity\nsituations around the world'