In [2]:
# Fetch the Brown corpus data that later cells read through nltk.corpus.brown.
import nltk
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [4]:
from nltk.corpus import brown

# A text corpus is a large body of text. Many corpora are designed to contain
# a careful balance of material in different genres.
# List the category names available in the Brown corpus.
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [7]:
# brown.sents() returns the given file(s) as a list of sentences, each sentence
# being a list of word tokens. Keep only the first 100 sentences of the
# 'editorial' category.
data = brown.sents(categories='editorial')[:100]

print(data)

[['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.'], ...]


In [9]:
# Tokenization: splitting a given text into words or sentences.
text = "It was a very pleasant day, the weather was cool and there were light showers. I went to the market to buy some flowers."
# Download the 'punkt' resource files required to use the tokenizers.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead — confirm
# against the installed version.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
from nltk.tokenize import sent_tokenize,word_tokenize
# Split the text into a list of sentence strings.
sents = sent_tokenize(text)
print(sents)

['It was a very pleasant day, the weather was cool and there were light showers.', 'I went to the market to buy some flowers.']


In [13]:
# Lowercase the text, then split it into word tokens; punctuation marks are
# kept as separate tokens. Despite its name, new_sents holds word tokens,
# not sentences.
new_sents = word_tokenize(text.lower())
print(new_sents)

['it', 'was', 'a', 'very', 'pleasant', 'day', ',', 'the', 'weather', 'was', 'cool', 'and', 'there', 'were', 'light', 'showers', '.', 'i', 'went', 'to', 'the', 'market', 'to', 'buy', 'some', 'flowers', '.']


In [20]:
from nltk.tokenize import RegexpTokenizer
# Alternative tokenizer: every run of ASCII letters or '@' becomes a token, so
# punctuation and whitespace are dropped instead of being emitted as tokens.
tokenizer = RegexpTokenizer("[a-zA-Z@]+")
print(tokenizer.tokenize(text))

['It', 'was', 'a', 'very', 'pleasant', 'day', 'the', 'weather', 'was', 'cool', 'and', 'there', 'were', 'light', 'showers', 'I', 'went', 'to', 'the', 'market', 'to', 'buy', 'some', 'flowers']


In [14]:
# Download NLTK's stop-word lists, which are read via nltk.corpus.stopwords.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
from nltk.corpus import stopwords

In [17]:
# English stop words, stored as a set so that membership tests (as done in
# filter_words) are hash lookups rather than list scans.
sw = set(stopwords.words('english'))
print(sw)

{'after', 'nor', 'before', 'an', 'y', 'who', 'through', 'wouldn', 'this', 'was', 'been', 'few', "shouldn't", 'does', 'as', 're', 'a', 'my', 'herself', 'shan', 'do', 'if', 'other', "couldn't", 'his', 'has', 'again', 'm', 'very', "it's", 'didn', 'won', 'all', 'don', 'haven', 'yours', "she's", 'up', 'myself', 'when', 'you', 'for', 'me', 'should', 'but', 'mightn', 'the', 'then', "doesn't", 'itself', 'weren', 'were', 'them', 'your', 'doesn', 'are', 'about', "mightn't", 'yourselves', 'where', 'in', 'ma', "didn't", 'above', 'couldn', 's', 'will', "needn't", 'to', 'off', 'hadn', "don't", "hasn't", 'they', 't', "should've", 'yourself', 'why', 'these', 'doing', 'out', 'same', 'from', 'did', 'during', 'more', 'having', "you're", 'into', 'what', 'each', 'over', 'only', 'than', "won't", 'too', 'how', 'both', 'that', 'hasn', 'll', 'at', 'i', 'aren', 'which', "wouldn't", 'there', 'not', 'be', 'of', 'until', 'while', "aren't", 'shouldn', "you'd", 'it', 'own', 'against', 'so', 'himself', 'needn', 'hers

In [19]:
def filter_words(new_sents, stop_words=None):
    """Return the tokens of ``new_sents`` that are not stop words.

    Args:
        new_sents: Iterable of word tokens (strings). Comparison is exact and
            case-sensitive, so lowercase the tokens first if the stop-word
            collection is lowercase.
        stop_words: Optional collection of tokens to drop. When omitted, the
            notebook-level ``sw`` set is used, which keeps existing
            ``filter_words(tokens)`` calls working unchanged.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    if stop_words is None:
        # Fall back to the notebook-level stop-word set.
        stop_words = sw
    return [w for w in new_sents if w not in stop_words]


# Drop the stop words from the lowercased word tokens.
without_sw = filter_words(new_sents)
print(without_sw)

['pleasant', 'day', ',', 'weather', 'cool', 'light', 'showers', '.', 'went', 'market', 'buy', 'flowers', '.']


In [22]:
import re
# Matches every character to be stripped: hyphen, . ? ! , : ; ( ) | and digits.
punctuation = re.compile(r'[-.?!,:;()|0-9]')

# Remove those characters from each token and keep only the tokens that are
# still non-empty afterwards (tokens made purely of such characters vanish).
allWords = [
    cleaned
    for cleaned in (punctuation.sub("", token) for token in without_sw)
    if cleaned
]

print(allWords)

['pleasant', 'day', 'weather', 'cool', 'light', 'showers', 'went', 'market', 'buy', 'flowers']


In [23]:
from nltk.stem.snowball import PorterStemmer,SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
# Only the Porter stemmer is instantiated; SnowballStemmer and LancasterStemmer
# are imported but not used in the cells shown here.
ps = PorterStemmer()

In [24]:
def stemming(word_list, stemmer=None):
    """Reduce every word in ``word_list`` to its stem.

    Args:
        word_list: Iterable of word strings.
        stemmer: Optional object exposing ``stem(word) -> str``. When omitted,
            the notebook-level ``ps`` stemmer is used, so existing
            ``stemming(words)`` calls behave as before.

    Returns:
        A new list with one stem per input word, in input order.
    """
    if stemmer is None:
        # Default to the stemmer instance created at notebook level.
        stemmer = ps
    return [stemmer.stem(w) for w in word_list]

In [26]:
# Stem the cleaned tokens; in the output below the plural forms lose their "s"
# while "went" is left unchanged.
print(stemming(allWords))

['pleasant', 'day', 'weather', 'cool', 'light', 'shower', 'went', 'market', 'buy', 'flower']


In [29]:
from nltk.corpus import wordnet

In [30]:
# The lookups below read the WordNet corpus, which none of the earlier cells
# shown here downloads; fetch it so the notebook also runs on a fresh
# environment. quiet=True keeps this cell free of download log output.
nltk.download('wordnet', quiet=True)

# NOTE: despite the name, the values stored here are WordNet definitions
# (plain-text glosses), not numeric embedding vectors.
word_embeddings = {}

for token in allWords:
    synsets = wordnet.synsets(token)
    # Tokens with no WordNet entry are skipped. The first synset is taken
    # as-is, without any sense disambiguation.
    if synsets:
        word_embeddings[token] = synsets[0].definition()

In [32]:
# Print one line per token. The label says "Semantic Embedding", but the value
# is the WordNet definition text stored in word_embeddings.
for word, embedding in word_embeddings.items():
    print(f"Word: {word} - Semantic Embedding: {embedding}")

Word: pleasant - Semantic Embedding: affording pleasure; being in harmony with your taste or likings
Word: day - Semantic Embedding: time for Earth to make a complete rotation on its axis
Word: weather - Semantic Embedding: the atmospheric conditions that comprise the state of the atmosphere in terms of temperature and wind and clouds and precipitation
Word: cool - Semantic Embedding: the quality of being at a refreshingly low temperature
Word: light - Semantic Embedding: (physics) electromagnetic radiation that can produce a visual sensation
Word: showers - Semantic Embedding: a plumbing fixture that sprays water over you
Word: went - Semantic Embedding: change location; move, travel, or proceed, also metaphorically
Word: market - Semantic Embedding: the world of commercial activity where goods and services are bought and sold
Word: buy - Semantic Embedding: an advantageous purchase
Word: flowers - Semantic Embedding: a plant cultivated for its blooms or blossoms
