<a href="https://colab.research.google.com/github/Satish055/DeepLearning_project/blob/main/WordCloud_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
##  Working with Natural languages: NLP
# 1. Install nltk library for NLP processing
# pip install nltk
# 2. Import nltk
# 3. convert everything to same case: I prefer lower case
# 4. word tokenize or sentence tokenize: breaking entire content into different sentences and/or words
# 5. Removal of stop words: stop words are words that dont give much of a meaning
# 6. Stemming: convert words into their root form
# 7. POS tagging (Part of Speech)

In [3]:
!pip install nltk



In [4]:
# Importing NLTK
import nltk
text = '''Product is gREAT. thought coloring products but I amn't liking the colors of it. as I am
thinking they are worst'''
text = text.lower()
print("After lower case: \n",text)

After lower case: 
 product is great. thought coloring products but i amn't liking the colors of it. as i am
thinking they are worst


In [5]:
text_str = text.split()
print("After string split: \n",text_str)

After string split: 
 ['product', 'is', 'great.', 'thought', 'coloring', 'products', 'but', 'i', "amn't", 'liking', 'the', 'colors', 'of', 'it.', 'as', 'i', 'am', 'thinking', 'they', 'are', 'worst']


In [8]:
# word tokenize using nltk
from nltk.tokenize import sent_tokenize, word_tokenize
print("Sentence tokenize: \n",sent_tokenize(text))
text_nltk = word_tokenize(text)
print("Words after nltk:\n",text_nltk)

Sentence tokenize: 
 ['product is great.', "thought coloring products but i amn't liking the colors of it.", 'as i am\nthinking they are worst']
Words after nltk:
 ['product', 'is', 'great', '.', 'thought', 'coloring', 'products', 'but', 'i', 'am', "n't", 'liking', 'the', 'colors', 'of', 'it', '.', 'as', 'i', 'am', 'thinking', 'they', 'are', 'worst']


In [7]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [9]:
# MANUAL STOP WORDS
our_stopwords = {'is','i','the','.','am','of','are','it','they','as'}
text2 = text_nltk
for w in list(text2):
    if w in our_stopwords:
        count_w = text2.count(w)
        for j in range(count_w):
            text2.remove(w)
print("Text after removal of our stop words:\n",text2)

Text after removal of our stop words:
 ['product', 'great', 'thought', 'coloring', 'products', 'but', "n't", 'liking', 'colors', 'thinking', 'worst']


In [10]:
# Download
from nltk.corpus import stopwords
'''
Download the stopword-
nltk.download('stopwords')
'''

"\nDownload the stopword-\nnltk.download('stopwords')\n"

In [11]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
nltk_stopwords = set(stopwords.words("english"))
nltk_stopwords.remove("but")
text2 = text_nltk
for w in list(text2):
    if w in nltk_stopwords:
        count_w = text2.count(w)
        for j in range(count_w):
            text2.remove(w)
print("Text after removal of nltk stop words:\n",text2)

Text after removal of nltk stop words:
 ['product', 'great', 'thought', 'coloring', 'products', 'but', "n't", 'liking', 'colors', 'thinking', 'worst']


In [15]:
# find the tag of each of the words
text4 = nltk.pos_tag(text2)
print("POS tags of each words before stemming:\n",text4)

POS tags of each words before stemming:
 [('product', 'NN'), ('great', 'JJ'), ('thought', 'JJ'), ('coloring', 'NN'), ('products', 'NNS'), ('but', 'CC'), ("n't", 'RB'), ('liking', 'JJ'), ('colors', 'NNS'), ('thinking', 'VBG'), ('worst', 'JJS')]


In [14]:
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [16]:
# stemming
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
text3 = [stemmer.stem(word) for word in text2]
print("Text after stemming:\n",text3)

Text after stemming:
 ['product', 'great', 'thought', 'color', 'product', 'but', "n't", 'like', 'color', 'think', 'worst']


In [17]:
# find the tag of each of the words
text4 = nltk.pos_tag(text3)
print("POS tags of each words after stemming:\n",text4)

POS tags of each words after stemming:
 [('product', 'NN'), ('great', 'JJ'), ('thought', 'JJ'), ('color', 'NN'), ('product', 'NN'), ('but', 'CC'), ("n't", 'RB'), ('like', 'IN'), ('color', 'NN'), ('think', 'VBP'), ('worst', 'JJS')]


In [18]:
# POS tag
from nltk.tag import DefaultTagger
py_tag = DefaultTagger('NN')
tag_txt = py_tag.tag(text3)
print("After Default Tag: \n",tag_txt)

After Default Tag: 
 [('product', 'NN'), ('great', 'NN'), ('thought', 'NN'), ('color', 'NN'), ('product', 'NN'), ('but', 'NN'), ("n't", 'NN'), ('like', 'NN'), ('color', 'NN'), ('think', 'NN'), ('worst', 'NN')]
