In [16]:
import numpy as np   
import pandas as pd    
import matplotlib.pyplot as plt 
%matplotlib inline 
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [17]:
!pip install textblob

Defaulting to user installation because normal site-packages is not writeable


In [18]:
import textblob
from textblob import TextBlob, Word

In [19]:
# Sample paragraph (three sentences) used as input for the tokenization examples.
text = (
    "I am Sandhya. "
    "I am a 3rd year computer engineering student at Pune Institute of Computer Technology. "
    "I like drawing and painting. "
)

In [20]:
# Tokenization: split natural-language text into its smallest units (tokens).
# TextBlob's `.words` yields the word tokens of the text; the recorded result
# contains no punctuation marks.
blob = TextBlob(text)
blob.words

WordList(['I', 'am', 'Sandhya', 'I', 'am', 'a', '3rd', 'year', 'computer', 'engineering', 'student', 'at', 'Pune', 'Institute', 'of', 'Computer', 'Technology', 'I', 'like', 'drawing', 'and', 'painting'])

In [21]:
#tokenization with NLTK (Natural Language Toolkit)
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize

In [22]:
# Sentence tokenization: break the paragraph into a list of sentences.
tokens_sents = sent_tokenize(text)
print(tokens_sents)

['I am Sandhya.', 'I am a 3rd year computer engineering student at Pune Institute of Computer Technology.', 'I like drawing and painting.']


In [23]:
# Word tokenization: break the paragraph into word and punctuation tokens.
tokens_words = word_tokenize(text)
print(tokens_words)

['I', 'am', 'Sandhya', '.', 'I', 'am', 'a', '3rd', 'year', 'computer', 'engineering', 'student', 'at', 'Pune', 'Institute', 'of', 'Computer', 'Technology', '.', 'I', 'like', 'drawing', 'and', 'painting', '.']


In [24]:
# Stemming
# Stemming reduces a word to its root form.
from nltk.stem import PorterStemmer

ps = PorterStemmer()
word = "caring"
ps.stem(word)

'care'

In [25]:
# Stemming with SnowballStemmer (English rules).
# The stem is not necessarily a dictionary word: the recorded result for "flying" is 'fli'.
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")
word = "flying"
stemmer.stem(word)

'fli'

In [26]:
# Lemmatization
# Lemmatization reduces a word to its dictionary form (lemma).

import nltk
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Lemmatize a couple of single words and print one lemma per line.
print("\n".join(lemmatizer.lemmatize(plural) for plural in ("workers", "trees")))

worker
tree


In [27]:
# Break the sample text into tokens first, so the lemmatizer can then be applied token by token.
# The typographic apostrophe in "Let’s" ends up as a separate token in the printed list.
text = (
    "Let’s lemmatize a simple sentence. "
    "We first tokenize the sentence into words using nltk.word_tokenize "
    "and then we will call lemmatizer.lemmatize() on each word. "
)
word_list = word_tokenize(text)
print(word_list)

['Let', '’', 's', 'lemmatize', 'a', 'simple', 'sentence', '.', 'We', 'first', 'tokenize', 'the', 'sentence', 'into', 'words', 'using', 'nltk.word_tokenize', 'and', 'then', 'we', 'will', 'call', 'lemmatizer.lemmatize', '(', ')', 'on', 'each', 'word', '.']


In [28]:
# Lemmatize each token and join the results back into one space-separated string.
lemmatized_output = " ".join(map(lemmatizer.lemmatize, word_list))
print(lemmatized_output)

Let ’ s lemmatize a simple sentence . We first tokenize the sentence into word using nltk.word_tokenize and then we will call lemmatizer.lemmatize ( ) on each word .


In [29]:
# Lemmatize a single word through TextBlob's Word wrapper.
# NOTE(review): no part of speech is passed here; TextBlob documents a noun default, which
# would explain why the recorded result is still 'flying'. Passing a verb tag, e.g.
# w.lemmatize("v"), should return the verb lemma instead — confirm before relying on it.
word = "flying"
w = Word(word)
w.lemmatize()

'flying'

In [15]:
# Lemmatize every word of a sentence with TextBlob and join the lemmas with single spaces.
text = "The striped bats are hanging on their feet for best"
sent = TextBlob(text)
" ".join(w.lemmatize() for w in sent.words)

'The striped bat are hanging on their foot for best'

In [31]:
# POS tagging with a baseline tagger
from nltk.tag import DefaultTagger

# DefaultTagger is constructed with a single tag ('NN'); in the recorded output
# every token, including '!', receives that tag.
tagging = DefaultTagger("NN")

# Tag a batch of already-tokenized sentences in one call.
tagging.tag_sents(
    [
        ["good", "morning", "!"],
        ["aaa", "bbb", "ccc"],
    ]
)

[[('good', 'NN'), ('morning', 'NN'), ('!', 'NN')],
 [('aaa', 'NN'), ('bbb', 'NN'), ('ccc', 'NN')]]

In [36]:
# Stop words removal: load NLTK's English stop-word list and show it.
import nltk
from nltk.corpus import stopwords

sw_nltk = stopwords.words("english")
print(sw_nltk)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [37]:
# Remove stop words from a sample sentence and compare the text length before and after.
import string

text = "When I first met her she was very quiet. She remained quiet during the entire two hour long journey from Stony Brook to New York."

# A set gives constant-time membership tests instead of scanning the list for every token.
stop_words = set(sw_nltk)

# text.split() leaves punctuation attached to tokens (e.g. "quiet."), so strip leading and
# trailing punctuation for the comparison only; otherwise a stop word directly followed by
# punctuation would never match. The token itself is kept unchanged in the output.
words = [
    word
    for word in text.split()
    if word.lower().strip(string.punctuation) not in stop_words
]
new_text = " ".join(words)
print(new_text)
print("Old length: ", len(text))
print("New length: ", len(new_text))

first met quiet. remained quiet entire two hour long journey Stony Brook New York.
Old length:  129
New length:  82
