In [1]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import SnowballStemmer, LancasterStemmer,  PorterStemmer,WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('omw-1.4')

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [2]:
# word tokenization
data = "All work and no play makes jack a dull boy, all work and no play"
tokens = word_tokenize(data.lower())
print(tokens)

['all', 'work', 'and', 'no', 'play', 'makes', 'jack', 'a', 'dull', 'boy', ',', 'all', 'work', 'and', 'no', 'play']


In [3]:
# sentence tokenization
print(sent_tokenize("I was going home when she rung. It was a surprise."))

['I was going home when she rung.', 'It was a surprise.']


In [4]:
porter = PorterStemmer()
porter.stem('going')

'go'

In [5]:
plurals = ['universal','universe','university']
singles = [porter.stem(plural) for plural in plurals]
print(' '.join(singles))

univers univers univers


In [6]:
plurals = ['alumnus','alumni']
singles = [porter.stem(plural) for plural in plurals]
print(' '.join(singles))

alumnu alumni


In [7]:
text = "Here you can find activities to practise your reading skills. Reading will help you to improve your understanding of the language and build your vocabulary.The self-study lessons in this section are written and organised according to the levels of the Common European Framework of Reference for languages (CEFR). There are different types of texts and interactive exercises that practise the reading skills you need to do well in your studies, to get ahead at work and to communicate in English in your free time.Take our free online English test to find out which level to choose. Select your level, from beginner (CEFR level A1) to advanced (CEFR level C1), and improve your reading skills at your own speed, whenever it's convenient for you."

In [8]:
tokenized_eu = word_tokenize(text)
porter_eu = [porter.stem(word) for word in tokenized_eu]
print(f" PorterStemmer: {100*round(len(''.join(porter_eu))/len(''.join(word_tokenize(text))),3)}%")

snowball = SnowballStemmer(language='english')
porter_eu = [snowball.stem(word) for word in tokenized_eu]
print(f" SnowballStemmer: {100*round(len(''.join(porter_eu))/len(''.join(word_tokenize(text))),3)}%")

lanc = LancasterStemmer()
porter_eu = [lanc.stem(word) for word in tokenized_eu]
print(f" LancasterStemmerr: {100*round(len(''.join(porter_eu))/len(''.join(word_tokenize(text))),3)}%")

 PorterStemmer: 88.4%
 SnowballStemmer: 88.9%
 LancasterStemmerr: 77.5%


In [12]:
nltk.download('wordnet')
porter = PorterStemmer()
lemmatizer = WordNetLemmatizer()
print(f" better\n Stemming: {porter.stem('better')}\n Lemmatization: { lemmatizer.lemmatize('better', pos ='a')}" )

[nltk_data] Downloading package wordnet to /root/nltk_data...


 better
 Stemming: better
 Lemmatization: good


In [13]:
sentence = "There are mistakes"
print(f'Sentence: {sentence}')

word_list = nltk.word_tokenize(sentence)
print(f'word_list: {word_list}')

lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in word_list])
print(f'Lemmatization: {lemmatized_output}')

Sentence: There are mistakes
word_list: ['There', 'are', 'mistakes']
Lemmatization: There are mistake


In [14]:
print(nltk.pos_tag(nltk.word_tokenize(sentence)))

[('There', 'EX'), ('are', 'VBP'), ('mistakes', 'NNS')]


In [15]:
def get_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

lemmatizer = WordNetLemmatizer()
print( ' '.join([lemmatizer.lemmatize(w, get_pos(w)) for w in nltk.word_tokenize(sentence)]))


There be mistake


In [16]:
# importing NLTK libarary stopwords 
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize

print(stopwords.words('english'))

# random sentecnce with lot of stop words
sample_text = "Oh man, this is pretty cool. We will do more such things."
text_tokens = word_tokenize(sample_text)

tokens_without_sw = [word for word in text_tokens if not word in stopwords.words('english')]

print(text_tokens)
print(tokens_without_sw)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
import spacy
from nltk.tokenize import word_tokenize
# loading english language model of spaCy
en_model = spacy.load('en_core_web_sm')
# gettign the list of default stop words in spaCy english model
stopwords = en_model.Defaults.stop_words

sample_text = "Oh man, this is pretty cool. We will do more such things."
text_tokens = word_tokenize(sample_text)
tokens_without_sw= [word for word in text_tokens if not word in stopwords]

print(text_tokens)
print(tokens_without_sw)


['Oh', 'man', ',', 'this', 'is', 'pretty', 'cool', '.', 'We', 'will', 'do', 'more', 'such', 'things', '.']
['Oh', 'man', ',', 'pretty', 'cool', '.', 'We', 'things', '.']


In [18]:
from gensim.parsing.preprocessing import remove_stopwords

sample_text = "Oh man, this is pretty cool. We will do more such things."
sample_text_NSW = remove_stopwords(text)

print(word_tokenize(sample_text))
print(word_tokenize(sample_text_NSW))

['Oh', 'man', ',', 'this', 'is', 'pretty', 'cool', '.', 'We', 'will', 'do', 'more', 'such', 'things', '.']
['Here', 'activities', 'practise', 'reading', 'skills', '.', 'Reading', 'help', 'improve', 'understanding', 'language', 'build', 'vocabulary.The', 'self-study', 'lessons', 'section', 'written', 'organised', 'according', 'levels', 'Common', 'European', 'Framework', 'Reference', 'languages', '(', 'CEFR', ')', '.', 'There', 'different', 'types', 'texts', 'interactive', 'exercises', 'practise', 'reading', 'skills', 'need', 'studies', ',', 'ahead', 'work', 'communicate', 'English', 'free', 'time.Take', 'free', 'online', 'English', 'test', 'level', 'choose', '.', 'Select', 'level', ',', 'beginner', '(', 'CEFR', 'level', 'A1', ')', 'advanced', '(', 'CEFR', 'level', 'C1', ')', ',', 'improve', 'reading', 'skills', 'speed', ',', 'it', "'s", 'convenient', 'you', '.']


In [22]:

# importing NLTK libarary stopwords 
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stopwords_default = stopwords.words('english')
print(len(stopwords_default))

stopwords_default.append('like')
, 'marvel', 'ghost'])
print(len(stopwords_default))

# for adding multiple words
stopwords_default.extend(['marvel', 'ghost'])
print(len(stopwords_default))



179
180
182


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
