In [1]:
#Tokenization

from nltk.tokenize import sent_tokenize,word_tokenize
# Sample paragraph: contains an abbreviation ("Mr."), a question mark and a
# hyphenated word, so both tokenizers have something non-trivial to handle.
example_text = 'Hello Mr. Smith, How are you doing Today? The weather is great and Python is awesome. The sky is pinkish-blue. You should not eat cardboard.'

# Split the paragraph into sentences.
sentence_tokens = sent_tokenize(example_text)
print(sentence_tokens)

# Split the paragraph into word and punctuation tokens.
word_tokens = word_tokenize(example_text)
print(word_tokens)

['Hello Mr. Smith, How are you doing Today?', 'The weather is great and Python is awesome.', 'The sky is pinkish-blue.', 'You should not eat cardboard.']
['Hello', 'Mr.', 'Smith', ',', 'How', 'are', 'you', 'doing', 'Today', '?', 'The', 'weather', 'is', 'great', 'and', 'Python', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', 'not', 'eat', 'cardboard', '.']


In [2]:
#stopwords

from nltk.corpus import stopwords
# Build a set for O(1) membership tests against the English stop-word list.
stop_words = set(stopwords.words('english'))

# NLTK's English stop words are all lowercase, so lowercase each token before
# the lookup; otherwise capitalised stop words such as 'The', 'How' and 'You'
# slip through the filter. The original token casing is kept in the result.
filtered_words = [w for w in word_tokenize(example_text) if w.lower() not in stop_words]
print(filtered_words)

['Hello', 'Mr.', 'Smith', ',', 'Today', '?', 'weather', 'great', 'Python', 'awesome', '.', 'sky', 'pinkish-blue', '.', 'eat', 'cardboard', '.']


In [3]:
#stemming

from nltk.stem import PorterStemmer
from nltk.stem import ISRIStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import SnowballStemmer
# Inputs: a list of word variants and a sentence using similar variants.
example_words = ['Python','Pythoner','Pythoned','Pythoning','Pythonly']
new_text = 'It is very important to be pythonly while you are pythoning in python. All pythoners have pythoned poorly at least once'

# One instance of each stemmer; only the Porter stemmer is applied below.
ps = PorterStemmer()
Is = ISRIStemmer()
ls = LancasterStemmer()

# Tokenize the sentence, then stem every token with the Porter stemmer.
new_word = word_tokenize(new_text)
stemmed = [ps.stem(token) for token in new_word]
print(stemmed)

# To compare the stemmers on example_words, uncomment:
# (note: ISRIStemmer is NLTK's Arabic stemmer)
#for i in example_words:
    #print(ps.stem(i))
    #print(Is.stem(i))
    #print(ls.stem(i))

['It', 'is', 'veri', 'import', 'to', 'be', 'pythonli', 'while', 'you', 'are', 'python', 'in', 'python', '.', 'all', 'python', 'have', 'python', 'poorli', 'at', 'least', 'onc']


In [4]:
#parts of speech tagging

import nltk
from nltk.tag import pos_tag
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
# Raw text of two files from the state_union corpus: one is passed to the
# sentence tokenizer's constructor, the other is the text that gets split.
train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')

# Build a Punkt sentence tokenizer from train_text, then use it to split
# sample_text into a list of sentences (`tokenized`), which process_content()
# reads as a module-level variable.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content(sentences=None):
    """Print the part-of-speech tags of every sentence.

    Args:
        sentences: Iterable of sentence strings to tag. When omitted, the
            module-level ``tokenized`` list is used, so a bare
            ``process_content()`` call behaves as it always did.

    Returns:
        None. One list of ``(token, tag)`` pairs is printed per sentence.

    Any exception raised while tokenizing or tagging is caught and its
    message is printed; processing stops at the first failure.

    Note: later cells in this notebook define a function with the same name,
    so the most recently executed definition is the one that is called.
    """
    try:
        if sentences is None:
            sentences = tokenized
        for sentence in sentences:
            words = word_tokenize(sentence)
            tagged = pos_tag(words)
            print(tagged)
    except Exception as e:
        # Best-effort demo: show the error message rather than a traceback.
        print(str(e))
#process_content()    # remove the leading '#' to run (prints one tagged list per sentence)

In [5]:
#chunking


import nltk
from nltk.tag import pos_tag
from nltk.corpus import state_union
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.tokenize import PunktSentenceTokenizer
# Same setup as the part-of-speech tagging cell: load the two state_union
# texts, build a Punkt sentence tokenizer from train_text and split
# sample_text into sentences. Repeated here so this cell can run on its own.
train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content(sentences=None):
    """Print the chunk tree of every sentence.

    Each sentence is word-tokenized, POS-tagged and parsed with a regular
    expression chunk grammar. The grammar defines a chunk labelled ``C`` as:
    zero or more adverb tags (``RB.?``), zero or more verb tags (``VB.?``),
    one or more ``NNP`` tags, and an optional ``NN`` tag.

    Args:
        sentences: Iterable of sentence strings to chunk. When omitted, the
            module-level ``tokenized`` list is used, so a bare
            ``process_content()`` call behaves as it always did.

    Returns:
        None. One parse tree is printed per sentence.

    Any exception raised along the way is caught and its message printed;
    processing stops at the first failure.
    """
    try:
        if sentences is None:
            sentences = tokenized
        # The grammar and parser are identical for every sentence, so build
        # them once instead of on each loop iteration.
        chunkgram = """C:{<RB.?>*<VB.?>*<NNP>+<NN>?}"""
        chunkparser = nltk.RegexpParser(chunkgram)
        for sentence in sentences:
            words = word_tokenize(sentence)
            tagged = pos_tag(words)
            chunked = chunkparser.parse(tagged)
            print(chunked)                                #use chunked.draw() instead of print() for a graphical tree
    except Exception as e:
        # Best-effort demo: show the error message rather than a traceback.
        print(str(e))
#process_content()

In [6]:
#chinking
#removing unwanted tokens from inside a chunk

# Same setup as the earlier cells: load the two state_union texts, build a
# Punkt sentence tokenizer from train_text and split sample_text into
# sentences. This cell has no imports of its own; it uses the names imported
# by the cells above.
train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content(sentences=None):
    """Print the chinked chunk tree of every sentence.

    Each sentence is word-tokenized, POS-tagged and parsed with a two-rule
    grammar: the first rule puts every token into a chunk labelled ``C``; the
    second rule (written with reversed braces, ``}...{``) chinks out runs of
    verb tags (``VB.?``), ``IN``, ``DT`` and ``TO`` tags.

    Args:
        sentences: Iterable of sentence strings to process. When omitted, the
            module-level ``tokenized`` list is used, so a bare
            ``process_content()`` call behaves as it always did.

    Returns:
        None. One parse tree is printed per sentence.

    Any exception raised along the way is caught and its message printed;
    processing stops at the first failure.
    """
    try:
        if sentences is None:
            sentences = tokenized
        # The grammar and parser are identical for every sentence, so build
        # them once instead of on each loop iteration.
        chunkgram = """C:{<.*>+}
                            }<VB.?|IN|DT|TO>+{"""
        chunkparser = nltk.RegexpParser(chunkgram)
        for sentence in sentences:
            words = word_tokenize(sentence)
            tagged = pos_tag(words)
            chunked = chunkparser.parse(tagged)
            print(chunked)                                     #use chunked.draw() instead of print() for a graphical tree
    except Exception as e:
        # Best-effort demo: show the error message rather than a traceback.
        print(str(e))
#process_content()

In [7]:
#Named Entity Recognition

# Same setup as the earlier cells: load the two state_union texts, build a
# Punkt sentence tokenizer from train_text and split sample_text into
# sentences. This cell has no imports of its own; it uses the names imported
# by the cells above.
train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content(sentences=None):
    """Print the named-entity chunk tree of every sentence.

    Each sentence is word-tokenized, POS-tagged and passed to
    ``nltk.ne_chunk``; the resulting tree is printed.

    Args:
        sentences: Iterable of sentence strings to process. When omitted, the
            module-level ``tokenized`` list is used, so a bare
            ``process_content()`` call behaves as it always did.

    Returns:
        None. One tree is printed per sentence.

    Any exception raised along the way is caught and its message printed;
    processing stops at the first failure.
    """
    try:
        if sentences is None:
            sentences = tokenized
        for sentence in sentences:
            words = word_tokenize(sentence)
            tagged = pos_tag(words)
            namedEnt = nltk.ne_chunk(tagged)
            print(namedEnt)                     #use namedEnt.draw() instead of print() for a graphical tree
    except Exception as e:
        # Best-effort demo: show the error message rather than a traceback.
        print(str(e))
#process_content()

In [8]:
#Lemmatizer

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Words lemmatized as adjectives (pos='a').
for adjective in ('better', 'best'):
    print(lemmatizer.lemmatize(adjective, pos='a'))

# Words lemmatized with the default part of speech.
for word in ('python', 'cats', 'run', 'cacti'):
    print(lemmatizer.lemmatize(word))

good
best
python
cat
run
cactus


In [9]:
#wordnet

from nltk.corpus import wordnet
# Every synset WordNet returns for "program".
syns = wordnet.synsets("program")
print(syns)

# Inspect the first synset: name of its first lemma, its definition and its
# usage examples.
first_synset = syns[0]
print(first_synset.lemmas()[0].name())
print(first_synset.definition())
print(first_synset.examples())

# Gather the lemma names of every synset of "good" as synonyms; for each
# lemma that has antonyms, record the name of its first antonym.
synonyms = []
antonyms = []

for synset in wordnet.synsets("good"):
    for lemma in synset.lemmas():
        synonyms.append(lemma.name())
        opposites = lemma.antonyms()
        if opposites:
            antonyms.append(opposites[0].name())

# Convert to sets so each name is shown only once.
print("Synonyms:",set(synonyms))
print("Antonyms:",set(antonyms))

[Synset('plan.n.01'), Synset('program.n.02'), Synset('broadcast.n.02'), Synset('platform.n.02'), Synset('program.n.05'), Synset('course_of_study.n.01'), Synset('program.n.07'), Synset('program.n.08'), Synset('program.v.01'), Synset('program.v.02')]
plan
a series of steps to be carried out or goals to be accomplished
['they drew up a six-step plan', 'they discussed plans for a new bond issue']
Synonyms: {'secure', 'sound', 'near', 'just', 'proficient', 'honorable', 'serious', 'estimable', 'right', 'goodness', 'well', 'unspoilt', 'safe', 'practiced', 'dear', 'in_force', 'salutary', 'effective', 'upright', 'beneficial', 'trade_good', 'expert', 'adept', 'in_effect', 'thoroughly', 'unspoiled', 'commodity', 'undecomposed', 'respectable', 'skillful', 'good', 'dependable', 'soundly', 'ripe', 'full', 'honest', 'skilful'}
Antonyms: {'badness', 'evilness', 'bad', 'evil', 'ill'}


In [10]:
#similarity check
# Three synset pairs to compare: ship/boat, ship/sheep and cat/boat.
w1, w2 = wordnet.synset('ship.n.01'), wordnet.synset('boat.n.01')
w3, w4 = wordnet.synset('ship.n.01'), wordnet.synset('sheep.n.01')
w5, w6 = wordnet.synset('cat.n.01'), wordnet.synset('boat.n.01')

# Print the Wu-Palmer similarity score of each pair, in order.
for first, second in ((w1, w2), (w3, w4), (w5, w6)):
    print(first.wup_similarity(second))

0.9090909090909091
0.2962962962962963
0.32
