In [1]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
#nltk.download() - to install components of nltk

In [None]:
#Normally used terms

#Corpus - Body of text. Corpora is a collection of texts, journals 

#Lexicon - Words and their meanings. Based on the area the first meaning varies.
#eg : Bull for a financial investor refers to someone who is interested in market whereas in english 
#bull refers to an animal 

#token - refers to an entity which is split based on rules from a particular thing. 
#Like word tokenize from a sentence

In [None]:
#Tokenization

sen = "I am going for a run. I will be running very fast as this run should be made as ran not even ran actually running is the word "

lists = word_tokenize(sen)
print(lists)

In [None]:
#Stop words 
# In NLP useless words are referred to as stopwords. i.e words which doesnt convey any meaning 
#eg: very,having,ourseleves,is,was etc

# These words can be eliminated when we are analyzing the text data as they dont convey any meaning

from nltk.corpus import stopwords

set(stopwords.words('english'))


In [None]:
#stemming the words
#The reason why we stem is to shorten the lookup, and normalize sentences.

ps = PorterStemmer() #Stemming the words - run,running both refer to run except the tense.
#example 
print(ps.stem('Pythoning'))

In [None]:
example_words = ["python","pythoner","pythoning","pythoned","pythonly"]
for w in example_words:
    print(ps.stem(w))

In [None]:
#POS - Parts of Speech Tags  - i.e labelling words in a sentence as nouns,adjs,verbs etc..


# POS tag list:

# CC	coordinating conjunction
# CD	cardinal digit
# DT	determiner
# EX	existential there (like: "there is" ... think of it like "there exists")
# FW	foreign word
# IN	preposition/subordinating conjunction
# JJ	adjective	'big'
# JJR	adjective, comparative	'bigger'
# JJS	adjective, superlative	'biggest'
# LS	list marker	1)
# MD	modal	could, will
# NN	noun, singular 'desk'
# NNS	noun plural	'desks'
# NNP	proper noun, singular	'Harrison'
# NNPS	proper noun, plural	'Americans'
# PDT	predeterminer	'all the kids'
# POS	possessive ending	parent's
# PRP	personal pronoun	I, he, she
# PRP$	possessive pronoun	my, his, hers
# RB	adverb	very, silently,
# RBR	adverb, comparative	better
# RBS	adverb, superlative	best
# RP	particle	give up
# TO	to	go 'to' the store.
# UH	interjection	errrrrrrrm
# VB	verb, base form	take
# VBD	verb, past tense	took
# VBG	verb, gerund/present participle	taking
# VBN	verb, past participle	taken
# VBP	verb, sing. present, non-3d	take
# VBZ	verb, 3rd person sing. present	takes
# WDT	wh-determiner	which
# WP	wh-pronoun	who, what
# WP$	possessive wh-pronoun	whose
# WRB	wh-abverb	where, when

In [None]:
from nltk.tokenize import sent_tokenize,PunktSentenceTokenizer
from nltk.corpus import state_union #It has president speeches

In [None]:
state_union.raw("2005-GWBUSH.txt")

In [None]:
sent_tokenize(state_union.raw("2005-GWBUSH.txt"))[:5] # First five sentences

In [None]:
new = PunktSentenceTokenizer() #Unsupervised Machine learning tokenizer(It can be trained too)

In [None]:
sent = new.tokenize(state_union.raw("2005-GWBUSH.txt"))[:5]

In [None]:
# Chunking is a concept where we group in words into meaningful chunks. mainly to group words which are in relation
# to the noun - Regular expressions are used to achieve this 
for i in sent:
    words = word_tokenize(i);
    tags = nltk.pos_tag(words);
    chunkgram = r"""chunk:{<RB.?>*<NNP>+<NN>?}"""
    parser = nltk.RegexpParser(chunkgram)
    chunked = parser.parse(tags)
    print(chunked)
    #chunked.draw()

In [None]:
for subtree in chunked.subtrees():
    print(subtree)

In [None]:
# to print subtrees with label chunk we provided.
for subtree in chunked.subtrees(filter=lambda t:t.label()=='chunk'):
    print(subtree)

In [None]:
# Chinking is used if we have to remove some chunks from the created chunks
for i in sent:
    words = word_tokenize(i);
    tags = nltk.pos_tag(words);
    chunkgram = r"""chunk:{<RB.?>*<NNP>+<NN>?}
                        }<VB.?|IN|DT>+{"""
    parser = nltk.RegexpParser(chunkgram)
    chunked = parser.parse(tags)
    print(chunked)
    chunked.draw()

In [None]:
#Named entity recognition
#The idea is to have the machine immediately
#be able to pull out "entities" like people, places, things, locations, monetary figures, and more.

# ORGANIZATION - Georgia-Pacific Corp., WHO
# PERSON - Eddy Bonte, President Obama
# LOCATION - Murray River, Mount Everest
# DATE - June, 2008-06-29
# TIME - two fifty a m, 1:30 p.m.
# MONEY - 175 million Canadian Dollars, GBP 10.40
# PERCENT - twenty pct, 18.75 %
# FACILITY - Washington Monument, Stonehenge
# GPE - South East Asia, Midlothian

In [None]:
for i in sent:
    words = word_tokenize(i);
    tags = nltk.pos_tag(words);
    chunked = nltk.ne_chunk(tags)
    print(chunked)
    chunked.draw()

In [None]:
#Lemmatizing in NLTK
#A very similar operation to stemming. The major difference is stemming can often create non-existent
#words, whereas lemmas are actual words.
#your root stem(from stemming), meaning the word you end up with, is not something you can just
#look up in a dictionary, but you can look up a lemma(after lemmetizing)

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("cacti"))
print(lemmatizer.lemmatize("geese"))
print(lemmatizer.lemmatize("rocks"))
print(lemmatizer.lemmatize("python"))
print(lemmatizer.lemmatize("better", pos="a"))
print(lemmatizer.lemmatize("best", pos="a"))
print(lemmatizer.lemmatize("run"))
print(lemmatizer.lemmatize("run",'v'))

In [None]:
#Accessing corpora data folder.. In windows goto Appdata folder and check for it.
#print(nltk.__file__) -- In this location find data file and check the path.
from nltk.corpus import nps_chat
data = nps_chat.xml("10-19-adults_706posts.xml")

In [None]:
## Wordnet - Important corpora - FOr checking synonyms, Antonyms, Similarity of words
from nltk.corpus import wordnet
syn = wordnet.synsets("best")



In [None]:
#synonyms
synonyms = []
for l in syn:
    for f in l.lemmas():
        synonyms.append(f.name())
        
print(synonyms)

In [None]:
#similarity 
a1 = wordnet.synset("well.n.01") # n refers to noun here- use lemmas to find similarity
a2 = wordnet.synset("best.n.02")


In [3]:
#Example to workon -- Movie reviews
from nltk.corpus import movie_reviews

In [4]:
document = []
for category in movie_reviews.categories():
    for fileids in movie_reviews.fileids(category):
        document.append((movie_reviews.words(fileids),category))

In [5]:
words = movie_reviews.words()

In [6]:
#all_words=[w for w in words if w not in stopwords.words('english')]

In [7]:
all_words = [w.lower() for w in words]

In [8]:
import string

In [9]:
all_words = [ w for w in all_words if w not in string.punctuation]

In [10]:
final_words = nltk.FreqDist(all_words)

In [11]:
features = list(final_words.keys())[:4000]

In [12]:
features

['plot',
 'two',
 'teen',
 'couples',
 'go',
 'to',
 'a',
 'church',
 'party',
 'drink',
 'and',
 'then',
 'drive',
 'they',
 'get',
 'into',
 'an',
 'accident',
 'one',
 'of',
 'the',
 'guys',
 'dies',
 'but',
 'his',
 'girlfriend',
 'continues',
 'see',
 'him',
 'in',
 'her',
 'life',
 'has',
 'nightmares',
 'what',
 's',
 'deal',
 'watch',
 'movie',
 'sorta',
 'find',
 'out',
 'critique',
 'mind',
 'fuck',
 'for',
 'generation',
 'that',
 'touches',
 'on',
 'very',
 'cool',
 'idea',
 'presents',
 'it',
 'bad',
 'package',
 'which',
 'is',
 'makes',
 'this',
 'review',
 'even',
 'harder',
 'write',
 'since',
 'i',
 'generally',
 'applaud',
 'films',
 'attempt',
 'break',
 'mold',
 'mess',
 'with',
 'your',
 'head',
 'such',
 'lost',
 'highway',
 'memento',
 'there',
 'are',
 'good',
 'ways',
 'making',
 'all',
 'types',
 'these',
 'folks',
 'just',
 'didn',
 't',
 'snag',
 'correctly',
 'seem',
 'have',
 'taken',
 'pretty',
 'neat',
 'concept',
 'executed',
 'terribly',
 'so',
 'prob

In [13]:
def features_extract(document):
    words = set(document)
    features_final = {}
    for w in features:
        features_final[w]=(w in words)
    return features_final

In [14]:
featuresets = [(features_extract(rev), category) for (rev, category) in document]

In [15]:
import random

In [16]:
random.shuffle(featuresets)

In [17]:
train = featuresets[1:1800]
test = featuresets[1800:]

In [18]:
from nltk.classify import NaiveBayesClassifier

In [19]:
NBmodel = nltk.NaiveBayesClassifier.train(train)

In [20]:
nltk.classify.accuracy(NBmodel, test)

0.8

In [21]:
import pickle 
final_words = open('final_pickle','wb')
pickle.dump(NBmodel,final_words)
final_words.close()

In [22]:
model = open('final_pickle','rb')
new = pickle.load(model)
model.close()
nltk.classify.accuracy(new,test)

0.8

In [23]:
#Combining Algorithms 

In [24]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import logistic, SGDClassifier
#from sklearn.model_selection import train_test_split

In [25]:
MNB = SklearnClassifier(MultinomialNB())
MNB_model = MNB.train(train)
print("MNB_model- Accuracy",nltk.classify.accuracy(MNB_model,test))
logit = SklearnClassifier(logistic.LogisticRegression())
logit_model = logit.train(train)
print("logit_model- Accuracy",nltk.classify.accuracy(logit_model,test))
Ber = SklearnClassifier(BernoulliNB())
BN_model = Ber.train(train)
print("BN_model- Accuracy",nltk.classify.accuracy(BN_model,test))
SGD = SklearnClassifier(SGDClassifier())
SGD_model = SGD.train(train)
print("SGD_model- Accuracy",nltk.classify.accuracy(SGD_model,test))

MNB_model- Accuracy 0.835
logit_model- Accuracy 0.825
BN_model- Accuracy 0.795
SGD_model- Accuracy 0.775


In [26]:
from nltk.classify import ClassifierI
from statistics import mode

In [27]:
def classify(classifier,features):
    voted = []
    for c in classifier:
        vote = c.classify(features)
        voted.append(vote)
    return mode(voted)  

In [28]:
classifier = [MNB_model,logit_model,BN_model,SGD_model]

In [29]:
classify(classifier,test[0][0])

'pos'

In [30]:
class votedclassifier(ClassifierI):
    def __init__(self,*classifier):
        self.classifier = classifier
        
    def classify(self,features):
        voted = []
        for c in self.classifier:
            vote = c.classify(features)
            voted.append(vote)
        return mode(voted)

In [31]:
classifier_vote = votedclassifier(MNB_model,logit_model,BN_model)

In [32]:
classifier_vote

<__main__.votedclassifier at 0x1ef770cddd8>

In [33]:
print("New accuracy",nltk.classify.accuracy(classifier_vote,test))

New accuracy 0.83


In [34]:
def sentiment(tweet):
    features = features_extract(tweet)
    return votedclassifier(MNB_model,logit_model,BN_model).classify(features)

In [51]:
## Sentiment from Twitter

In [54]:
## Twitter Sentiment Analysis

In [37]:
from tweepy import Stream
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler


In [38]:
ckey= '7JzpvzEkgIXygwHZNDlHtbyul'
csecret = 'WIbSoJPACDhDlNn2ekrLDO1ikeWzwAh5OFT8V7BavCLkkMVPJM'
atoken = '915301601987919872-ihW5x7SV8dMW5lAG0nbyi69YfnmqQQl'
asecret = 'xfz573YVAzIdSbMSlF5ffxLhzp6laG11vCfgCUoyWiRLd'

In [39]:
auth = OAuthHandler(ckey,csecret)

In [40]:
auth.set_access_token(atoken,asecret)

In [41]:
Streamtweets = Stream(auth,StreamListener())

In [42]:
Streamtweets.filter(track = ["Deloitte"])

In [58]:
class mylistener(StreamListener):
    def on_data(self, raw_data):
        tweet = raw_data.split(',"text":"')[1].split('","source":"')[0]
        sent_data = open('senti_data.txt','a')
        sent_data.write(sentiment(tweet))
        sent_data.close()
#         save_data = open('deloitte_tweets.csv','a')
#         new_data = open('deloitte_clean.csv','a')
#         new_data.write(tweet)
#         new_data.close()
#         save_data.write(raw_data)
#         save_data.write('\n')
#         save_data.close()
        #print(raw_data)
        return True
    def on_error(self, status):
        print(status)

In [None]:
Streamtweets = Stream(auth,mylistener())
Streamtweets.filter(track = ["Deloitte"])