In [196]:
import nltk
import pickle

In [197]:
from nltk.corpus import twitter_samples
from nltk.tokenize import TweetTokenizer

from nltk.corpus import stopwords
stopwords_english = stopwords.words('english')

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

import string
import re

final_words = []

pos_tweets = twitter_samples.strings('positive_tweets.json')
neg_tweets = twitter_samples.strings('negative_tweets.json')
all_tweets = twitter_samples.strings('tweets.20150430-223406.json')


# HAPPY EMOTICONS
emoticons_happy = set([
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3'
    ])

# SAD EMOTICONS
emoticons_sad = set([
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';('
    ])

# ALL EMOTICONS
emoticons = emoticons_happy.union(emoticons_sad)


In [198]:
class sentiment:
    
    def __init__(self,tweet):
        self.tweet = tweet
        self.final_words =[]
    
    def stock(self):
        self.tweet = re.sub(r'\$\w*', '', self.tweet)

         
    def retweet(self):
        self.tweet = re.sub(r'^RT[\s]+', '', self.tweet)
        
    def hyperlinks(self):
        self.tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', self.tweet)

        
    def hashtag(self):
        self.tweet = re.sub(r'#', '', self.tweet)
        
    def tokenize(self):
        tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
        tweet_tokens = tokenizer.tokenize(self.tweet)
        self.tweet = tweet_tokens
        
    def algorithm(self):
        for word in self.tweet:
            if( word not in stopwords_english and
                 word not in emoticons and       
                   word not in string.punctuation):
            
                stem_word = stemmer.stem(word)
                self.final_words.append(stem_word)
                
    def all_functions(self):
        self.stock()
        self.retweet()
        self.hyperlinks()
        self.hashtag()
        self.tokenize()
        self.algorithm()
        
                        
                
    def getResult(self):
        return self.final_words
        
       
                


In [199]:
def bag_of_words(tweet):
    s=sentiment(tweet)
    s.all_functions()

    final_words = s.getResult()
    words = final_words
    words_dictionary = dict([word, True] for word in words)
    return words_dictionary


In [200]:
pos_tweets_set = []
for tweet in pos_tweets:
    pos_tweets_set.append((bag_of_words(tweet), 'pos'))   
    

In [201]:
neg_tweets_set=[]
for tweet in neg_tweets:
    neg_tweets_set.append((bag_of_words(tweet), 'neg'))

In [202]:
from random import shuffle
shuffle(pos_tweets_set)
shuffle(neg_tweets_set)

test_set = pos_tweets_set[:1000] + neg_tweets_set[:1000]
train_set = pos_tweets_set[1000:] + neg_tweets_set[1000:]

In [203]:
from nltk import classify
from nltk import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(train_set)
accuracy = classify.accuracy(classifier, test_set)
model = classifier
print('Accuracy is :',accuracy*100,'%') 

print (classifier.show_most_informative_features(10))  


Accuracy is : 74.4 %
Most Informative Features
                     via = True              pos : neg    =     37.0 : 1.0
                     sad = True              neg : pos    =     28.1 : 1.0
                     bam = True              pos : neg    =     25.0 : 1.0
                     x15 = True              neg : pos    =     21.0 : 1.0
                  welcom = True              pos : neg    =     15.9 : 1.0
                   arriv = True              pos : neg    =     15.0 : 1.0
                    sick = True              neg : pos    =     14.2 : 1.0
                    glad = True              pos : neg    =     13.8 : 1.0
                     ugh = True              neg : pos    =     13.0 : 1.0
               goodnight = True              pos : neg    =     12.3 : 1.0
None


In [204]:
custom_tweet = "I hated the film. It was a disaster. Poor direction, bad acting."
custom_tweet_set = bag_of_words(custom_tweet)
print(classifier.classify(custom_tweet_set)) 

neg


In [205]:
prob_result = classifier.prob_classify(custom_tweet_set)
print(prob_result)
print(prob_result.max())
print (prob_result.prob("neg"))
print (prob_result.prob("pos"))

<ProbDist with 2 samples>
neg
0.73800282286521
0.2619971771347897


In [206]:
custom_tweet = "This laptop is brilliant" 
custom_tweet_set = bag_of_words(custom_tweet)
print (classifier.classify(custom_tweet_set))

pos


In [207]:
prob_result = classifier.prob_classify(custom_tweet_set)
print (prob_result)
print (prob_result.max()) 
print (prob_result.prob("neg")) 
print (prob_result.prob("pos"))


<ProbDist with 2 samples>
pos
0.18749999999999975
0.8124999999999992


In [208]:
from collections import defaultdict

actual_set = defaultdict(set)
predicted_set = defaultdict(set)

actual_set_cm = []
predicted_set_cm = []

for index, (feature, actual_label) in enumerate(test_set):
    actual_set[actual_label].add(index)
    actual_set_cm.append(actual_label)
    
    predicted_label = classifier.classify(feature)
    
    predicted_set[predicted_label].add(index)
    predicted_set_cm.append(predicted_label)
    

In [209]:
from nltk.metrics import precision, recall, f_measure, ConfusionMatrix

print('Pos Precision :',precision(actual_set['pos'], predicted_set['pos']))
print('Pos Recall :',recall(actual_set['pos'], predicted_set['pos']))
print('Pos F-measure:', f_measure(actual_set['pos'], predicted_set['pos']))
print('Neg Precision:', precision(actual_set['neg'], predicted_set['neg']))
print('Neg Recall:', recall(actual_set['neg'], predicted_set['neg']))
print('Neg F-measure:', f_measure(actual_set['neg'], predicted_set['neg']))

Pos Precision : 0.7337164750957854
Pos Recall : 0.766
Pos F-measure: 0.7495107632093932
Neg Precision: 0.7552301255230126
Neg Recall: 0.722
Neg F-measure: 0.7382413087934561


In [210]:
# Confusion Matrix
cm = ConfusionMatrix(actual_set_cm, predicted_set_cm)
print (cm)

    |   n   p |
    |   e   o |
    |   g   s |
----+---------+
neg |<722>278 |
pos | 234<766>|
----+---------+
(row = reference; col = test)



In [211]:
print (cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9))

    |      n      p |
    |      e      o |
    |      g      s |
----+---------------+
neg | <36.1%> 13.9% |
pos |  11.7% <38.3%>|
----+---------------+
(row = reference; col = test)



In [212]:
filename = 'finalized_model.pkl'
pickle.dump(model, open(filename, 'wb'))

In [213]:
from nltk import classify
with open('finalized_model.pkl','rb') as f:
    finalized_model = pickle.load(f)
    
custom_tweet = "I loved the movie"
custom_tweet_set = bag_of_words(custom_tweet)
print(finalized_model.classify(custom_tweet_set)) 
accuracy = classify.accuracy(finalized_model, test_set)
print('Accuracy is :',accuracy*100 , '%') 



pos
Accuracy is : 74.4 %
