# ***Sentiment Analysis of tweet samples using NAIVE BAYES Algorithm***




In [14]:
import nltk
import os as getcwd
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 

from nltk.corpus import twitter_samples 
nltk.download('twitter_samples')



[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

In [15]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

train_pos= all_positive_tweets[:4000]
test_pos = all_positive_tweets[4000:]
test_neg= all_negative_tweets[4000:]
train_neg= all_negative_tweets[:4000]

train_x = train_pos + train_neg 
test_x = test_pos + test_neg 


In [57]:
print( len(all_positive_tweets))

5000


In [95]:
train_y = np.append(np.ones((len(train_pos))), np.zeros((len(train_neg))), axis=0)
test_y = np.append(np.ones((len(test_pos))), np.zeros((len(test_neg))), axis=0)

In [96]:
print("train_y.shape = " + str(train_y.shape))
print("test_y.shape = " + str(test_y.shape))

train_y.shape = (8000,)
test_y.shape = (2000,)


In [18]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [97]:
import re                                  
import string
from nltk.corpus import stopwords          
from nltk.stem import PorterStemmer        
from nltk.tokenize import TweetTokenizer
def process_tweet(tweet):
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    tweet = re.sub(r'#', '', tweet)
    tokenizer = TweetTokenizer()
    tweet_tokenized = tokenizer.tokenize(tweet)
    stopwords_english = stopwords.words('english') 
    tweet_processsed=[word for word in tweet_tokenized 
    if word not  in stopwords_english and word not in       
    string.punctuation and word[0]!='@' ]
    stemmer = PorterStemmer() 
    tweet_after_stem=[]
    for word in tweet_processsed:
        word=stemmer.stem(word)
        tweet_after_stem.append(word)
    return tweet_after_stem

custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :)#good#good #morning http://chapagain.com.np"


# print cleaned tweet
print(process_tweet(custom_tweet))


['hello', 'there', 'have', 'great', 'day', ':)', 'good', 'morn']


In [98]:
pos_words=[]
for tweet in all_positive_tweets:
    tweet=process_tweet(tweet)
    
    for word in tweet:
        
        pos_words.append(word)
freq_pos={}
for word in pos_words:
    if (word,1) not in freq_pos:
        freq_pos[(word,1)]=1
    else:
        freq_pos[(word,1)]=freq_pos[(word,1)]+1
neg_words=[]
for tweet in all_negative_tweets:
    tweet=process_tweet(tweet)
    
    for word in tweet:
        
        neg_words.append(word)
freq_neg={}
for word in neg_words:
    if (word,0) not in freq_neg:
        freq_neg[(word,0)]=1
    else:
        freq_neg[(word,0)]=freq_neg[(word,0)]+1
freqs_dict = dict(freq_pos)
freqs_dict.update(freq_neg)


In [99]:
print("type(freqs) = " + str(type(freqs_dict)))
print("len(freqs) = " + str(len(freqs_dict.keys())))

type(freqs) = <class 'dict'>
len(freqs) = 13549


In [100]:
import numpy as np 
def features_extraction( tweet, freqs_dict):
  word_l= process_tweet(tweet)
  x=np.zeros((1,3))
  for i in word_l:
    try:
      x[0,1]+= freqs_dict[(i,1)]
    except:
      x[0,1]+=0
    try:
      x[0,2]+= freqs_dict[(i,0.0)]
    except:
      x[0,2]+= 0
  assert(x.shape==(1,3))
  return x




In [116]:
def lookup(freqs,word , label):
  if (word,label) in freqs:
    return freqs[(word ,label)]
  else:
    return 0
def train_naive_bayes(freqs, train_x, train_y):
    
    loglikelihood = {}
    logprior = 0

    

    # calculate V, the number of unique words in the vocabulary
    vocab = set([pair[0] for pair in freqs.keys()])
    V = len(vocab)

    # calculate N_pos and N_neg
    N_pos = N_neg = 0
    for pair in freqs.keys():
        # if the label is positive (greater than zero)
        if pair[1] > 0:

            # Increment the number of positive words by the count for this 
            #(word, label) pair
            N_pos += freqs[pair]

        # else, the label is negative
        else:

            # increment the number of negative words by the count for this 
            #(word,label) pair
            N_neg += freqs[pair]

    # Calculate D, the number of documents
    D = len(train_y)

    # Calculate D_pos, the number of positive documents 
    D_pos = (len(list(filter(lambda x: x > 0, train_y))))

    # Calculate D_neg, the number of negative documents 
    D_neg = (len(list(filter(lambda x: x <= 0, train_y))))

    # Calculate logprior
    logprior = np.log(D_pos) - np.log(D_neg)

    # For each word in the vocabulary...
    for word in vocab:
        # get the positive and negative frequency of the word
        freq_pos = lookup(freqs,word ,1)
        freq_neg = lookup(freqs,word ,0)

        # calculate the probability that each word is positive, and negative
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        # calculate the log likelihood of the word
        loglikelihood[word] = np.log(p_w_pos/p_w_neg)

 

    return logprior, loglikelihood

In [117]:
logprior, loglikelihood = train_naive_bayes(freqs_dict, train_x, train_y)
print(logprior)
print(len(loglikelihood))

0.0
10763


In [118]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    
    
    # process the tweet to get a list of words
    word_l = process_tweet(tweet)

    # initialize probability to zero
    p = 0

    # add the logprior
    p += logprior

    for word in word_l:

        # check if the word exists in the loglikelihood dictionary
        if word in loglikelihood:
            # add the log likelihood of that word to the probability
            p += loglikelihood[word]

    

    return p

In [119]:
my_tweet = 'She smiled.'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print('The expected output is', p)

The expected output is 3.160988686572


In [93]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
   
    accuracy = 0  

    
    y_hats = []
    for tweet in test_x:
        # if the prediction is > 0
        if naive_bayes_predict(tweet, logprior, loglikelihood) > 0:
            # the predicted class is 1
            y_hat_i = 1
        else:
            # otherwise the predicted class is 0
            y_hat_i = 0

        # append the predicted class to the list y_hats
        y_hats.append(y_hat_i)

    
    error = np.mean(np.absolute(y_hats-test_y))

   
    accuracy = 1-error

   

    return accuracy

In [120]:
print("Naive Bayes accuracy = %0.4f" %
      (test_naive_bayes(test_x, test_y, logprior, loglikelihood)))

Naive Bayes accuracy = 0.9980
