# Twitter Sentiment Analysis using naive bayes classifier 

In [1]:
import nltk
from nltk.corpus import twitter_samples
import numpy as np

In [2]:
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /home/preeti/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [3]:
all_positive_tweets=twitter_samples.strings('positive_tweets.json')
all_negative_tweets=twitter_samples.strings('negative_tweets.json')

In [4]:
print("Number of positive tweets :",len(all_positive_tweets))
print("Number of negative tweets:",len(all_negative_tweets))

print("The type of all positive tweet: ",type(all_positive_tweets))
print("the type of one entry :",type(all_negative_tweets[0]))

Number of positive tweets : 5000
Number of negative tweets: 5000
The type of all positive tweet:  <class 'list'>
the type of one entry : <class 'str'>


In [5]:
print("positive tweet examples:")
print(all_positive_tweets[0])
print("negative tweet examples")
print(all_negative_tweets[0])

positive tweet examples:
#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
negative tweet examples
hopeless for tmr :(


In [6]:
#Preprocessing

import re
import string 

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer


In [7]:
# remover hyperlinks , 
def remove_hyperlinks_marks_styles(tweet):
    #remove old style retweet text"RT"
    new_tweet=re.sub(r'^RT[\s]+','',tweet)
    
    # remove hyperlinks
    new_tweet=re.sub(r'https?:\/\/.*[\r\n]*','',new_tweet)
    
    #remove hashtag
    new_tweet=re.sub(r'#','',new_tweet)
    
    return new_tweet

In [8]:
# tokenize the string 
tokenizer=TweetTokenizer(preserve_case=False,strip_handles=True,reduce_len=True)

def tokenize_tweet(tweet):
    tweet_tokens=tokenizer.tokenize(tweet)
    
    return tweet_tokens

In [9]:
# remove stop words and punctuation 
nltk.download('stopwords')
# import the english stop words from nltk
stopwords_english=stopwords.words('english')

punctuation=string.punctuation

def remove_stopwords_punctuation(tweet_tokens):
    tweet_clean=[]
    
    for word in tweet_tokens:
        if (word not in stopwords_english and word not in punctuation):
            tweet_clean.append(word)
            
    return tweet_clean

[nltk_data] Downloading package stopwords to /home/preeti/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
stemmer =PorterStemmer()
def get_stem(tweet_clean):
    tweet_stem=[]
    for word in tweet_clean:
        stem_word=stemmer.stem(word)
        tweet_stem.append(stem_word)
        
    return tweet_stem

In [11]:
tweet_example = all_positive_tweets[2277]
print(tweet_example)

processed_tweet=remove_hyperlinks_marks_styles(tweet_example)
print(processed_tweet)

tweet_token=tokenize_tweet(processed_tweet)
print(tweet_token)

tweet_clean=remove_stopwords_punctuation(tweet_token)
print(tweet_clean)

tweet_stem=get_stem(tweet_clean)
print(tweet_stem)


My beautiful sunflowers on a sunny Friday morning off :) #sunflowers #favourites #happy #Friday off… https://t.co/3tfYom0N1i
My beautiful sunflowers on a sunny Friday morning off :) sunflowers favourites happy Friday off… 
['my', 'beautiful', 'sunflowers', 'on', 'a', 'sunny', 'friday', 'morning', 'off', ':)', 'sunflowers', 'favourites', 'happy', 'friday', 'off', '…']
['beautiful', 'sunflowers', 'sunny', 'friday', 'morning', ':)', 'sunflowers', 'favourites', 'happy', 'friday', '…']
['beauti', 'sunflow', 'sunni', 'friday', 'morn', ':)', 'sunflow', 'favourit', 'happi', 'friday', '…']


In [12]:
# combine all preprocessing 
def process_tweet(tweet):
    processed_tweet=remove_hyperlinks_marks_styles(tweet)
    tweet_token=tokenize_tweet(processed_tweet)
    tweet_clean=remove_stopwords_punctuation(tweet_token)
    tweet_stem=get_stem(tweet_clean)
    
    return tweet_stem


In [13]:
tweet_example=all_negative_tweets[1000]
print(tweet_example)

final_tweet=process_tweet(tweet_example)
print(final_tweet)


@seanactual You mean you're not offering? :(
['mean', 'offer', ':(']


In [14]:
# split the data into two parts , training and testing 
test_pos=all_positive_tweets[4000:]
train_pos=all_positive_tweets[:4000]
test_neg=all_negative_tweets[4000:]
train_neg=all_negative_tweets[:4000]

train_x=train_pos+train_neg
test_x=test_pos+test_neg

train_y=np.append(np.ones(len(train_pos)),np.zeros(len(train_neg)))
test_y=np.append(np.ones(len(test_pos)),np.zeros(len(test_neg)))

In [15]:
def create_frequency(tweets, ys):
    
    freq_d = {}

    # TODO: Create frequency dictionary
    for tweet,y in zip(tweets,ys):
        for word in process_tweet(tweet):
            pair=(word,y)
            if pair in freq_d:
                freq_d[pair]+=1
            else:
                freq_d[pair]=freq_d.get(pair,1)
    ...
    
    return freq_d

In [16]:
# testing function

tweets = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1, 0, 0, 0, 0]

freq_d = create_frequency(tweets, ys)
print(freq_d)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}


In [17]:
# build the freqs dictionary

freqs = create_frequency(train_x, train_y)

In [18]:
# train 
def train_naive_bayes(freqs, train_x, train_y):
    '''
    Input:
        freqs: dictionary from (word, label) to how often the word appears
        train_x: a list of tweets
        train_y: a list of labels correponding to the tweets (0,1)
    Output:
        logprior: the log prior. (equation 3 above)
        loglikelihood: the log likelihood of you Naive bayes equation. (equation 6 above)
    '''
    
    loglikelihood = {}
    logprior = 0
    
    # calculate the number of unique words in vocab
    unique_words = set([pair[0] for pair in freqs.keys()])
    V = len(unique_words)
    
    # calculate N_pos and N_neg
    N_pos = N_neg = 0
    for pair in freqs.keys():
        
        # TODO: get N_pos and N_get
        if pair[1]>0:
            N_pos+= freqs[(pair)]
        else:
            N_neg+= freqs[(pair)]
        ...
            
    # TODO: calculate the number of documents (tweets)
    D = train_y.shape[0]
    
    # TODO: calculate D_pos, the number of positive documents (tweets)
    D_pos = sum(train_y)
    
    # TODO: calculate D_neg, the number of negative documents (tweets)
    D_neg = D-sum(train_y)
    
    # TODO: calculate logprior
    logprior = np.log(D_pos)-np.log(D_neg)
    
    # for each unqiue word
    for word in unique_words:
        
        # get the positive and negative frequency of the word
        freq_pos = freqs.get((word,1),0)
        freq_neg = freqs.get((word,0),0)
        
        # calculate the probability that word is positive, and negative
        p_w_pos = (freq_pos+1)/(freq_pos+V)
        p_w_neg = (freq_neg+1)/(freq_neg+V)
        
        # calculate the log likelihood of the word
        loglikelihood[word] = np.log(p_w_pos/p_w_neg)
        
    return logprior, loglikelihood

In [19]:
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)
print(logprior)
print(len(loglikelihood))

0.0
9083


In [20]:
# UNQ_C4 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
def naive_bayes_predict(tweet, logprior, loglikelihood):
    '''
    Input:
        tweet: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        p: the sum of all the logliklihoods of each word in the tweet (if found in the dictionary) + logprior (a number)

    '''

    # TODO: process the tweet to get a list of words
    word_l = process_tweet(tweet)

    # TODO: initialize probability to zero
    p = 0

    # TODO: add the logprior
    p += logprior

    for word in word_l:

        # TODO: get log likelihood of each keyword
        if word in loglikelihood:
            p+=loglikelihood[word]

    return p

In [21]:
# Run this cell to test your function
for tweet in ['A dear', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great', 'bad bad bad bad']:
    # print( '%s -> %f' % (tweet, naive_bayes_predict(tweet, logprior, loglikelihood)))
    p = naive_bayes_predict(tweet, logprior, loglikelihood)
#     print(f'{tweet} -> {p:.2f} ({p_category})')
    print(f'{tweet} -> {p:.2f}')

I am happy -> 2.13
I am bad -> -1.29
this movie should have been great. -> 2.12
great -> 2.12
great great -> 4.24
great great great -> 6.36
great great great great -> 8.48
bad bad bad bad -> -5.18
