## SENTIMENT ANALYSIS ON TWEETS

In [2]:
# Import modules
import re
import string
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import twitter_samples 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

nltk.download('twitter_samples') # https://www.nltk.org/howto/twitter.html
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def process_tweet(tweet):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    """
    stemmer = PorterStemmer() # use for stemming remove -ing and all
    stopwords_english = stopwords.words('english')

    tweet = re.sub(r'\$\w*', '', tweet) # remove stock market tickers like $GE
    tweet = re.sub(r'^RT[\s]+', '', tweet) # remove old style retweet text "RT"
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet) # remove hyperlinks
    tweet = re.sub(r'#', '', tweet) # remove hashtags

    # tokenize tweets
    tokenizer = TweetTokenizer(
        preserve_case=False, 
        strip_handles=True,
        reduce_len=True
        )
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean

In [4]:
def build_freqs(tweets, ys):
    """Build frequencies.
    Input:
        tweets: a list of tweets
        ys: an m x 1 array with the sentiment label of each tweet
            (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    yslist = np.squeeze(ys).tolist() # zip needs an iterable so np_arr -> list, squeeze to convert to 1D

    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1

    return freqs

# tweet = ['hi i am ryan','i am sad', "i am happy"]
# ys = [[1],[0],[1]]
# build_freqs(tweet,ys)

# Output will be like 
# {('hi', 1): 1, ('ryan', 1): 1, ('sad', 0): 1, ('happi', 1): 1}

In [5]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [6]:
# 20% TEST & 80% TRAIN
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

train_x = train_pos + train_neg 
test_x = test_pos + test_neg
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0) # axis 0 is column wise
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0) # row 1 ,column 0 

In [7]:
# print("train_y.shape = " + str(train_y.shape))
# print("test_y.shape = " + str(test_y.shape))

In [8]:
freqs = build_freqs(train_x, train_y)

# print("type(freqs) = " + str(type(freqs)))
# print("len(freqs) = " + str(len(freqs.keys())))

In [9]:
# print('Example of a positive tweet: \n', train_x[0])
# print('\nProcessed version of the tweet: \n', process_tweet(train_x[0]))

In [10]:
def sigmoid(z): 
    '''
    Input:
        z: is the input (can be a scalar or an array)
    Output:
        h: the sigmoid of z
    '''
    h = 1 / (1 + np.exp(-z)) # https://numpy.org/doc/stable/reference/routines.math.html
    return h

In [11]:
def gradientDescent(x, y, theta, alpha, num_iters):
    '''
    Input:
        x: matrix of features which is (m,n+1)
        y: corresponding labels of the input matrix x, dimensions (m,1)
        theta: weight vector of dimension (n+1,1)
        alpha: learning rate
        num_iters: number of iterations you want to train your model for
    Output:
        J: the final cost
        theta: your final weight vector
    '''
    m = len(x) # rows
    
    for i in range(0, num_iters):

        z = np.dot(x,theta)
        h = sigmoid(z)
        J = -1/m * (np.dot(y.T, np.log(h)) + np.dot((1-y).T, np.log(1-h)))
        theta -= (alpha/m) * np.dot(x.T, (h-y))

        if i % 50 == 0:
            print(f'Cost at iteration {i}: {J}')
        
    J = float(J)
    return J, theta

In [12]:
# # Test
# np.random.seed(1)
# tmp_X = np.append(np.ones((10, 1)), np.random.rand(10, 2) * 2000, axis=1)
# tmp_Y = (np.random.rand(10, 1) > 0.35).astype(float)
# tmp_J, tmp_theta = gradientDescent(tmp_X, tmp_Y, np.zeros((3, 1)), 1e-8, 700)
# print(f"The cost after training is {tmp_J:.8f}.")
# print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(tmp_theta)]}")

In [13]:
def extract_features(tweet, freqs, process_tweet=process_tweet):
    '''
    Input: 
        tweet: a string containing one tweet
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output: 
        x: a feature vector of dimension (1,3)
    '''
    word_l = process_tweet(tweet)
    x = np.zeros(3) 
    x[0] = 1 # bias

    for word in word_l:
        if (word, 1) in freqs:
            x[1] += freqs[(word, 1)]

        if (word, 0) in freqs:
            x[2] += freqs[(word, 0)]

    x = x[None, :]
    assert(x.shape == (1, 3))
    
    return x # [ bias, sum pos. freqs, sum neg. freqs]

In [14]:
# Training Model
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    X[i, :]= extract_features(train_x[i], freqs)

Y = train_y
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-9, 1500)
print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

Cost at iteration 0: [[0.69314718]]
Cost at iteration 50: [[0.64050664]]
Cost at iteration 100: [[0.59522326]]
Cost at iteration 150: [[0.55602586]]
Cost at iteration 200: [[0.52186107]]
Cost at iteration 250: [[0.49188044]]
Cost at iteration 300: [[0.4654039]]
Cost at iteration 350: [[0.44188404]]
Cost at iteration 400: [[0.42087709]]
Cost at iteration 450: [[0.40202052]]
Cost at iteration 500: [[0.38501586]]
Cost at iteration 550: [[0.36961555]]
Cost at iteration 600: [[0.35561279]]
Cost at iteration 650: [[0.34283372]]
Cost at iteration 700: [[0.33113124]]
Cost at iteration 750: [[0.3203802]]
Cost at iteration 800: [[0.31047349]]
Cost at iteration 850: [[0.30131903]]
Cost at iteration 900: [[0.29283724]]
Cost at iteration 950: [[0.28495906]]
Cost at iteration 1000: [[0.2776243]]
Cost at iteration 1050: [[0.27078028]]
Cost at iteration 1100: [[0.26438071]]
Cost at iteration 1150: [[0.2583848]]
Cost at iteration 1200: [[0.25275648]]
Cost at iteration 1250: [[0.24746371]]
Cost at itera

  J = float(J)


In [15]:
def predict_tweet(tweet, freqs, theta):
    '''
    Input: 
        tweet: a string
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
        theta: (3,1) vector of weights
    Output: 
        y_pred: the probability of a tweet being positive or negative
    '''
    x = extract_features(tweet, freqs)
    y_pred = sigmoid(np.dot(x,theta))
    
    return y_pred

In [16]:
for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']:
    print( '%s -> %f' % (tweet, predict_tweet(tweet, freqs, theta)))    

I am happy -> 0.519275
I am bad -> 0.494347
this movie should have been great. -> 0.515980
great -> 0.516065
great great -> 0.532097
great great great -> 0.548063
great great great great -> 0.563930


  print( '%s -> %f' % (tweet, predict_tweet(tweet, freqs, theta)))


In [17]:
predict_tweet('I am learning :)', freqs, theta)

array([[0.83110764]])

In [18]:
def test_logistic_regression(test_x, test_y, freqs, theta, predict_tweet=predict_tweet):
    """
    Input: 
        test_x: a list of tweets
        test_y: (m, 1) vector with the corresponding labels for the list of tweets
        freqs: a dictionary with the frequency of each pair (or tuple)
        theta: weight vector of dimension (3, 1)
    Output: 
        accuracy: (# of tweets classified correctly) / (total # of tweets)
    """

    y_hat = []
    
    for tweet in test_x:
        y_pred = predict_tweet(tweet, freqs, theta)
        
        if y_pred > 0.5:
            y_hat.append(1.0)
        else:
            y_hat.append(0.0)

    test_y = test_y.flatten()
    
    accuracy = sum(y_hat == test_y) / len(test_y)
    
    return accuracy

tmp_accuracy = test_logistic_regression(test_x, test_y, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

Logistic regression model's accuracy = 0.9950


In [19]:
# Some error done by model
print('Label Predicted Tweet')
for x,y in zip(test_x,test_y):
    y_hat = predict_tweet(x, freqs, theta)

    if np.abs(y - (y_hat > 0.5)) > 0:
        print('THE TWEET IS:', x)
        print('THE PROCESSED TWEET IS:', process_tweet(x))
        print('%d\t%0.8f\t%s' % (y, y_hat, ' '.join(process_tweet(x)).encode('ascii', 'ignore')))

Label Predicted Tweet
THE TWEET IS: @MarkBreech Not sure it would be good thing 4 my bottom daring 2 say 2 Miss B but Im gonna be so stubborn on mouth soaping ! #NotHavingit :p
THE PROCESSED TWEET IS: ['sure', 'would', 'good', 'thing', '4', 'bottom', 'dare', '2', 'say', '2', 'miss', 'b', 'im', 'gonna', 'stubborn', 'mouth', 'soap', 'nothavingit', ':p']
1	0.48942981	b'sure would good thing 4 bottom dare 2 say 2 miss b im gonna stubborn mouth soap nothavingit :p'
THE TWEET IS: I'm playing Brain Dots : ) #BrainDots
http://t.co/UGQzOx0huu
THE PROCESSED TWEET IS: ["i'm", 'play', 'brain', 'dot', 'braindot']
1	0.48418981	b"i'm play brain dot braindot"
THE TWEET IS: I'm playing Brain Dots : ) #BrainDots http://t.co/aOKldo3GMj http://t.co/xWCM9qyRG5
THE PROCESSED TWEET IS: ["i'm", 'play', 'brain', 'dot', 'braindot']
1	0.48418981	b"i'm play brain dot braindot"
THE TWEET IS: I'm playing Brain Dots : ) #BrainDots http://t.co/R2JBO8iNww http://t.co/ow5BBwdEMY
THE PROCESSED TWEET IS: ["i'm", 'play', 

  print('%d\t%0.8f\t%s' % (y, y_hat, ' '.join(process_tweet(x)).encode('ascii', 'ignore')))


THE TWEET IS: off to the park to get some sunlight : )
THE PROCESSED TWEET IS: ['park', 'get', 'sunlight']
1	0.49636406	b'park get sunlight'
THE TWEET IS: @msarosh Uff Itna Miss karhy thy ap :p
THE PROCESSED TWEET IS: ['uff', 'itna', 'miss', 'karhi', 'thi', 'ap', ':p']
1	0.48250522	b'uff itna miss karhi thi ap :p'
THE TWEET IS: @phenomyoutube u probs had more fun with david than me : (
THE PROCESSED TWEET IS: ['u', 'prob', 'fun', 'david']
0	0.50988296	b'u prob fun david'
THE TWEET IS: pats jay : (
THE PROCESSED TWEET IS: ['pat', 'jay']
0	0.50040366	b'pat jay'
THE TWEET IS: my beloved grandmother : ( https://t.co/wt4oXq5xCf
THE PROCESSED TWEET IS: ['belov', 'grandmoth']
0	0.50000002	b'belov grandmoth'
THE TWEET IS: Sr. Financial Analyst - Expedia, Inc.: (#Bellevue, WA) http://t.co/ktknMhvwCI #Finance #ExpediaJobs #Job #Jobs #Hiring
THE PROCESSED TWEET IS: ['sr', 'financi', 'analyst', 'expedia', 'inc', 'bellevu', 'wa', 'financ', 'expediajob', 'job', 'job', 'hire']
0	0.50648699	b'sr finan

In [20]:
my_tweet = 'This is a ridiculously bright movie. The plot was terrible and I was sad until the ending!'
print(process_tweet(my_tweet))
y_hat = predict_tweet(my_tweet, freqs, theta)
print(y_hat)
if y_hat > 0.5:
    print('Positive sentiment')
else: 
    print('Negative sentiment')

['ridicul', 'bright', 'movi', 'plot', 'terribl', 'sad', 'end']
[[0.48125421]]
Negative sentiment
