# ***Sentiment Analysis of tweet samples using Logistic regression***




In [14]:
import nltk
import os as getcwd
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 

from nltk.corpus import twitter_samples 
nltk.download('twitter_samples')



[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

In [15]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

train_pos= all_positive_tweets[:4000]
test_pos = all_positive_tweets[4000:]
test_neg= all_negative_tweets[4000:]
train_neg= all_negative_tweets[:4000]

train_x = train_pos + train_neg 
test_x = test_pos + test_neg 


In [57]:
print( len(all_positive_tweets))

5000


In [16]:
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [17]:
print("train_y.shape = " + str(train_y.shape))
print("test_y.shape = " + str(test_y.shape))

train_y.shape = (8000, 1)
test_y.shape = (2000, 1)


In [18]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [78]:
import re                                  
import string
from nltk.corpus import stopwords          
from nltk.stem import PorterStemmer        
from nltk.tokenize import TweetTokenizer
def process_tweet(tweet):
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    tweet = re.sub(r'#', '', tweet)
    tokenizer = TweetTokenizer()
    tweet_tokenized = tokenizer.tokenize(tweet)
    stopwords_english = stopwords.words('english') 
    tweet_processsed=[word for word in tweet_tokenized 
    if word not  in stopwords_english and word not in       
    string.punctuation and word[0]!='@' ]
    stemmer = PorterStemmer() 
    tweet_after_stem=[]
    for word in tweet_processsed:
        word=stemmer.stem(word)
        tweet_after_stem.append(word)
    return tweet_after_stem

print('This is an example of a positive tweet: \n', train_x[0])
print('This is an example of a processed positive tweet: \n', process_tweet(train_x[0]))


This is an example of a positive tweet: 
 #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
This is an example of a processed positive tweet: 
 ['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']


In [79]:
pos_words=[]
for tweet in all_positive_tweets:
    tweet=process_tweet(tweet)
    
    for word in tweet:
        
        pos_words.append(word)
freq_pos={}
for word in pos_words:
    if (word,1) not in freq_pos:
        freq_pos[(word,1)]=1
    else:
        freq_pos[(word,1)]=freq_pos[(word,1)]+1
neg_words=[]
for tweet in all_negative_tweets:
    tweet=text_process(tweet)
    
    for word in tweet:
        
        neg_words.append(word)
freq_neg={}
for word in neg_words:
    if (word,0) not in freq_neg:
        freq_neg[(word,0)]=1
    else:
        freq_neg[(word,0)]=freq_neg[(word,0)]+1
freqs_dict = dict(freq_pos)
freqs_dict.update(freq_neg)


In [60]:
print("type(freqs) = " + str(type(freqs_dict)))
print("len(freqs) = " + str(len(freqs_dict.keys())))

type(freqs) = <class 'dict'>
len(freqs) = 16255


In [69]:
import numpy as np 
def features_extraction( tweet, freqs_dict):
  word_l= process_tweet(tweet)
  x=np.zeros((1,3))
  for i in word_l:
    try:
      x[0,1]+= freqs_dict[(i,1)]
    except:
      x[0,1]+=0
    try:
      x[0,2]+= freqs_dict[(i,0.0)]
    except:
      x[0,2]+= 0
  assert(x.shape==(1,3))
  return x




In [70]:
X = np.zeros((len(train_x), 3))
    
for i in range(len(train_x)):
    
    X[i, :]= features_extraction(train_x[i], freqs_dict)

In [71]:
def sigmoid(z):                                                                                   
    h = 1 / (1 + np.exp(-z))
    return h

def gradientDescent(x, y, theta, alpha, num_iters):
    
    m = x.shape[0]
    
    for i in range(0, num_iters):
        
        # get z, the dot product of x and theta
        z = np.dot(x,theta)
        
        # get the sigmoid of z
        h = sigmoid(z)
        
        # calculate the cost function
        J = -1./m * (np.dot(y.transpose(), np.log(h)) + np.dot((1-y).transpose(),np.log(1-h)))    

        # update the weights theta
        theta = theta = theta - (alpha/m) * np.dot(x.transpose(),(h-y))
        
    ### END CODE HERE ###
    J = float(J)
    return J, theta

In [72]:
Y = train_y

# Apply gradient descent
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-9, 5000)
print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

The cost after training is 0.12414201.
The resulting vector of weights is [0.0, 0.00097518, -0.00076929]


In [73]:
def predict_tweet(tweet, freqs, theta):
    
    
    # extract the features of the tweet and store it into x
    x = features_extraction(tweet,freqs)
    
    # make the prediction using x and theta
    y_pred = sigmoid(np.dot(x,theta))
    
    ### END CODE HERE ###
    
    return y_pred

In [74]:
for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']:
    print( '%s -> %f' % (tweet, predict_tweet(tweet, freqs_dict, theta)))

I am happy -> 0.458439
I am bad -> 0.403293
this movie should have been great. -> 0.538616
great -> 0.537388
great great -> 0.574360
great great great -> 0.610519
great great great great -> 0.645502


In [75]:
def test_logistic_regression(test_x, test_y, freqs_dict, theta):
    # the list for storing predictions
    y_hat = []
    
    for tweet in test_x:
        # get the label prediction for the tweet
        y_pred = predict_tweet(tweet, freqs_dict, theta)
        
        if y_pred > 0.5:
            # append 1.0 to the list
            y_hat.append(1)
        else:
            # append 0 to the list
            y_hat.append(0)

    m=len(y_hat)
    y_hat=np.array(y_hat)
    y_hat=y_hat.reshape(m)
    test_y=test_y.reshape(m)
    accuracy = (y_hat==np.squeeze(test_y)).sum()/len(test_x)
    
    return accuracy

In [76]:
tmp_accuracy = test_logistic_regression(test_x, test_y, freqs_dict, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

Logistic regression model's accuracy = 0.9870
