In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import twitter_samples
import numpy as np
import re
import string

In [2]:
def process_tweet(tweet):
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    tweet = re.sub(r'\$\w*', '', tweet)# remove stock market tickers like $GE
    tweet = re.sub(r'^RT[\s]+', '', tweet)# remove old style retweet text "RT"
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)# remove hyperlinks
    tweet = re.sub(r'#', '', tweet)# only removing the hash # sign from the word
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,reduce_len=True)# tokenize tweets
    tweet_tokens = tokenizer.tokenize(tweet) #array of words 
    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean
    

In [3]:
#Gives frequency dictionary {(word,sentiment):frequency}
def frequency_dict(tweets, ys):
    yslist = np.squeeze(ys).tolist()
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1
    return freqs

In [4]:
def sigmoid(z): 
    h = 1/(1+np.exp(-z))
    return h

In [5]:
"""
x - matrix of features (m,n+1)
y - Labels(m,1)
theta - weight vector(n+1,1)
alpha - learning rate
num_iters - number of iterations
J - cost
"""

def gradient_descent(x, y, theta, alpha, num_iters):
    m = x.shape[0]
    for i in range(0, num_iters):
        z = np.dot(x,theta)
        h = sigmoid(z)
        J=-(np.sum(y * np.log(h) + (1-y) * np.log(1-h)))/m 
        # update the weights theta
        theta = theta - ((alpha/m)*(np.dot(x.T,(h-y))))
        if J<=0.0001:
            break;
    J = J.astype(float)
    return J, theta

In [6]:
#feature vector of (1,3)
def extract_features(tweet, freqs):
    word_l = process_tweet(tweet)
    x = np.zeros((1, 3)) 
    x[0,0] = 1 
    for word in word_l:
        pair=(word,0)
        pair1=(word,1)
        if pair not in freqs:
            freqs[pair]=0
        if pair1 not in freqs:
            freqs[pair1]=0
        x[0,1] += freqs[pair1]
        x[0,2] += freqs[pair]      
    assert(x.shape == (1, 3))
    return x

In [7]:
#return the probability of the tweet being positive and negative
def predict_tweet(tweet, freqs, theta):
    x = extract_features(tweet,freqs)
    y_pred = sigmoid(np.dot(x,theta))
    return y_pred

In [8]:
#Return accuacy of the model
def accuracy(test_x, test_y, freqs, theta):
    m=test_y.shape[0]
    y_hat = []
    accuracy=0
    for tweet in test_x:
        y_pred = predict_tweet(tweet, freqs, theta)
        if y_pred > 0.5:
            y_hat.append(1)
        else:
            y_hat.append(0)

    for i in range(m):
        if y_hat[i]==test_y[i]:
            accuracy+=1 
            
    accuracy = accuracy/m
    return accuracy

In [9]:
def sentiment_analysis(tweet,freqs,theta):
    y_h = predict_tweet(tweet,freqs,theta)
    if y_h>0.5:
        print("POSITIVE")
    else:
        print("NEGATIVE")

In [10]:
# select the set of positive and negative tweets
#Download twitter_samples and stopwords if you haven't already
#nltk.download('twitter_samples') 
#nltk.download('stopwords')
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [11]:
# split the data into two pieces, one for training and one for testing (validation set) 
# Since this is not a very big set, we will split it into 80%-training and 20%-validation
# If it is a very big set, we will spilt it into 98%-training and 2%-validation
#5000 positive tweets and 5000 negative tweets
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]
train_x = train_pos + train_neg 
test_x = test_pos + test_neg

In [12]:
# Creating positive and negative labels
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [13]:
freq_dict=frequency_dict(train_x,train_y)

In [14]:
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    X[i, :]= extract_features(train_x[i], freq_dict)

J, theta = gradient_descent(X, train_y, np.zeros((3, 1)), 1e-9, 1500)

In [15]:
print("ACCURACY OF THE MODEL: {}%".format(accuracy(test_x,test_y,freq_dict,theta)*100))

ACCURACY OF THE MODEL: 99.5%


In [16]:
tweet1 = "Today my class was boring but i enjoyed"
tweet2 = "I am very much sad today"
sentiment_analysis(tweet1,freq_dict,theta)
sentiment_analysis(tweet2,freq_dict,theta)

POSITIVE
NEGATIVE


In [17]:
print("Cost: {} \nTheta:\n {}".format(J,theta))

Cost: 0.24216528822055303 
Theta:
 [[ 7.25244119e-08]
 [ 5.23898414e-04]
 [-5.55171267e-04]]
