# Logistic Regression to analyze the positivity of tweets

In [24]:
import nltk
import numpy as np
import matplotlib.pyplot as plt
import random

#### Download the tweet sample supplied by nltk

In [9]:
# Corpus reader for the sample tweets; the download call fetches the data
# package into the local nltk_data directory if it is not already there.
from nltk.corpus import twitter_samples
nltk.download("twitter_samples")

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\howuseeit\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

### 1. Preprocessing the tweets

In [173]:
## Load the raw tweet strings of each sentiment class from the NLTK corpus
positive_tweets, negative_tweets = (
    twitter_samples.strings(corpus_file)
    for corpus_file in ("positive_tweets.json", "negative_tweets.json")
)

In [174]:
## Train/test split: the first 4000 tweets of each class are used for
## training, the remainder for testing (the intended 4:1 proportion).
train_pos, test_pos = positive_tweets[:4000], positive_tweets[4000:]
train_neg, test_neg = negative_tweets[:4000], negative_tweets[4000:]

# Positive tweets come first in both splits, so the labels below line up.
X_train = train_pos + train_neg
X_test = test_pos + test_neg

# Labels: 1 for positive, 0 for negative, stored as column vectors (m, 1).
y_train = np.concatenate(
    [np.ones(len(train_pos)), np.zeros(len(train_neg))]
).reshape(-1, 1)
y_test = np.concatenate(
    [np.ones(len(test_pos)), np.zeros(len(test_neg))]
).reshape(-1, 1)


#### Preprocess the tweets by:
##### 1. using regular expressions to strip unnecessary text (retweet markers, hyperlinks, hash signs)
##### 2. using the nltk stopword list to remove stopwords and punctuation
##### 3. using nltk's PorterStemmer to reduce each word to its stem
##### 4. using the nltk tokenizer to split every tweet into a list of words

In [175]:
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import re
import string

# The stop-word lists are a separate NLTK data package; fetch it into the
# local nltk_data directory if it is not already there.
from nltk.corpus import stopwords
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\howuseeit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [176]:

def tweet_process(tweets):
    """Clean, tokenize, filter and stem a collection of raw tweets.

    Each tweet goes through the following steps:

    1. Strip a leading retweet marker (``RT``), hyperlinks and ``#`` signs.
    2. Tokenize with ``TweetTokenizer`` (lower-cased, handles stripped,
       repeated characters shortened).
    3. Drop English stopwords and punctuation tokens.
    4. Stem the remaining tokens with ``PorterStemmer``.

    Args:
        tweets: iterable of raw tweet strings.

    Returns:
        A list with one entry per input tweet, each entry being the list of
        stemmed tokens that survived the filtering.
    """
    # These objects do not depend on the individual tweet, so build them once
    # instead of once per tweet (or, for the stopword list, once per token).
    tokenizer = TweetTokenizer(preserve_case=False,
                               strip_handles=True,
                               reduce_len=True)
    stemmer = PorterStemmer()
    english_stopwords = set(stopwords.words("english"))

    tweets_after_cleaning = []

    for tweet in tweets:
        # Strip retweet marker, hyperlinks and hash signs.
        tweet = re.sub(r'^RT[\s]+', '', tweet)
        # Only the URL itself is removed; the previous pattern
        # (``https?:\/\/.*[\r\n]*``) also swallowed every word that followed
        # the link on the same line.
        tweet = re.sub(r'https?://\S+', '', tweet)
        tweet = re.sub(r'#', '', tweet)

        tokens = tokenizer.tokenize(tweet)

        # Build a new list rather than calling ``remove`` on the list being
        # iterated: removing in place makes the iterator skip the token that
        # follows each removed one, leaving stopwords/punctuation behind.
        kept = [
            token for token in tokens
            if token not in english_stopwords
            and token not in string.punctuation
        ]

        # Stemming: reduce every remaining word to its stem.
        tweets_after_cleaning.append([stemmer.stem(token) for token in kept])

    return tweets_after_cleaning


### Build a word-frequency dictionary keyed by (word, label), so that each distinct pair is stored once together with the number of times that word occurs in tweets of that label

In [90]:

def word_frequency(Y, tweets):
    """Count how often each word occurs in tweets of each label.

    Args:
        Y: array-like of labels, one per tweet, either flat ``(m,)`` or a
            column vector ``(m, 1)``.
        tweets: list of preprocessed tweets, each a list of word tokens.

    Returns:
        dict mapping ``(word, label)`` to the number of occurrences of
        ``word`` across all tweets carrying ``label``. Labels are stored as
        Python floats (e.g. ``1.0`` / ``0.0``) when ``Y`` is a float array.
    """
    # ``ravel`` always yields a 1-D sequence. ``np.squeeze`` collapsed a
    # single-sample ``Y`` to a 0-d array whose ``tolist()`` is a bare float,
    # which ``zip`` cannot iterate.
    ys = np.asarray(Y).ravel().tolist()
    freqs = {}

    for y, tweet in zip(ys, tweets):
        for word in tweet:
            pair = (word, y)
            freqs[pair] = freqs.get(pair, 0) + 1

    return freqs

#### Extract features of a tweet by summing, over its words:
##### 1. Bias term (always 1)
##### 2. the number of times each word appears in positive tweets
##### 3. the number of times each word appears in negative tweets

In [114]:
def Extracting_features(tweets, freqs):
    """Build the ``(len(tweets), 3)`` feature matrix for a list of tweets.

    Column 0 is the bias term (always 1). Column 1 is the sum, over the
    tweet's words, of the ``(word, 1.0)`` counts in ``freqs``; column 2 is
    the same sum for the ``(word, 0.0)`` counts. Words missing from
    ``freqs`` contribute 0.

    Args:
        tweets: list of preprocessed tweets, each a list of word tokens.
        freqs: dict mapping ``(word, label)`` to an occurrence count.

    Returns:
        numpy array of shape ``(len(tweets), 3)``.
    """
    x = np.zeros((len(tweets), 3))

    # Bias term.
    x[:, 0] = 1

    # ``enumerate`` gives each tweet its own row. Looking the row up with
    # ``tweets.index(tweet)`` was quadratic and, for duplicate tweets, added
    # every copy's counts to the first occurrence while later rows stayed 0.
    for row, tweet in enumerate(tweets):
        for word in tweet:
            x[row, 1] += freqs.get((word, 1.0), 0)
            x[row, 2] += freqs.get((word, 0.0), 0)

    return x
            

##### Define sigmoid function
##### Use Gradient Descent to optimize the theta

In [187]:
def sigmoid(Z):
    """Return the logistic function ``1 / (1 + exp(-Z))``, element-wise."""
    return 1 / (1 + np.exp(-Z))


def _cross_entropy_cost(y, y_hat, m):
    """Return the mean binary cross-entropy of predictions ``y_hat``."""
    # Keep the predictions strictly inside (0, 1) so that log() stays finite
    # when the sigmoid saturates.
    eps = 1e-15
    y_hat = np.clip(y_hat, eps, 1 - eps)
    return float(-np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) / m)


def GradientDescent(x, y, theta, alpha, num_iters):
    """Fit logistic-regression weights with batch gradient descent.

    Args:
        x: feature matrix of shape ``(m, n)``.
        y: labels, expected as a column vector ``(m, 1)`` matching the shape
            of ``sigmoid(x @ theta)``.
        theta: initial weights of shape ``(n, 1)``; not modified in place.
        alpha: learning rate.
        num_iters: number of gradient steps to take.

    Returns:
        ``(J, theta)`` where ``theta`` is the weight vector after
        ``num_iters`` updates and ``J`` is the cross-entropy cost (a float)
        of the predictions used for the last update. With ``num_iters == 0``
        ``J`` is the cost of the unchanged initial ``theta``.
    """
    m = x.shape[0]

    y_hat = None
    for _ in range(num_iters):
        y_hat = sigmoid(np.dot(x, theta))
        theta = theta - alpha / m * np.dot(x.transpose(), (y_hat - y))

    if y_hat is None:
        # No update was performed: report the cost of the starting weights
        # instead of failing on an unbound cost variable.
        y_hat = sigmoid(np.dot(x, theta))

    # The cost is only needed once, so it is computed after the loop. The
    # previous expression omitted the logarithms and therefore did not
    # measure cross-entropy at all.
    J = _cross_entropy_cost(y, y_hat, m)

    return J, theta

In [120]:
def predict(tweets, freqs, theta):
    """Return the predicted probability of the positive class per tweet.

    The tweets are turned into feature rows with ``Extracting_features`` and
    passed through ``sigmoid(features . theta)``.
    """
    features = Extracting_features(tweets, freqs)
    scores = np.dot(features, theta)
    return sigmoid(scores)

def accuracy(y_hat, y):
    """Return the fraction of predictions that match the labels ``y``.

    A prediction counts as positive (1.0) when its probability is strictly
    greater than 0.5, otherwise as negative (0.0).
    """
    # Threshold every probability into a hard 0/1 label.
    labels = np.array([1.0 if prob > 0.5 else 0.0 for prob in y_hat])

    # Compare against the true labels laid out in the same shape.
    hits = np.sum(labels == y.reshape(labels.shape))
    return hits / len(labels)

In [None]:
## Clean, tokenize and stem both the training and the test tweets
X_train_post, X_test_post = tweet_process(X_train), tweet_process(X_test)

## (word, label) -> count table, built from the training tweets only
freqs = word_frequency(y_train, X_train_post)

## Feature matrix of the training set: bias, positive count, negative count
features = Extracting_features(X_train_post, freqs)

## Train theta by gradient descent: zero start, alpha = 1e-8, 1500 steps
J, theta = GradientDescent(
    x=features,
    y=y_train,
    theta=np.zeros((3, 1)),
    alpha=1e-8,
    num_iters=1500,
)

## Predicted probabilities for the test tweets
predicts = predict(X_test_post, freqs, theta)

## Share of test tweets classified correctly
accuracy(predicts, y_test)