**Replace Manual version of Logistic Regression with TF based version.**

In [32]:
import nltk
import pandas as pd
import tensorflow.compat.v1 as tf

from nltk.corpus import twitter_samples
from tensorflow.python.framework.ops import disable_eager_execution

tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [33]:
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [34]:
import re
import string
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

Pre-processing the data

In [35]:
# cleans the text, tokenizes it into separate words, removes stopwords, and converts words to stems.
def process_tweet(tweet):
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    
    # remove hashtags: only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if word not in stopwords_english and word not in string.punctuation:
            stem_word = stemmer.stem(word)
            tweets_clean.append(stem_word)
    return tweets_clean

In [36]:
# a dictionary mapping each (word, sentiment) pair to its frequency
def calc_freqs(tweets, ys):
    yslist = np.squeeze(ys).tolist()
    
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet=tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1
    return freqs

Split the data

In [37]:
all_positive_tweets = twitter_samples.strings("positive_tweets.json")
all_negative_tweets = twitter_samples.strings("negative_tweets.json")

# spliting of the data
test_pos = all_positive_tweets[4000:]
test_neg = all_negative_tweets[4000:]
x_test = test_pos + test_neg

train_pos = all_positive_tweets[:4000]
train_neg = all_negative_tweets[:4000]
x_train = train_pos + train_neg

# similarly create labels of training and testing data
y_train = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
y_test = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)
freqs = calc_freqs(x_train, y_train)

In [38]:
# x: a feature vector of dimension (1,2)
def extract_features(tweet, freqs):
    word_l =  process_tweet(tweet)
    x = np.zeros((1, 2)) 
    
    for word in word_l:
        # increment the word count for the positive label 1
        x[0,0] += freqs.get((word, 1.0),0)
        
        # increment the word count for the negative label 0
        x[0,1] += freqs.get((word, 0.0),0)
    
    assert(x.shape == (1, 2))
    return x

Create model class

In [39]:
class TweetLogisticRegressionModel:
    def fit(self, x, y, freq, alpha=1e-4, epochs=30, threshold_val=0.5, record_epoch=False):
        self._inputs = x
        self._tlen = len(y)
        self._freq = freq
        self._shape = 2
        self.threshold_val = threshold_val

        # return sample of standard normal distribution
        self._bias = tf.Variable(np.random.randn(1), name="Bias")
        self._weights = tf.Variable(np.random.randn(1, self._shape), name= "Weight")
        self._tinit = tf.compat.v1.global_variables_initializer()

        self._targets = y
        self.alpha = alpha
        self.epochs = epochs
        self._repoch = record_epoch
        self.__trainModel()
        return

    def __initSaver(self):
        saver = tf.compat.v1.train.Saver()
        self._saver = saver
        self._sesspath = "TSession"
        return

    def threshold(self, val):
        if val >= self.threshold_val:
            return 1
        return 0

    def predict(self, indata):
        with tf.compat.v1.Session() as sess:
            self._saver.restore(sess, save_path=self._sesspath)
            indata = self.__transform_data(indata)
            return sess.run(tf.nn.sigmoid(tf.add(tf.matmul(indata, self._weights, transpose_b=True), self._bias)))
        
        print("Failed to retreive session")
        return
    
    def getAccDets(self):
        if self._repoch:
            return self.__accurary_det
        return
    
    def __trainModel(self):
        if self._repoch:
            self._err = []
            self._precs = []
            self.__accurary_det = []

        # model of Logistic Regression of tensorflow
        logi = tf.nn.sigmoid(tf.add(tf.matmul(self._inputs, b=self._weights, transpose_b=True), self._bias))
        err = tf.nn.sigmoid_cross_entropy_with_logits(logits=logi, labels=self._targets)
        temp = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=self.alpha).minimize(err)
        self.__initSaver()
        with tf.compat.v1.Session() as sess:
            sess.run(self._tinit)
            print(f"Bias: {sess.run(self._bias)}")
            print(f"Weights: {sess.run(self._weights)}")

            for epoch in range(self.epochs):
                sess.run(temp)
                __preds = sess.run(logi)
                acc = ((__preds==self._targets).sum()) / self._tlen
                if self._repoch:
                    self.__accurary_det.append(acc)
                if epoch%10 == 0:
                    print(f"Accuracy: {acc}")
            self._saver.save(sess, self._sesspath)
        return

In [40]:
X = np.zeros((len(x_train), 2))
for i in range(len(x_train)):
    X[i, :] = extract_features(x_train[i], freqs)

model = TweetLogisticRegressionModel()
model.fit(x=X, y= y_train, freq=freqs, alpha=0.01, record_epoch=True, epochs=100)

Bias: [-0.68617635]
Weights: [[0.71908574 0.96995028]]
Accuracy: 0.49625
Accuracy: 0.496375
Accuracy: 0.4965
Accuracy: 0.4965
Accuracy: 0.4965
Accuracy: 0.496875
Accuracy: 0.496875
Accuracy: 0.496875
Accuracy: 0.496875
Accuracy: 0.496875
