# Aim:
* Extract features for logistic regression given some text
* Implement logistic regression from scratch
* Apply logistic regression on a natural language processing task
* Test logistic regression

We will be using a data set of tweets.

## Import functions and data

In [59]:
import nltk
from nltk.corpus import twitter_samples 
import pandas as pd
import tensorflow as tf
from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()

In [60]:

nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [61]:
import re
import string
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

# Preprocessing

In [62]:
#process_tweet(): cleans the text, tokenizes it into separate words, removes stopwords, and converts words to stems.
def process_tweet(tweet):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    """
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
      if (word not in stopwords_english and word not in string.punctuation):  
        stem_word = stemmer.stem(word)
        tweets_clean.append(stem_word)
    return tweets_clean

In [63]:
#build_freqs counts how often a word in the 'corpus' (the entire set of tweets) was associated with
  # a positive label '1'         or 
  # a negative label '0', 

#then builds the freqs dictionary, where each key is a (word,label) tuple, 

#and the value is the count of its frequency within the corpus of tweets.

def build_freqs(tweets, ys):
    """Build frequencies.
    Input:
        tweets: a list of tweets
        ys: an m x 1 array with the sentiment label of each tweet
            (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    yslist = np.squeeze(ys).tolist()
    freqs = {}

    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            #Update the count of pair if present, set it to 1 otherwise
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1

    return freqs

### Prepare the data
* The `twitter_samples` contains subsets of 5,000 positive tweets, 5,000 negative tweets, and the full set of 10,000 tweets.  

In [64]:
# select the set of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [65]:
# split the data into two pieces, one for training and one for testing
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

x_train = np.array(train_pos + train_neg)
x_test = np.array(test_pos + test_neg)

In [66]:
# combine positive and negative labels
y_train = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
y_test = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [67]:
# create frequency dictionary
freqs = build_freqs(x_train,y_train)

type(freqs) = <class 'dict'>
len(freqs) = 11346


In [69]:
def extract_features(tweet, freqs):
    '''
    Input: 
        tweet: a list of words for one tweet
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output: 
        x: a feature vector of dimension (1,3)
    '''
    # tokenizes, stems, and removes stopwords
    #############################################################
    word_l =  process_tweet(tweet)
    
    x = np.zeros((1, 2)) 
        
    # loop through each word in the list of words
    for word in word_l:
        
        # increment the word count for the positive label 1
        x[0,0] += freqs.get((word, 1.0),0)
        
        # increment the word count for the negative label 0
        x[0,1] += freqs.get((word, 0.0),0)
        
    
    assert(x.shape == (1, 2))
    return x

# Model

In [86]:
class TweetLogisticRegressionModel:
  
  def fit(self, x, y,freq, alpha = 1e-4, epochs = 20, threshold_val = 0.5, record_epoch = False):
    self.threshold_val = threshold_val
    self._freq = freq
    self._shape = 2
    self._inputs = x
    self._tlen = len(y)

    # return sample of standard normal distribution
    self._bias = tf.Variable(np.random.randn(1), name = "Bias")
    self._weights = tf.Variable(np.random.randn(1, self._shape), name = "Weight")
    self._tinit = tf.compat.v1.global_variables_initializer()
    
    self._targets = y
    self.alpha = alpha
    self.epochs = epochs
    self._repoch = record_epoch
    self.__trainModel()
    return


  def __initSaver(self):
    saver = tf.compat.v1.train.Saver()
    self._saver = saver
    self._sesspath = 'TSession'
    return


  def threshold(self, val):
    if (val >= self.threshold_val):
      return (1)
    return (0)


  def predict(self, indata):
    with tf.compat.v1.Session() as sess:
      self._saver.restore(sess,save_path = self._sesspath)
      indata = self.__transform_data(indata)
      return sess.run(tf.nn.sigmoid(tf.add(tf.matmul(a = indata, b = self._weights, transpose_b = True), self._bias)))
    print("Failed To Retrieve Session")
    return


  def getAccDets(self):
    if self._repoch:
      return self.__accurary_det
    return


  def __trainModel(self):
    if self._repoch:
      self._err = []
      self._precs = []
      self.__accurary_det = []
    
    # model of logistic regreesion of tensorflow.
    logi = tf.nn.sigmoid(tf.add(tf.matmul(a = self._inputs, b = self._weights, transpose_b = True), self._bias))
    err = tf.nn.sigmoid_cross_entropy_with_logits(logits = logi, labels = self._targets)
    temp = tf.compat.v1.train.GradientDescentOptimizer(learning_rate = self.alpha, name='GradientDescent').minimize(err)
    self.__initSaver()
    with tf.compat.v1.Session() as sess:
      sess.run(self._tinit)
      print("Bias", sess.run(self._bias))
      print("Weights", sess.run(self._weights))
      for epoch in range(self.epochs):
        sess.run(temp)
        __preds = sess.run(logi)
        acc = ((__preds == self._targets).sum()) / self._tlen
        if self._repoch:
          self.__accurary_det.append(acc)
        if epoch % 10 == 0:
          print("Acc:", acc)
        # print(self.__accurary_det)
      self._saver.save(sess, self._sesspath)
    return

In [87]:
X = np.zeros((len(x_train), 2))
for i in range(len(x_train)):
    X[i, :]= extract_features(x_train[i], freqs)

model = TweetLogisticRegressionModel()
model.fit(x=X,y=y_train,freq=freqs, alpha=0.01, record_epoch=True, epochs = 100)

Bias [0.36196477]
Weights [[-0.51923439 -1.14085343]]
Acc: 0.469375
Acc: 0.468125
Acc: 0.46725
Acc: 0.4655
Acc: 0.532875
Acc: 0.532875
Acc: 0.532875
Acc: 0.532875
Acc: 0.532875
Acc: 0.532875
