# Import Necessary Libraries

In [1]:
# Libraries
from keras.src.legacy.preprocessing.text import tokenizer_from_json
import numpy as np

import re, string, nltk
from nltk.corpus import stopwords, twitter_samples
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

import pickle

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\steve\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\steve\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Preprocessing of tweets that is our data

In [2]:

def process_tweet(post):
    lemmatizer = WordNetLemmatizer()
    stopwords_english = set(stopwords.words('english'))
    post = re.sub(r'\$\w*', '', post)
    post = re.sub(r'^RT[\s]+', '', post)
    post = re.sub(r'https?:\/\/.*[\r\n]*', '', post)
    post = re.sub(r'#', '', post)

    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tokens = tokenizer.tokenize(post)

    cleaned = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stopwords_english and word not in string.punctuation
    ]
    return cleaned

# Feature Set (Frequency of words)

In [3]:
# Build a frequency dictionary for each word in tweets to train the model using Logistic Regression

def build_freqs(tweets, ys):
    """
    Build frequency dictionary for each word in tweets

    Input:
        tweets: a list of tweets
        ys: an array of labels for the tweets. an m x 1 array with the sentiment label of each tweet (either 0 or 1).

    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its frequency
    """

    # Convert np array to list since zip needs an iterable.
    # The squeeze is necessary or the list ends up with one element.
    # The list just needs to be a list, not a numpy array.
    # Also note that this is just a NOP if ys is already a list.
    yslist = np.squeeze(ys).tolist()

    # Start with an empty dictionary and populate it by looping over all words
    # and over all labels (0, 1) to record the number of occurrences of each word
    # with its corresponding label.
    freqs = {}

    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1
    return freqs

In [4]:
# Testing the above code
tweets = ["I am happy", "I love this car", "I do not like this car", "I do not like the sound of the alarm",  "I am sleepy", "I am tired", "I am tired"]
ys = [1, 1, 0, 0, 1, 0, 0]
res = build_freqs(tweets, ys)
print(res)

{('happy', 1): 1, ('love', 1): 1, ('car', 1): 1, ('like', 0): 2, ('car', 0): 1, ('sound', 0): 1, ('alarm', 0): 1, ('sleepy', 1): 1, ('tired', 0): 2}


# Getting the Data

In [5]:
nltk.download('twitter_samples')
# Select Post of negative and positive tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

print('Number of positive tweets: ', len(all_positive_tweets))
print('Number of negative tweets: ', len(all_negative_tweets))

print('The first tweet looks like this:\n', all_positive_tweets[0])

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\steve\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


Number of positive tweets:  5000
Number of negative tweets:  5000
The first tweet looks like this:
 #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)


# Splitting the Data

In [6]:
# Splitting the data into train and test
train_pos = all_positive_tweets[:4000]
train_neg = all_negative_tweets[:4000]
test_pos = all_positive_tweets[4000:]
test_neg = all_negative_tweets[4000:]


In [7]:
train_x = train_pos + train_neg
test_x = test_pos + test_neg

In [8]:
# Combine positive and negative labels
# Building the y - target variable here
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [9]:
# Create frequency dictionary
freqs = build_freqs(train_x, train_y)
print("type(freqs) = ", type(freqs))
print("len(freqs) = ", len(freqs))

print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))

type(freqs) =  <class 'dict'>
len(freqs) =  12385
type(freqs) = <class 'dict'>
len(freqs) = 12385


In [10]:
# Test the function below
print('This is an example of a positive tweet: \n', train_x[24])
print('Label: ', train_y[24])

print("\n This is a processed version of the tweet: \n", process_tweet(train_x[24]))

This is an example of a positive tweet: 
 💅🏽💋 - :)))) haven't seen you in years
Label:  [1.]

 This is a processed version of the tweet: 
 ['💅🏽', '💋', ':)', 'seen', 'year']


# Building the Logistic Regression Model

In [46]:
# Sigmoid Function

def sigmoid(z):
    """
    Input:
        z: is the input (can be a scalar or an array)
    Output:
        h: the sigmoid of z
    """
    zz = np.negative(z)
    h = 1 / (1 + np.exp(zz))
    return h

In [47]:
# Cost Function and Gradient
def gradient_descent(x, y, theta, alpha, num_iters):
    """
    Input:
        x: matrix of features which is (m, n+1)
        y: vector of labels. corresponding labels of the input matrix x, dimensions (m, 1)
        theta: vector of parameters. weight vector of dimension (n+1, 1)
        alpha: learning rate
        num_iters: number of iterations you want to train your model for
    Output:
        theta: vector of parameters. final weight vector after training
        J: vector of cost for each iteration.  the final cost is J[-1]
    Hint: might want to print the cost to make sure that it is going down
    """
    # m = x.shape[0]
    #
    # get 'm', the number of rows in matrix x
    m = x.shape[0]
    for i in range(0, num_iters):
        z = np.dot(x, theta)
        h = sigmoid(z)
        # calculate the cost function
        cost = -1. / m * (np.dot(y.transpose(), np.log(h)) + np.dot((1 - y).transpose(), np.log(1 - h)))
        # update the weights theta
        theta = theta - (alpha / m) * np.dot(x.transpose(), (h - y))

    cost = float(cost)
    return cost, theta



# Extracting the Features

In [48]:
# Extracting features

def extract_features(tweet, freqs):
    """
    :param tweet: a list of words for one tweet
    :param freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    :return: x : a feature vector of dimension (1, n) where n = 3 (number of features)
    """
    # process_tweet tokenizes the tweet
    word_l = process_tweet(tweet)
    x = np.zeros((1, 3))

    # bias term is set to 1
    x[0, 0] = 1

    # loop through each word in the tweet
    for word in word_l:
        # increment the word count for the positive label 1
        x[0, 1] += freqs.get((word, 1.0), 0)
        # increment the word count for the negative label 0
        x[0, 2] += freqs.get((word, 0.0), 0)

    assert (x.shape == (1, 3))
    return x

In [49]:
# Test on training data
tmp1 = extract_features(train_x[22], freqs)
print(tmp1)

[[1.000e+00 3.004e+03 1.200e+02]]


# Train the Model

In [51]:
# Training the model

# Collect the features 'x' and stack them into a matrix 'X'
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    X[i, :] = extract_features(train_x[i], freqs)

# training labels corresponding to X
Y = train_y

# Apply gradient descent
# these values are predefined (Andrew NG)
J, theta = gradient_descent(X, Y, np.zeros((3, 1)), 1e-9, 1500)


  cost = float(cost)


# Predict Tweets

In [56]:
def predict_tweet(tweet, freqs, theta):
    """
    Input:
        tweet: a string
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
        theta: (3,1) vector of weights
    Output:
        y_pred: the probability of a tweet being positive or negative (float)
    """
    # extract the features of the tweet and store it into x
    x = extract_features(tweet, freqs)

    # make the prediction using x and theta
    y_pred = sigmoid(np.dot(x, theta))

    return y_pred


In [57]:
# Test Logistic Regression
def test_logistic_regression(test_x, test_y, freqs, theta):
    """

    :param test_x: a list of tweets
    :param test_y: (m,1) vector with the corresponding label of each tweet
    :param freqs: a dictioanry with each frequency of each pair (or tuple)
    :param theta: weight vector of dimension (3,1)
    :return: accuracy: (number of tweets classified correctly) / (total number of tweets)
    """

    # list for stroring predictions
    y_hat = []

    for tweet in test_x:
        # get label prediction for one tweet
        y_pred = predict_tweet(tweet, freqs, theta)
        if y_pred > 0.5:
            # append 1.0 to the list
            y_hat.append(1)
        else:
            # append 0 to the list
            y_hat.append(0)

    # calculate the accuracy
    accuracy = (y_hat == np.squeeze(test_y)).sum() / len(test_x)
    return accuracy

In [58]:
# Test the function
tmp_accuracy = test_logistic_regression(test_x, test_y, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

Logistic regression model's accuracy = 0.9970


# Predict with my own tweet

In [60]:
# Predict

def pre(sentence):
    yhat = predict_tweet(sentence, freqs, theta)

    if yhat > 0.5:
        return "Positive sentiment"
    elif yhat == 0:
        return "Neutral sentiment"
    else:
        return "Negative sentiment"

In [68]:
my_tweet = "Today is my dad\'s funeral"

In [69]:
res = pre(my_tweet)


In [70]:
res

'Negative sentiment'