In [1]:
import re
import string
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer

# Data Creation

In [2]:
# Load only the two columns the model uses (rating and review text), keep
# them in (Score, Text) order, and drop exact duplicate rows. Chaining
# avoids mutating a column slice in place.
all_df = (
    pd.read_csv('../Reviews.csv', usecols=['Score', 'Text'])[['Score', 'Text']]
    .drop_duplicates()
)

In [3]:
# Only a score of exactly 5 is treated as positive; every other score is
# treated as negative. Both groups end up as plain lists of review strings.
is_five_star = all_df['Score'] == 5

positive_reviews = all_df.loc[is_five_star, 'Text'].astype(str).tolist()
negative_reviews = all_df.loc[~is_five_star, 'Text'].astype(str).tolist()

In [4]:
# Split each class separately: the first 80% of its reviews go to training,
# the remaining 20% are held out for testing (validation set).
pos_cutoff = int(len(positive_reviews) * 0.8)
neg_cutoff = int(len(negative_reviews) * 0.8)

train_pos, test_pos = positive_reviews[:pos_cutoff], positive_reviews[pos_cutoff:]
train_neg, test_neg = negative_reviews[:neg_cutoff], negative_reviews[neg_cutoff:]

In [5]:
# Positive reviews come first, followed by the negative ones, in both splits.
train_x = [*train_pos, *train_neg]
test_x = [*test_pos, *test_neg]

In [6]:
# Build (m, 1) label columns in the same order as the reviews:
# ones for the positive block, then zeros for the negative block.
train_y = np.concatenate(
    (np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1))), axis=0
)
test_y = np.concatenate(
    (np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1))), axis=0
)

In [7]:
# Show the shapes of the training and test label arrays
print(f"train_y.shape = {train_y.shape}")
print(f"test_y.shape = {test_y.shape}")

train_y.shape = (314940, 1)
test_y.shape = (78735, 1)


# Preprocessing  
### Stemming, remove stop words, tokenize reviews and build frequency dictionary

In [8]:
_REVIEW_RESOURCES = None  # lazily built (stemmer, stop-word set, tokenizer)


def _get_review_resources():
    """Return the shared (stemmer, stop-word set, tokenizer), building it on first use."""
    global _REVIEW_RESOURCES
    if _REVIEW_RESOURCES is None:
        _REVIEW_RESOURCES = (
            SnowballStemmer('english'),
            # A frozenset gives constant-time membership tests instead of
            # scanning the stop-word list for every token.
            frozenset(stopwords.words('english')),
            TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True),
        )
    return _REVIEW_RESOURCES


def process_review(review):
    """Clean, tokenize and stem a single review.

    The stemmer, stop-word collection and tokenizer are created once and
    reused across calls, because this function is invoked for every review
    in the data set.

    Input:
        review: a string containing a review
    Output:
        reviews_clean: a list of stemmed, lower-cased tokens with stop words
        and punctuation tokens removed
    """
    stemmer, stopwords_english, tokenizer = _get_review_resources()

    # remove html tags like <br /> (.*? is the non-greedy form of .*)
    review = re.sub(r'<.*?>', ' ', review)
    # remove --- first, then any remaining --
    review = re.sub(r'---', ' ', review)
    review = re.sub(r'--', ' ', review)
    # remove digits
    review = re.sub(r'[0-9]', '', review)

    # tokenize review (lower-cases, strips @handles, shortens repeated chars)
    review_tokens = tokenizer.tokenize(review)

    reviews_clean = []
    for word in review_tokens:
        # `in string.punctuation` is a substring test, so a token is dropped
        # when it appears as a contiguous run inside string.punctuation.
        if word not in stopwords_english and word not in string.punctuation:
            reviews_clean.append(stemmer.stem(word))

    return reviews_clean

In [9]:
def build_freqs(reviews, ys):
    """Count how often each stemmed word occurs under each sentiment label.

    Input:
        reviews: a list of reviews
        ys: an m x 1 array with the sentiment label of each review
            (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    # Flatten the labels to a 1-D list so they can be zipped with the reviews.
    # np.ravel always yields a 1-D array, so a single label still becomes a
    # one-element list (np.squeeze would collapse it to a bare scalar, which
    # zip cannot iterate).
    yslist = np.ravel(ys).tolist()

    freqs = {}
    for y, review in zip(yslist, reviews):
        for word in process_review(review):
            pair = (word, y)
            freqs[pair] = freqs.get(pair, 0) + 1

    return freqs

In [10]:
# Pick two sample reviews and show each one before and after processing.
review_1 = positive_reviews[101]
review_2 = positive_reviews[10700]
for sample in (review_1, review_2):
    print(sample, '\n')
    print(process_review(sample), '\n')

# For this demonstration the first review is labelled 1 and the second 0.
review_y = np.array([[1.0], [0.0]])

review = [review_1, review_2]
freqs = build_freqs(review, review_y)  # frequency dictionary for the two samples

print('frequencies:')
print(freqs)

This is one of the best salsas that I have found in a long time but stay away from the variety pack. The other two that come with it are not worth your money. 

['one', 'best', 'salsa', 'found', 'long', 'time', 'stay', 'away', 'varieti', 'pack', 'two', 'come', 'worth', 'money'] 

You can buy this at the pet store for about 2.19 a can . on the web here this is a rip. Sorry 

['buy', 'pet', 'store', 'web', 'rip', 'sorri'] 

frequencies:
{('one', 1.0): 1, ('best', 1.0): 1, ('salsa', 1.0): 1, ('found', 1.0): 1, ('long', 1.0): 1, ('time', 1.0): 1, ('stay', 1.0): 1, ('away', 1.0): 1, ('varieti', 1.0): 1, ('pack', 1.0): 1, ('two', 1.0): 1, ('come', 1.0): 1, ('worth', 1.0): 1, ('money', 1.0): 1, ('buy', 0.0): 1, ('pet', 0.0): 1, ('store', 0.0): 1, ('web', 0.0): 1, ('rip', 0.0): 1, ('sorri', 0.0): 1}


In [11]:
# Build the (word, label) frequency dictionary from the training split.
freqs = build_freqs(reviews=train_x, ys=train_y)

# Logistic regression

Logistic regression takes a regular linear regression, and applies a sigmoid to the output of the linear regression.  
Logistic regression:
$$ h(z) = \frac{1}{1+e^{-z}}$$
$$z = \theta_0 x_0 + \theta_1 x_1 + \theta_2 x_2 + \cdots + \theta_N x_N$$  
  
The cost function used for logistic regression is the average of the log loss across all training examples:

$$J(\theta) = -\frac{1}{m} \sum_{i=1}^m \left[ y^{(i)}\log (h(z(\theta)^{(i)})) + (1-y^{(i)})\log (1-h(z(\theta)^{(i)})) \right] $$  

The gradient of the cost function $J$ with respect to one of the weights $\theta_j$ is:

$$\nabla_{\theta_j}J(\theta) = \frac{1}{m} \sum_{i=1}^m(h^{(i)}-y^{(i)})x^{(i)}_j $$

To update the weight $\theta_j$, we adjust it by subtracting a fraction of the gradient determined by $\alpha$:
$$\theta_j = \theta_j - \alpha \times \nabla_{\theta_j}J(\theta) $$
$$\theta = \theta - \frac{\alpha}{m} \times (x^{T} \cdot (h-y)) $$

In [12]:
def sigmoid(z):
    '''
    Apply the logistic function 1 / (1 + e^(-z)) element-wise.

    Input:
        z: the input (a scalar or an array)
    Output:
        the sigmoid of z, with the same shape as z
    '''
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [13]:
def gradientDescent(x, y, theta, alpha, num_iters):
    '''
    Train logistic-regression weights with batch gradient descent.

    Input:
        x: matrix of features which is (m,n+1)
        y: corresponding labels of the input matrix x, dimensions (m,1)
        theta: weight vector of dimension (n+1,1)
        alpha: learning rate
        num_iters: number of iterations you want to train your model for
    Output:
        J: the cost (a Python float) evaluated with the weights that entered
           the last iteration, i.e. just before the final update; with
           num_iters == 0 it is the cost of the initial theta
        theta: your final weight vector
    '''
    # the number of rows in matrix x
    m = x.shape[0]

    h = None
    for _ in range(num_iters):
        # hypothesis for the current weights: sigmoid of x . theta
        h = sigmoid(np.dot(x, theta))

        # update the weights theta
        theta = theta - (alpha / m) * np.dot(x.T, (h - y))

    if h is None:
        # No iteration ran; still report the cost of the supplied weights.
        h = sigmoid(np.dot(x, theta))

    # Keep the hypothesis strictly inside (0, 1) so that a saturated sigmoid
    # does not produce log(0) = -inf and a NaN cost.
    eps = np.finfo(np.float64).eps
    h_safe = np.clip(np.asarray(h, dtype=np.float64), eps, 1 - eps)

    # The cost is only needed once, for the returned value.
    J = -1. / m * (np.dot(y.T, np.log(h_safe)) + np.dot((1 - y).T, np.log(1 - h_safe)))

    # .item() extracts the single value without relying on the deprecated
    # implicit conversion of a (1, 1) array to float.
    return np.asarray(J).item(), theta

In [14]:
def extract_features(review, freqs):
    '''
    Turn one review into the three-element feature row used by the model.

    Input:
        review: a string of words for one review
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output:
        x: a feature vector of dimension (1,3): [bias, sum of positive-label
           frequencies, sum of negative-label frequencies]
    '''
    # process_review tokenizes, stems, and removes stopwords
    tokens = process_review(review)

    # running totals over every token occurrence (repeated tokens count again)
    positive_total = 0.0
    negative_total = 0.0
    for token in tokens:
        positive_total += freqs.get((token, 1), 0)
        negative_total += freqs.get((token, 0), 0)

    # the leading 1 is the bias term
    x = np.array([[1.0, positive_total, negative_total]])

    assert x.shape == (1, 3)
    return x

# Training

In [15]:
# Build the training feature matrix 'X': one 3-element feature row per review.
X = np.zeros((len(train_x), 3))
for row, text in enumerate(train_x):
    X[row, :] = extract_features(text, freqs)

# Labels that go with the rows of X
Y = train_y

In [16]:
# Run gradient descent starting from all-zero weights.
initial_theta = np.zeros((3, 1))
J, theta = gradientDescent(X, Y, theta=initial_theta, alpha=1e-11, num_iters=10000)

In [17]:
# Report the cost and the learned weights, rounded to 8 decimal places.
rounded_weights = [round(t, 8) for t in np.squeeze(theta)]
print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {rounded_weights}")

The cost after training is 0.56264156.
The resulting vector of weights is [0.0, 1.853e-05, -2.627e-05]


# Testing

In [18]:
def predict_review(review, freqs, theta):
    '''
    Score a single review with the trained logistic-regression weights.

    Input:
        review: a string
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
        theta: (3,1) vector of weights
    Output:
        y_pred: the sigmoid of (features . theta), i.e. the predicted
                probability that the review is positive
    '''
    features = extract_features(review, freqs)
    logit = np.dot(features, theta)
    return sigmoid(logit)

In [19]:
def test_logistic_regression(test_x, freqs, theta):
    """
    Input: 
        test_x: a list of reviews
        freqs: a dictionary with the frequency of each pair (or tuple)
        theta: weight vector of dimension (3, 1)
    Output: 
        y_hat: the predictions
    """
    # the list for storing predictions
    y_hat = []
    
    for review in test_x:
        # get the label prediction for the review
        y_pred = predict_review(review, freqs, theta)
        y_hat.append(1.0) if y_pred > 0.5 else y_hat.append(0)

    return y_hat

In [20]:
# Predicted labels (1.0 or 0) for every review in the test split.
y_hat = test_logistic_regression(test_x=test_x, freqs=freqs, theta=theta)

# Scores

In [21]:
def scoring(test_y, y_hat):
    """
    Compute binary classification metrics, treating label 1 as positive.

    The confusion counts are taken directly from the label arrays, so the
    function also works when a class is absent from the labels or from the
    predictions (e.g. the model predicts only one class).

    Input:
        test_y: (m, 1) vector with the corresponding labels for the list of reviews
                (a flat sequence of m labels is accepted as well)
        y_hat: the predictions (sequence of m values, 1 for positive)
    Output:
        accuracy: (# of reviews classified correctly) / (total # of reviews)
        precision: TP/(TP+FP)
        recall: TP/(TP+FN)
        f1_score: 2*(precision*recall)/(precision+recall)
        Any ratio whose denominator is zero is reported as 0.0.
    Raises:
        ValueError: if test_y and y_hat do not contain the same number of items
    """
    actual = np.ravel(np.asarray(test_y)) == 1
    predicted = np.ravel(np.asarray(y_hat)) == 1
    if actual.shape != predicted.shape:
        raise ValueError(
            f"test_y has {actual.size} labels but y_hat has {predicted.size} predictions"
        )

    tp = int(np.sum(actual & predicted))
    tn = int(np.sum(~actual & ~predicted))
    fp = int(np.sum(~actual & predicted))
    fn = int(np.sum(actual & ~predicted))

    def _ratio(numerator, denominator):
        # An undefined metric (zero denominator) is reported as 0.0 rather than NaN.
        return numerator / denominator if denominator else 0.0

    accuracy = _ratio(tn + tp, tn + tp + fp + fn)
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    f1_score = _ratio(2 * (precision * recall), precision + recall)

    return accuracy, precision, recall, f1_score

In [22]:
# Evaluate the predictions on the test split and report each metric.
scores = scoring(test_y, y_hat)
accuracy, precision, recall, f1_score = scores
print(f"Logistic regression model's accuracy = {accuracy:.4f}")
print(f"Logistic regression model's precision = {precision:.4f}")
print(f"Logistic regression model's recall = {recall:.4f}")
print(f"Logistic regression model's f1-score = {f1_score:.4f}")

Logistic regression model's accuracy = 0.7325
Logistic regression model's precision = 0.7649
Logistic regression model's recall = 0.8374
Logistic regression model's f1-score = 0.7995


# Error Analysis

In [None]:
# Print every misclassified test review with its processed tokens, its true
# label and the predicted probability.
# Loop-local names are used so the `y_hat` list of test predictions is not
# overwritten by a single probability.
for text, label in zip(test_x, np.ravel(test_y)):
    # .item() pulls the scalar out of the (1, 1) prediction array so that the
    # comparison and the %-formatting below operate on plain numbers.
    probability = predict_review(text, freqs, theta).item()
    predicted_label = 1.0 if probability > 0.5 else 0.0

    if predicted_label != label:
        print('THE REVIEW IS:', text)
        print('THE PROCESSED REVIEW IS:', process_review(text))
        print('%d\t%0.8f\t' % (label, probability))

# Predict my own review

In [23]:
# Run the whole pipeline on a hand-written review.
my_review = 'This food tasted meh'
print(process_review(my_review))

y_pred = predict_review(my_review, freqs, theta)
print(y_pred)

# A probability above 0.5 is reported as positive, anything else as negative.
print('Positive sentiment' if y_pred > 0.5 else 'Negative sentiment')

['food', 'tast', 'meh']
[[0.45068514]]
Negative sentiment
