# Logistic Regression for Sentiment Analysis

In [1]:
import pandas as pd
import numpy as np


#from nltk.stem.porter import PorterStemmer
import re
import nltk
import string
import gensim
from gensim import corpora, models, similarities

#dealing with operating system , like reading file
import os

#from nltk.corpus import stopwords
from __future__ import division

from sklearn.feature_extraction.text import HashingVectorizer


### Getting a tokenized corpus for a word2vec model

In [2]:
df = pd.read_csv('shuffled_movie_data.csv')

#SOME CLEANING
def remove_punctuation(text):    
    return text.translate(None, string.punctuation) 

# Remove punctuation.
df['review_clean'] = df['review'].apply(remove_punctuation)



In [None]:
# Create the corpus
corpus = df['review_clean'].values.tolist()

# Tokenize corpus
#tok_corpus = [nltk.word_tokenize(sent.decode('utf-8')) for sent in corpus ]

#tokenizastion! saved the trained model here
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

tok_corpus = tokenizer.tokenize(corpus)

In [22]:
# sanity check
print corpus[1]

def sentence_to_wordlist(raw):
    clean = re.sub("[^a-zA-Z]"," ", raw)
    words = clean.split()
    return words

print sentence_to_wordlist(corpus[1])

OK so I really like Kris Kristofferson and his usual easy going delivery of lines in his movies Age has helped him with his soft spoken low energy style and he will steal a scene effortlessly But Disappearance is his misstep Holy Moly this was a bad movie br br I must give kudos to the cinematography and and the actors including Kris for trying their darndest to make sense from this goofy confusing story None of it made sense and Kris probably didnt understand it either and he was just going through the motions hoping someone would come up to him and tell him what it was all about br br I dont care that everyone on this movie was doing out of love for the project or some such nonsense Ive seen low budget movies that had a plot for goodness sake This had none zilcho nada zippo empty of reason a complete waste of good talent scenery and celluloid br br I rented this piece of garbage for a buck and I want my money back I want my 2 hours back I invested on this Grade F waste of my time Don

### Train a word2vec model (using gensim library)

In [8]:
import multiprocessing
#step 2 build our model, another one is Glove
#define hyperparameters

# Dimensionality of the resulting word vectors.
#more dimensions mean more traiig them, but more generalized
num_features = 300

#
# Minimum word count threshold.
min_word_count = 12

# Number of threads to run in parallel.
num_workers = multiprocessing.cpu_count()

# Context window length.
context_size = 7

# Downsample setting for frequent words.
#rate 0 and 1e-5 
#how often to use
downsampling = 1e-3

# Seed for the RNG, to make the results reproducible.
seed = 1

In [9]:
reviews2vec = gensim.models.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling,
)

In [10]:
reviews2vec.build_vocab(tok_corpus)
print("Word2Vec vocabulary length:", len(reviews2vec.wv.vocab))

('Word2Vec vocabulary length:', 61388)


In [11]:
reviews2vec.train(tok_corpus, total_examples=reviews2vec.corpus_count, epochs=reviews2vec.iter)

28585120

### save and load the word2vec model

In [12]:
#save model
if not os.path.exists("trained"):
    os.makedirs("trained")
reviews2vec.save(os.path.join("trained", "reviews2vec.w2v"))

## If you already have trained a model, you can start executing the lines from here

In [13]:
#load model
reviews2vec = gensim.models.Word2Vec.load(os.path.join("trained", "reviews2vec.w2v"))

### Excercise 1: define new features

In [14]:

word_list = ['awful', "awesome", "like","really", "good", "best", "interesting", "worst", "excellent", "poor", "awful", "terrible"  ]

important_words =[]

for word in word_list:    
    top_words = reviews2vec.most_similar(word, topn=10)
    top_words = [ word[0].encode('ascii','ignore') for word in top_words ]
    important_words += top_words
    
from random import shuffle
# shuffle the features
shuffle(important_words)
print important_words

    

['herei', 'subpar', 'comparebr', 'thrillerdrama', 'goodfor', 'actuallybr', 'cheesiest', 'suckedbr', 'storyor', 'moviejust', 'stupidest', 'watchin', 'seenand', 'badi', 'badand', 'badi', 'finest', 'lousy', 'weak', 'nominees', 'dreadful', 'awfulbr', 'funniest', 'horrible', 'dullest', 'badand', 'goodand', 'badmouth', 'catastrophically', 'firstthe', 'terriblethe', 'funnyso', 'crappiest', 'greatthe', 'engagingbr', 'exciting', 'nostalgias', 'goodthe', 'strangest', 'nominationsbr', 'finebr', 'skimmed', 'storywriting', 'wasi', 'dreadful', 'terrible', 'goodi', 'substantive', 'apalling', 'badthe', 'terriblebr', 'fantastic', 'whodoneit', 'dreadfulbr', 'greatthe', 'substandard', 'slowest', 'catastrophically', 'baftas', 'fascinating', 'outstanding', 'sillybut', 'terriblethe', 'superb', 'comedymusical', 'terrific', 'horrible', 'badand', 'bestworst', 'nominationbr', 'terrible', 'abit', 'horrible', 'actium', 'stunningbr', 'exceptional', 'awfulbr', 'halfbad', 'badi', 'badand', 'poorthe', 'goodit', 'supe

In [16]:
reviews2vec.most_similar('awful')

[(u'horrible', 0.6686462163925171),
 (u'terriblebr', 0.6507787704467773),
 (u'dreadful', 0.6422439217567444),
 (u'lousy', 0.6419680118560791),
 (u'awfulbr', 0.641074538230896),
 (u'terrible', 0.6394296288490295),
 (u'catastrophically', 0.6370165944099426),
 (u'badand', 0.6366477012634277),
 (u'terriblethe', 0.6366315484046936),
 (u'stupidthe', 0.6365667581558228)]

In [77]:
#This will take a while
for word in important_words:
    df[word] = df['review_clean'].apply(lambda s : s.split().count(word))

In [89]:
print len(important_words)

#df


120


### Excercise 2: implement a MaxEnt classifier, using regularization, according this https://web.stanford.edu/~jurafsky/slp3/7.pdf

In [79]:
#Split data 
#90% - train set,
#10% - validation set,
train_data, validation_data = np.split(df.sample(frac=1), [int(.9*len(df))])

In [80]:
print train_data.shape
print validation_data.shape
train_data.head()

(45000, 202)
(5000, 202)


Unnamed: 0,review,sentiment,review_clean,atrocious,astounding,sfx,incredible,Wellthe,appalling,BADbr,...,knowall,TERRIBLE,stupidest,welland,Gerrys,intriguing,WEBS,Blevel,lamest,impeccable
24868,A tedious gangster film that leaves you wishin...,0,A tedious gangster film that leaves you wishin...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43562,Joseph L. Mankiewicz's Sleuth didn't need a re...,0,Joseph L Mankiewiczs Sleuth didnt need a remak...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13313,Evan Almighty continues the mainstream Bruce A...,0,Evan Almighty continues the mainstream Bruce A...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49159,"Look, although we don't like to admit it, we'v...",1,Look although we dont like to admit it weve al...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43120,I have lost count of just how many times I hav...,1,I have lost count of just how many times I hav...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [81]:
def get_numpy_data(data_frame, output_name, features_name):
    data_frame_copy = data_frame.copy()
    
    features_frame = pd.core.frame.DataFrame()    
    for feature_name in features_name:
        features_frame[feature_name] = data_frame_copy[feature_name]
    feature_matrix = features_frame.values
    output_sarray = data_frame[output_name]
    output_array = output_sarray.values
    return(feature_matrix, output_array)

In [82]:
#Passing data to numpy matrix and array
feature_matrix_train, sentiment_train = get_numpy_data( train_data, 'sentiment', important_words)
feature_matrix_valid, sentiment_valid = get_numpy_data( validation_data, 'sentiment', important_words) 

In [83]:
sentiment_train = np.reshape(sentiment_train, (-1, 1))
sentiment_train = sentiment_train.T
feature_matrix_train = feature_matrix_train.T

In [84]:
print feature_matrix_train.shape
print sentiment_train.shape

(95, 45000)
(1, 45000)


In [85]:
sentiment_valid = np.reshape(sentiment_valid, (-1, 1))
sentiment_valid = sentiment_valid.T
feature_matrix_valid = feature_matrix_valid.T

print feature_matrix_valid.shape
print sentiment_valid.shape

(95, 5000)
(1, 5000)


In [86]:
#Helper method for gradient descent algorithm

def sigmoid(Z):
    
    
    # Computing P(y_i = +1 | x_i, w) using the link function
    #predictions = 1 /(1+ np.exp(-Z))  #predictions in probabilities
    
    return 1 /(1+ np.exp(-Z))


def layer_sizes(X, Y):
    n_x = len(X) # size of input layer
    n_h = 4
    n_y = len(Y) # size of output layer
    return (n_x, n_h, n_y)


def initialize_parameters(n_x, n_h, n_y):
    
    np.random.seed(2) # we set up a seed so that your output matches ours although the initialization is random.

    W1 = np.random.randn(n_h,n_x) * 0.01
    b1 = np.zeros((n_h,1))
    W2 = np.random.randn(n_y,n_h) * 0.01
    b2 = np.zeros((n_y,1))
    
    assert (W1.shape == (n_h, n_x))
    assert (b1.shape == (n_h, 1))
    assert (W2.shape == (n_y, n_h))
    assert (b2.shape == (n_y, 1))
    
    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2}
    
    return parameters

def forward_propagation(X, parameters):
    
    # Retrieve each parameter from the dictionary "parameters"
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    
    # Implement Forward Propagation to calculate A2 (probabilities)
    Z1 = np.dot(W1, X) + b1
    A1 = np.tanh(Z1)
    
    Z2 = np.dot(W2,A1) + b2
    A2 = sigmoid(Z2)
    
    assert(A2.shape == (1, X.shape[1]))
    
    cache = {"Z1": Z1,
             "A1": A1,
             "Z2": Z2,
             "A2": A2}
    
    return A2, cache


def compute_cost(A2, Y, parameters):
    
    m = Y.shape[1] # number of example

    # Compute the cross-entropy cost
    logprobs = np.dot(np.log(A2), Y.T) + np.dot(np.log(1 - A2), 1-Y.T)
    cost = -1/m * np.sum(logprobs)
    
    cost = np.squeeze(cost)     # makes sure cost is the dimension we expect. 
    assert(isinstance(cost, float))
    return cost

def backward_propagation(parameters, cache, X, Y):
    m = X.shape[1]
    
    # First, retrieve W1 and W2 from the dictionary "parameters".
    W1 = parameters['W1']
    W2 = parameters['W2']
        
    # Retrieve also A1 and A2 from dictionary "cache".
    A1 = cache['A1']
    A2 = cache['A2']
    
    # Backward propagation: calculate dW1, db1, dW2, db2. 
    dZ2 = A2-Y

    dW2 = 1/m * np.dot(dZ2,A1.T)
    db2 = 1/m * np.sum(dZ2, axis=1, keepdims = True) # sum all cols in a row
    
    dZ1 = np.multiply( np.dot(W2.T, dZ2)  , (1 - np.power(A1, 2)) ) 
    dW1 = 1/m * np.dot(dZ1, X.T)
    db1 = 1/m * np.sum(dZ1, axis=1, keepdims = True)
    
    grads = {"dW1": dW1,
             "db1": db1,
             "dW2": dW2,
             "db2": db2}
    return grads

def update_parameters(parameters, grads, learning_rate = 1.2):
    # Retrieve each parameter from the dictionary "parameters"
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    
    # Retrieve each gradient from the dictionary "grads"
    dW1 = grads["dW1"]
    db1 = grads["db1"]
    dW2 = grads["dW2"]
    db2 = grads["db2"]
    
    # Update rule for each parameter
    W1 = W1 - learning_rate*dW1
    b1 = b1 - learning_rate*db1
    W2 = W2 - learning_rate*dW2
    b2 = b2 - learning_rate*db2
    
    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2}
    return parameters


def nn_model(X, Y, n_h, num_iterations = 10000, print_cost=False):
    
    np.random.seed(3)
    n_x = layer_sizes(X, Y)[0]
    n_y = layer_sizes(X, Y)[2]
    
    # Initialize parameters, then retrieve W1, b1, W2, b2. Inputs: "n_x, n_h, n_y". Outputs = "W1, b1, W2, b2, parameters". 
    parameters = initialize_parameters(n_x, n_h, n_y)
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    
    # Loop (gradient descent)
    for i in range(0, num_iterations):
         
        # Forward propagation. Inputs: "X, parameters". Outputs: "A2, cache".
        A2, cache = forward_propagation(X, parameters)
        
        # Cost function. Inputs: "A2, Y, parameters". Outputs: "cost".
        cost = compute_cost(A2, Y, parameters)
 
        # Backpropagation. Inputs: "parameters, cache, X, Y". Outputs: "grads".
        grads = backward_propagation(parameters, cache, X, Y)
 
        # Gradient descent parameter update. Inputs: "parameters, grads". Outputs: "parameters".
        parameters = update_parameters(parameters, grads)
        
        
        # Print the cost every 1000 iterations
        if print_cost and i % 1000 == 0:
            print ("Cost after iteration %i: %f" %(i, cost))

    return parameters

def predict(parameters, X):
    # Computes probabilities using forward propagation, and classifies to 0/1 using 0.5 as the threshold.
    A2, cache = forward_propagation(X, parameters)
    predictions = (A2>=0.5)
    return predictions


In [87]:
parameters = nn_model(feature_matrix_train, sentiment_train, n_h = 8, num_iterations = 1000, print_cost=True)

Cost after iteration 0: 0.693159


In [88]:
# Print accuracy
predictions = predict(parameters, feature_matrix_train)
print ('Accuracy: %d' % float((np.dot(sentiment_train,predictions.T) + np.dot(1-sentiment_train,1-predictions.T))/float(sentiment_train.size)*100) + '%')


predictions = predict(parameters, feature_matrix_valid)
print ('Accuracy: %d' % float((np.dot(sentiment_valid,predictions.T) + np.dot(1-sentiment_valid,1-predictions.T))/float(sentiment_valid.size)*100) + '%') 



Accuracy: 69%
Accuracy: 69%
