# Logistic Regression for Sentiment Analysis

In [1]:
import pandas as pd
import numpy as np


#from nltk.stem.porter import PorterStemmer
import re
import nltk
import string
import gensim
from gensim import corpora, models, similarities

#to convert html to text
from bs4 import BeautifulSoup

#dealing with operating system , like reading file
import os

#from nltk.corpus import stopwords
from __future__ import division

from sklearn.feature_extraction.text import HashingVectorizer


### Getting a tokenized corpus for a word2vec model

In [2]:
df = pd.read_csv('shuffled_movie_data.csv')

#SOME CLEANING
def remove_punctuation(text):    
    text = BeautifulSoup(text).get_text().encode('ascii','ignore')
    return text.translate(None, string.punctuation) 

# Remove punctuation.
df['review_clean'] = df['review'].apply(remove_punctuation)



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [3]:
# Create the corpus
corpus = df['review_clean'].values.tolist()

# Tokenize corpus
tok_corpus = [nltk.word_tokenize(sent.decode('utf-8')) for sent in corpus ]


In [4]:
# sanity check
print (corpus[1])
print (tok_corpus[1])

OK so I really like Kris Kristofferson and his usual easy going delivery of lines in his movies Age has helped him with his soft spoken low energy style and he will steal a scene effortlessly But Disappearance is his misstep Holy Moly this was a bad movie I must give kudos to the cinematography and and the actors including Kris for trying their darndest to make sense from this goofy confusing story None of it made sense and Kris probably didnt understand it either and he was just going through the motions hoping someone would come up to him and tell him what it was all about I dont care that everyone on this movie was doing out of love for the project or some such nonsense Ive seen low budget movies that had a plot for goodness sake This had none zilcho nada zippo empty of reason a complete waste of good talent scenery and celluloid I rented this piece of garbage for a buck and I want my money back I want my 2 hours back I invested on this Grade F waste of my time Dont watch this movie

### Train a word2vec model (using gensim library)

In [5]:
import multiprocessing

# Dimensionality of the resulting word vectors.
#more dimensions, more computationally expensive to train
#but also more accurate
#more dimensions = more generalized
num_features = 400

# Minimum word count threshold.
min_word_count = 12      #  ignore all words with total frequency lower than this.

# Number of threads to run in parallel.
num_workers = multiprocessing.cpu_count()

# Context window length.
#window is the maximum distance between the current and predicted word within a sentence.
context_size = 7

# Downsample setting for frequent words.
#rate 0 and 1e-5 
#how often to use
downsampling = 1e-3

# Seed for the RNG, to make the results reproducible.
seed = 1

In [6]:
reviews2vec_model = gensim.models.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling,
)

In [8]:
reviews2vec_model.build_vocab(tok_corpus)


RuntimeError: cannot sort vocabulary after model weights already initialized.

In [10]:
print("Word2Vec vocabulary length:", len(reviews2vec_model.wv.vocab))

('Word2Vec vocabulary length:', 30092)


In [11]:
print (reviews2vec_model.corpus_count)

50000


In [12]:
reviews2vec_model.train(tok_corpus, total_examples=reviews2vec_model.corpus_count, epochs=reviews2vec_model.iter)

41659898

In [13]:
#test
reviews2vec_model.most_similar('awful')

[(u'terrible', 0.8175176382064819),
 (u'atrocious', 0.7761964797973633),
 (u'horrendous', 0.7675600051879883),
 (u'horrible', 0.7653225660324097),
 (u'dreadful', 0.7593313455581665),
 (u'horrid', 0.7593281865119934),
 (u'lousy', 0.7511627674102783),
 (u'appalling', 0.7324734926223755),
 (u'Awful', 0.7061058282852173),
 (u'abysmal', 0.705527663230896)]

### save and load the word2vec model

In [14]:
#save model
if not os.path.exists("trained"):
    os.makedirs("trained")
reviews2vec_model.save(os.path.join("trained", "reviews2vec.w2v"))

## If you already have trained a model, you can start executing the lines from here

In [15]:
#load model
reviews2vec_model = gensim.models.Word2Vec.load(os.path.join("trained", "reviews2vec.w2v"))

### Preparing data 

In [16]:
#Split data 
#90% - train set,
#10% - validation set,

train_data, validation_data = np.split(df.sample(frac=1), [int(.9*len(df))])

In [17]:
print train_data.shape
print validation_data.shape
train_data.head()

(45000, 3)
(5000, 3)


Unnamed: 0,review,sentiment,review_clean
9991,"Were I not with friends, and so cheap, I would...",0,Were I not with friends and so cheap I would h...
3614,MPAA Rating PG-13<br /><br />My Rating: 10 and...,1,MPAA Rating PG13My Rating 10 and upMy Rating ...
42657,I am surprised that so many comments about thi...,0,I am surprised that so many comments about thi...
8420,I also saw The Last Stop at the Moving Picture...,1,I also saw The Last Stop at the Moving Picture...
44511,I clicked onto the Encore Mystery channel to w...,0,I clicked onto the Encore Mystery channel to w...


### Excercise 1: define new features

In [18]:
def getFeaturesFromReview( review, reviews2vec_model ):
    
    tok_review = nltk.word_tokenize( review )
    features = np.zeros( reviews2vec_model.vector_size )
    
    #for each word of the review, consult that word in the w2v_model, then add to the review2vec
    for word in tok_review:
        if word in reviews2vec_model:                        
            features +=  np.array( reviews2vec_model[word] )
        
    features /=  len( tok_review )  #normalize review
    
    return features


#test function
print "number of features:"
print( len(getFeaturesFromReview( df['review_clean'][1], reviews2vec_model )) )
print "\nfeatures for one review:"
print( getFeaturesFromReview( df['review_clean'][1], reviews2vec_model )[0:10] )

number of features:
400

features for one review:
[ 0.02156176  0.14134539  0.08880428  0.05256522 -0.14419097  0.20558562
 -0.16841365  0.20405753  0.06917345  0.1581469 ]


In [19]:
cadena = "featurue_" + str(1)
print len(train_data)
print len(validation_data)


45000
5000


In [20]:
def getFeatureMatrix(data_frame, reviews2vec_model):
    feature_matrix   = np.zeros( ( len(data_frame) , reviews2vec_model.vector_size ) )

    reviews = data_frame['review_clean'].values
    for i in range(len(data_frame)):
        feature_matrix[i] = getFeaturesFromReview(reviews[i], reviews2vec_model)
    
    return feature_matrix


feature_matrix_train = getFeatureMatrix(train_data, reviews2vec_model)
sentiment_matrix_train = train_data['sentiment'].values


    

In [21]:
print "observations x features"
print feature_matrix_train.shape
print feature_matrix_train[0]


observations x features
(45000, 400)
[ 0.0616405   0.12841867  0.10909981  0.06343006 -0.10798091  0.18266215
 -0.17067333  0.15067648  0.02028038  0.17144921  0.19480498  0.06771892
 -0.06612274 -0.16720054 -0.00759485 -0.04147195  0.06266785 -0.09646977
 -0.02220956 -0.12962679 -0.06481798  0.09190133  0.01777032  0.01433943
 -0.11267799  0.07886701  0.08487322 -0.09199788  0.09229028 -0.15855721
  0.02422404  0.10432643 -0.02578599  0.02992763  0.03873751 -0.06719865
 -0.01111012  0.03457657 -0.21048486 -0.06384608 -0.04244143  0.0778416
  0.00200225 -0.05728661  0.16785249 -0.07047689 -0.00187041  0.114737
  0.07178662 -0.01052806  0.16197565 -0.10114054  0.0865088  -0.10153684
 -0.04348475  0.18130604  0.1562086   0.16619596 -0.06502336  0.04683475
  0.00874115  0.11823475  0.02112846  0.03213234  0.10520442 -0.11300841
 -0.11355467  0.0761749  -0.06140949  0.06587857 -0.00774678  0.26487703
 -0.04014797  0.1716031   0.05950917 -0.00485175  0.00258972 -0.02121212
  0.10969694 -0.0

In [22]:
feature_matrix_valid = getFeatureMatrix(validation_data, reviews2vec_model)
sentiment_matrix_valid = validation_data['sentiment'].values


print "observations x features"
print feature_matrix_valid.shape
print feature_matrix_valid[0]

observations x features
(5000, 400)
[  1.01931002e-02   7.73581650e-02   8.47699401e-02   5.49387786e-02
  -1.07852798e-01   1.56354084e-01  -1.49090488e-01   1.57247423e-01
   3.79283632e-02   1.33099873e-01   1.26898337e-01   6.56414283e-02
  -5.73762854e-02  -8.58825491e-02  -3.65617679e-02  -7.05558592e-02
   6.23098300e-02  -7.30688211e-02  -5.72165984e-02  -5.54534699e-02
  -7.32261716e-02   5.17013417e-02  -1.31778424e-02  -3.71448484e-02
  -9.05236041e-02   8.78914643e-02   3.03008168e-02  -1.60629378e-02
   6.86010827e-02  -1.27573358e-01   3.91782224e-02   1.07194706e-01
  -7.90004530e-02   1.40557669e-02   3.99485763e-02  -3.28727923e-02
   4.79369051e-03   4.19837957e-02  -1.54084905e-01  -6.65031556e-02
  -7.02485679e-02   8.90962751e-02   4.23207929e-02  -6.83618520e-02
   1.51499883e-01  -6.29515199e-02  -2.38640371e-02   8.57518937e-02
   5.61278428e-02  -1.49770049e-02   9.57415580e-02  -8.97936206e-02
   8.37957609e-02  -4.00362706e-02  -5.54858274e-02   1.10331281e-0

In [23]:
print sentiment_matrix_valid.shape

(5000,)


In [25]:

sentiment_matrix_train = np.reshape(sentiment_matrix_train, (-1, 1))
sentiment_matrix_train = sentiment_matrix_train.T
feature_matrix_train = feature_matrix_train.T

sentiment_matrix_valid = np.reshape(sentiment_matrix_valid, (-1, 1))
sentiment_matrix_valid = sentiment_matrix_valid.T
feature_matrix_valid = feature_matrix_valid.T



In [26]:
print "features x observations"
print feature_matrix_valid.shape
print feature_matrix_train.shape

print sentiment_matrix_valid.shape
print sentiment_matrix_train.shape

features x observations
(400, 5000)
(400, 45000)
(1, 5000)
(1, 45000)


### Excercise 2: implement a MaxEnt classifier, using regularization, according this https://web.stanford.edu/~jurafsky/slp3/7.pdf

With the sigmoid function I get better results than using the probability function of the paper

In [27]:
#Helper method for neural network classifier with gradient descent 

def sigmoid(Z):
    return 1 /(1+ np.exp(-Z))

def initialize_parameters(n_x, n_h, n_y):    
    np.random.seed(1)
    W1 = np.random.randn(n_h,n_x) * 0.01
    b1 = np.zeros((n_h,1))
    W2 = np.random.randn(n_y,n_h) * 0.01
    b2 = np.zeros((n_y,1))
    
    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2}
    
    return parameters

def forward_propagation(X, parameters):
    # Retrieving each parameter from the dictionary "parameters"
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    
    # Forward Propagation to calculate A2 (probabilities)
    Z1 = np.dot(W1, X) + b1
    A1 = np.tanh(Z1)
    
    Z2 = np.dot(W2,A1) + b2
    A2 = sigmoid(Z2)
    
    cache = {"Z1": Z1,
             "A1": A1,
             "Z2": Z2,
             "A2": A2}
    
    return A2, cache

def compute_cost(A2, Y, parameters):
    
    m = Y.shape[1] # number of example

    # Compute the cross-entropy cost
    logprobs = np.dot(np.log(A2), Y.T) + np.dot(np.log(1 - A2), 1-Y.T)
    cost = -1/m * np.sum(logprobs)
    
    cost = np.squeeze(cost)     # makes sure cost is the dimension we expect. 
    assert(isinstance(cost, float))
    return cost

def backward_propagation(parameters, cache, X, Y):
    
    m = X.shape[1]
    
    # First, retrieve W1 and W2 from the dictionary "parameters".
    W1 = parameters['W1']
    W2 = parameters['W2']
        
    # Retrieve also A1 and A2 from dictionary "cache".
    A1 = cache['A1']
    A2 = cache['A2']
    
    # Backward propagation: calculate dW1, db1, dW2, db2. 
    dZ2 = A2-Y

    dW2 = 1/m * np.dot(dZ2,A1.T)
    db2 = 1/m * np.sum(dZ2, axis=1, keepdims = True) # sum all cols in a row
    
    dZ1 = np.multiply( np.dot(W2.T, dZ2)  , (1 - np.power(A1, 2)) ) 
    dW1 = 1/m * np.dot(dZ1, X.T)
    db1 = 1/m * np.sum(dZ1, axis=1, keepdims = True)
    
    grads = {"dW1": dW1,
             "db1": db1,
             "dW2": dW2,
             "db2": db2}
    return grads

def update_parameters(parameters, grads, learning_rate = 1.2):
    # Retrieve each parameter from the dictionary "parameters"
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    
    # Retrieve each gradient from the dictionary "grads"
    dW1 = grads["dW1"]
    db1 = grads["db1"]
    dW2 = grads["dW2"]
    db2 = grads["db2"]
    
    # Update rule for each parameter
    W1 = W1 - learning_rate*dW1
    b1 = b1 - learning_rate*db1
    W2 = W2 - learning_rate*dW2
    b2 = b2 - learning_rate*db2
    
    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2}
    return parameters


def nn_model(X, Y, n_h, learning_rate, num_iterations = 10000, print_cost=False):
    
    np.random.seed(3)
    n_x = len(X)
    n_y = len(Y)
    
    # Initialize parameters and retrieve W1, b1, W2, b2. Inputs: "n_x, n_h, n_y". Outputs = "W1, b1, W2, b2, parameters". 
    parameters = initialize_parameters(n_x, n_h, n_y)
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    
    # Loop (gradient descent)
    for i in range(0, num_iterations):
         
        # Forward propagation. Inputs: "X, parameters". Outputs: "A2, cache".
        A2, cache = forward_propagation(X, parameters)
        
        # Cost function. Inputs: "A2, Y, parameters". Outputs: "cost".
        cost = compute_cost(A2, Y, parameters)
 
        # Backpropagation. Inputs: "parameters, cache, X, Y". Outputs: "grads".
        grads = backward_propagation(parameters, cache, X, Y)
 
        # Gradient descent parameter update. Inputs: "parameters, grads". Outputs: "parameters".
        parameters = update_parameters(parameters, grads, learning_rate)
        
        
        # Print the cost every 500 iterations
        if print_cost and i % 500 == 0:
            print ("Cost after iteration %i: %f" %(i, cost))

    return parameters

def predict(parameters, X):    
    # Computes probabilities using forward propagation, and classifies to 0/1 using 0.5 as the threshold.
    A2, cache = forward_propagation(X, parameters)
    predictions = (A2>=0.5)
    return predictions




In [28]:



parameters = nn_model(feature_matrix_train, sentiment_matrix_train, n_h = 8, learning_rate =1.3, num_iterations = 10000, print_cost=True)

Cost after iteration 0: 0.693132
Cost after iteration 500: 0.549645
Cost after iteration 1000: 0.460085
Cost after iteration 1500: 0.409217
Cost after iteration 2000: 0.379224
Cost after iteration 2500: 0.360313
Cost after iteration 3000: 0.347661
Cost after iteration 3500: 0.338789
Cost after iteration 4000: 0.332379
Cost after iteration 4500: 0.327697
Cost after iteration 5000: 0.323957
Cost after iteration 5500: 0.320704
Cost after iteration 6000: 0.317831
Cost after iteration 6500: 0.315287
Cost after iteration 7000: 0.313029
Cost after iteration 7500: 0.311019
Cost after iteration 8000: 0.309225
Cost after iteration 8500: 0.307620
Cost after iteration 9000: 0.306177
Cost after iteration 9500: 0.304875


### Exercise 3: compare  with your Neural Network

El accuracy obtenido usando sklearn.linear_model.SGDClassifier es de: 0.867

El accuracy obtenido usando una red neuronal con una capa oculta es;

In [32]:
# Print accuracy
print("Accuracy on training set")
predictions = predict(parameters, feature_matrix_train)
print ('Accuracy: %f' % float((np.dot(sentiment_matrix_train,predictions.T) + np.dot(1-sentiment_matrix_train,1-predictions.T))/float(sentiment_matrix_train.size)*100) + '%')

print("\nAccuracy on validation set")
predictions = predict(parameters, feature_matrix_valid)
print ('Accuracy: %f' % float((np.dot(sentiment_matrix_valid,predictions.T) + np.dot(1-sentiment_matrix_valid,1-predictions.T))/float(sentiment_matrix_valid.size)*100) + '%') 



Accuracy on training set
Accuracy: 87.115556%

Accuracy on validation set
Accuracy: 87.340000%
