#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os, sys
import random
import logging
import pickle
import pandas as pd
#import nltk
from bs4 import BeautifulSoup
import numpy as np
from sklearn.linear_model import SGDClassifier
from settings import size, dmUnlabeled, dbowUnlabeled,\
    dmLabeled, dbowLabeled, useModifiedModule, testRes, trainedClassifer
if useModifiedModule:
    from doc2vec import Doc2Vec
    from doc2vec import LabeledSentence
else:
    from gensim.models.doc2vec import Doc2Vec
    from gensim.models.doc2vec import LabeledSentence
#from sklearn.preprocessing import scale
'''
Word2Vec is excellent at extracting similarities between words represented in vector space,
but to do useful things with a paragraph (typically several sentences of roughly 30-50 words),
we don't want to simply average the word vectors, since that would lose the context and word order.
The state-of-the-art algorithm exploits a neural model,
quote from (gensim.models.doc2vec):
"via the distributed memory and distributed bag of words models from
[1]_, using either hierarchical softmax or negative sampling [2]_ [3]_."
The main steps of learning the distributed representations are:
1. read in and generate unlabeled training data, labeled training data and test data
2. clean up and labelize the data for feeding into doc2vec
3. init one doc2vec model with DM and one doc2vec model with DBOW
4. train DM and DBOW on the unlabeled data for 10 epochs and save the models to disk
5. train DM and DBOW on the labeled data for 10 epochs and save the models to disk
6. train DM and DBOW on the test data, but with the hidden weights and word vector representations fixed
7. save the models to disk
8. extract the document vectors for the labeled training data and test data from the two models by concatenation
My implementation is largely inspired by the ideas from [1] and [2]
.. [1] Quoc Le and Tomas Mikolov. Distributed Representations of Sentences and Documents. http://arxiv.org/pdf/1405.4053v2.pdf
.. [2] Michael Czerny. Modern Methods for Sentiment Analysis. https://districtdatalabs.silvrback.com/modern-methods-for-sentiment-analysis
'''
# gensim's doc2vec API requires that each paragraph carry an identifier as its label;
# this label can be generated as LABEL_i where i is just the index
def labelizeData(reviews, label):
    res = []
    for idx, review in enumerate(reviews):
        res.append(LabeledSentence(review, ["%s_%d" % (label, idx)]))
    return np.array(res)
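#Example (a sketch of the expected behavior): labelizeData([['good', 'movie']], "TRAIN")
#returns an array holding one LabeledSentence with words ['good', 'movie'] and the
#single label "TRAIN_0", which doc2vec will learn a document vector for.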
def cleanUpText(review):
    #this list is by no means exhaustive
    punctuation = """.,?!:;(){}[]"""
    #remove html tags
    review_text = BeautifulSoup(review, "html.parser").get_text()
    #replace '\n' with ' ' so words on adjacent lines don't get glued together
    review_text = review_text.replace('\n', ' ')
    #treat each punctuation mark as an individual word
    for c in punctuation:
        review_text = review_text.replace(c, " %s " % c)
    return review_text.split()
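#Example (assumed behavior): cleanUpText("<b>Great movie!</b>")
#-> ['Great', 'movie', '!'] after tag stripping and punctuation splitting.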
'''
#alternative clean-up that tokenizes with nltk instead of manual splitting
def cleanUpText(review):
    #remove html tags
    review_text = BeautifulSoup(review, "html.parser").get_text()
    #replace '\n' with ' '
    review_text = review_text.replace('\n', ' ')
    #tokenize words
    return nltk.word_tokenize(review_text)
'''
def extractDocVec(model, labeledSentences, size):
    #look up the learned document vector for each sentence's label
    vecs = [np.array(model[ls.labels[0]]).reshape(1, size) for ls in labeledSentences]
    return np.concatenate(vecs)
def getPredictionFeature(model1, model2, x, size):
    return np.hstack((extractDocVec(model1, x, size), extractDocVec(model2, x, size)))
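#Note: stacking the DM and DBOW document vectors side by side yields a feature
#matrix of shape (len(x), 2 * size); combining the two models' vectors is the
#approach suggested in [1].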
def flushLoggerInfo():
    #route logging output (e.g. gensim training progress) to stdout
    root = logging.getLogger()
    root.setLevel(logging.INFO)
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    root.addHandler(ch)
if __name__ == "__main__":
    flushLoggerInfo()
    #step 1: read in and generate the datasets
    #the code for reading the files is borrowed from the kaggle word2vec tutorial
    #https://github.com/wendykan/DeepLearningMovies/blob/master/Word2Vec_AverageVectors.py
    train = pd.read_csv(os.path.join(os.path.dirname(__file__),\
        'data', 'labeledTrainData.tsv'), header=0, delimiter="\t", quoting=3)
    test = pd.read_csv(os.path.join(os.path.dirname(__file__),\
        'data', 'testData.tsv'), header=0, delimiter="\t", quoting=3)
    unlabeled_train = pd.read_csv(os.path.join(os.path.dirname(__file__),\
        'data', 'unlabeledTrainData.tsv'), header=0, delimiter="\t", quoting=3)
    y_train = np.array(train['sentiment'])
    #step 2: clean up the data
    x_train = [cleanUpText(x) for x in train['review']]
    x_test = [cleanUpText(x) for x in test['review']]
    x_unlabeled = [cleanUpText(x) for x in unlabeled_train['review']]
    #labelize the data so it can be fed into the doc2vec models
    x_train = labelizeData(x_train, "TRAIN")
    x_test = labelizeData(x_test, "TEST")
    x_unlabeled = labelizeData(x_unlabeled, "UNLABELED")
    #sanity check: the kaggle IMDB dataset has 25k labeled train, 25k test and 50k unlabeled reviews
    assert(len(x_train) == 25000 and len(x_test) == 25000 \
        and len(x_unlabeled) == 50000)
    #step 3: init the doc2vec models
    if os.path.isfile(dmUnlabeled) and os.path.isfile(dbowUnlabeled):
        model_dm = Doc2Vec.load(dmUnlabeled)
        model_dbow = Doc2Vec.load(dbowUnlabeled)
    else:
        #instantiate our DM model (and a DBOW model) using hierarchical softmax
        model_dm = Doc2Vec(min_count=3, window=10, size=size,\
            workers=4, alpha=0.025, train_lbls=False)
        model_dbow = Doc2Vec(min_count=3, window=10, size=size,\
            dm=0, workers=4, alpha=0.025, train_lbls=False)
        #build the vocabulary over all datasets
        model_dm.build_vocab(np.concatenate((x_train, x_test, x_unlabeled)))
        model_dbow.build_vocab(np.concatenate((x_train, x_test, x_unlabeled)))
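        #building the vocab over train + test + unlabeled up front means every
        #word (and document label) seen later already has a slot in the model,
        #so the test reviews can be trained in step 6 without a vocab rebuild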
        #step 4: train on the unlabeled data for 10 epochs
        for epoch in range(10):
            #shuffle the input data
            perm = np.random.permutation(x_unlabeled.shape[0])
            model_dbow.train(x_unlabeled[perm])
            model_dm.train(x_unlabeled[perm])
            #reset the learning rate to its initial value
            model_dm.alpha = 0.025
            model_dbow.alpha = 0.025
        #save to disk so later runs can skip the unlabeled pass
        model_dm.save(dmUnlabeled)
        model_dbow.save(dbowUnlabeled)
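    #note: with the pre-1.0 gensim API assumed throughout this script, each
    #train() call makes a single pass over the corpus and decays alpha internally,
    #so resetting model.alpha between epochs gives every epoch the same schedule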
    if os.path.isfile(dmLabeled) and os.path.isfile(dbowLabeled):
        model_dm = Doc2Vec.load(dmLabeled)
        model_dbow = Doc2Vec.load(dbowLabeled)
    else:
        #step 5: train on the labeled data
        model_dm.train_lbls = True
        model_dbow.train_lbls = True
        #we pass through the data set multiple times, shuffling the training reviews each time to improve accuracy
        for epoch in range(10):
            #shuffle the input data
            perm = np.random.permutation(x_train.shape[0])
            model_dm.train(x_train[perm])
            model_dbow.train(x_train[perm])
            #reset the learning rate to its initial value every four epochs
            if epoch % 4 == 0:
                model_dm.alpha = 0.025
                model_dbow.alpha = 0.025
        x_test = np.array(x_test)
        #step 6: train on the test data
        #reset the learning rate for the test set
        model_dm.alpha = 0.025
        model_dbow.alpha = 0.025
        #freeze the hidden weights and word vectors: only the test document vectors are inferred
        model_dm.train_words = False
        model_dbow.train_words = False
        #train over the test set
        for epoch in range(10):
            print('################test set %s' % epoch)
            #shuffle the input data
            perm = np.random.permutation(x_test.shape[0])
            model_dm.train(x_test[perm])
            model_dbow.train(x_test[perm])
            #reset the learning rate to its initial value every four epochs
            if epoch % 4 == 0:
                model_dbow.alpha = 0.025
                model_dm.alpha = 0.025
        #step 7: save the models
        model_dm.save(dmLabeled)
        model_dbow.save(dbowLabeled)
    #step 8: get the feature vectors by concatenating the doc vectors from model_dm and model_dbow
    train_vecs = getPredictionFeature(model_dm, model_dbow, x_train, size)
    test_vecs = getPredictionFeature(model_dm, model_dbow, x_test, size)
    #use a logistic regression model trained with SGD and l1 regularization
    lr = SGDClassifier(loss='log', penalty='l1')
    #(optional) scale both the train and test input data before fitting
    lr.fit(train_vecs, y_train)
    #run the test cases and save the test results and the model
    result = lr.predict(test_vecs)
    #print(result[:1000])
    #output = pd.DataFrame(data={"id":test["id"], "sentiment":result})
    #output.to_csv(testRes, index=False, quoting=3)
    with open(trainedClassifer, 'wb') as f:
        pickle.dump(lr, f)
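#To reuse the pickled classifier later, a minimal sketch (assumes the same
#settings module is importable and feature vectors were built as above):
# with open(trainedClassifer, 'rb') as f:
#     lr = pickle.load(f)
# predictions = lr.predict(test_vecs)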