docVecTrainExperiment.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os, sys
import random
import logging

from doc2vecExperiment import Doc2Vec
import pandas as pd
from doc2vecExperiment import LabeledSentence
from bs4 import BeautifulSoup
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import train_test_split
#from sklearn.preprocessing import scale


'''
So as to obtain meaningful comparison between using my experimental doc2vec module and
using the original open source module. 

1, I first use the word vectors trained with the cython docvec from 
UNLABELED data.

2, I train document vector using LABELED data and TEST data given our new (python) doc2vec 
implementation. 

To compare the results, I feed the concatenated document vector from DM and DBOW models into 
traditional ML classifier.


The main steps of learning distributed representations are:

1, read in and generate the unlabeled training data, the labeled training data and the test data
2, clean up and labelize the data so they can be fed into doc2vec
3, load doc2vec with the DM model and the DBOW model from the trained unlabeled model 
4, train DM and DBOW on the labeled data for 10 epochs
5, train DM and DBOW on the test data, keeping the hidden weights and word vector representations fixed
6, save the models to disk
7, extract the document vectors for the labeled training data and the test data from the models using concatenation

'''


from settings import size, alpha, dmUnlabeled, dbowUnlabeled, expDmLabeled, expDbowLabeled

# gensim's doc2vec API requires each paragraph to carry an identifier as its label.
# Here the label is "<prefix> <i>" (e.g. "TRAIN 0"), where i is simply the review's index.
# The code is borrowed from [4].
def labelizeData(reviews, label, sentiments = None):
    """Wrap every tokenised review in a LabeledSentence and return them as an array.

    Each review gets a single identifying label of the form "<label> <index>"
    (for example "TRAIN 0"). When `sentiments` is given, the sentiment found at
    the same index is passed along as a one-element list in the third argument
    of LabeledSentence; otherwise only the words and the label are passed.

    reviews    -- iterable of token lists
    label      -- prefix used to build the per-review identifier
    sentiments -- optional indexable collection parallel to `reviews`

    Returns a numpy array built from the list of LabeledSentence objects.
    """
    if sentiments is None:
        tagged = [
            LabeledSentence(words, ["%s %d"%(label, position)], )
            for position, words in enumerate(reviews)
        ]
    else:
        tagged = [
            LabeledSentence(words, ["%s %d"%(label, position)], [sentiments[position]])
            for position, words in enumerate(reviews)
        ]
    return np.array(tagged)

def cleanUpText(review):
    """Turn one raw review string into a list of tokens.

    The markup is stripped with BeautifulSoup, newline characters are deleted,
    every punctuation mark listed below is surrounded by spaces so that it
    becomes a token of its own, and the result is split on whitespace.
    """
    # NOTE(review): no parser is passed to BeautifulSoup, so the choice of
    # parser is left to bs4 -- consider pinning one for reproducibility.
    text = BeautifulSoup(review).get_text()
    # newlines are removed outright (not replaced by a space)
    text = text.replace('\n', '')

    # deliberately small set of marks; this is by no means exhaustive
    marks = """.,?!:;(){}[]"""
    for mark in marks:
        padded = " %s "%mark
        text = text.replace(mark, padded)

    tokens = text.split()
    return tokens

def extracDocVec(model, labeledSentences, size):
    """Collect the document vector of every labeled sentence into one 2-D array.

    For each item, the vector is looked up in `model` under the item's first
    label, reshaped to a single row of length `size`, and the rows are then
    concatenated in input order.

    Returns an array with one row per labeled sentence and `size` columns.
    """
    rows = []
    for sentence in labeledSentences:
        doc_key = sentence.labels[0]
        row = np.array(model[doc_key]).reshape(1, size)
        rows.append(row)
    return np.concatenate(rows)

def getPredictionFeature(model1, model2, x, size):
    """Build classifier features by joining the document vectors of two models.

    The vectors extracted from `model1` and from `model2` for the labeled
    sentences in `x` are stacked side by side, so each row holds the
    `model1` vector followed by the `model2` vector.
    """
    left_block = extracDocVec(model1, x, size)
    right_block = extracDocVec(model2, x, size)
    return np.hstack((left_block, right_block))

def saveTestResult(lr, test_vecs, ids=None, out_path=None):
    """Predict sentiments for `test_vecs` and write them to a CSV file.

    lr        -- fitted classifier exposing `predict(test_vecs)`
    test_vecs -- feature rows to classify
    ids       -- optional identifiers, one per row of `test_vecs`, written to
                 the "id" column. When omitted, the function falls back to the
                 module-level `test["id"]`, as the original code did.
    out_path  -- optional destination file. Defaults to
                 data/word2vec_sgdlog1.csv next to this script.

    Raises NameError when `ids` is omitted and no module-level `test` frame
    exists (the statement that would load it is commented out in this script).
    """
    if ids is None:
        # Backward-compatible fallback on the global `test` data frame; fail
        # with an actionable message instead of a bare "name 'test' is not defined".
        try:
            ids = test["id"]
        except NameError:
            raise NameError("saveTestResult: pass `ids` explicitly or define a "
                            "module-level `test` data frame with an 'id' column")
    if out_path is None:
        out_path = os.path.join(os.path.dirname(__file__), 'data', 'word2vec_sgdlog1.csv')

    result = lr.predict(test_vecs)
    output = pd.DataFrame( data={"id":ids, "sentiment":result} )
    # quoting=3 is csv.QUOTE_NONE: fields are written without added quotes
    output.to_csv(out_path, index=False, quoting=3)


# Script entry point: load the review data, clean and labelize it, initialise the
# experimental DM / DBOW doc2vec models from the pre-trained unlabeled models,
# train them on the train and test splits (or load previously saved versions),
# and finally score a linear classifier on the concatenated document vectors.
if __name__ == "__main__":
    
    
    #send log records of level INFO and above from the root logger to stdout
    root = logging.getLogger()
    root.setLevel(logging.INFO)

    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    root.addHandler(ch)
    
    #step 1: read in and generate the datasets
    #the files are looked up in the 'data' directory next to this script
    #the code for reading files is borrowed from the kaggle word2vec tutorial
    #https://github.com/wendykan/DeepLearningMovies/blob/master/Word2Vec_AverageVectors.py 
    train = pd.read_csv(os.path.join(os.path.dirname(__file__),\
     'data', 'labeledTrainData.tsv'), header=0, delimiter="\t", quoting=3 )
    #the official test file is not loaded; note that saveTestResult reads a
    #module-level `test`, which only exists if the next line is re-enabled
    #test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'), header=0, delimiter="\t", quoting=3 )
    unlabeled_train = pd.read_csv(os.path.join(os.path.dirname(__file__),\
     'data', 'unlabeledTrainData.tsv'), header=0,  delimiter="\t", quoting=3 )
    
    #to make the evaluation easier, we split 'train' 4:1 into training and testing parts
    #NOTE(review): no random_state is passed, so the split presumably differs from
    #run to run -- confirm against the scikit-learn version in use
    x_train, x_test, y_train, y_test = train_test_split(np.array(train['review']),\
     np.array(train['sentiment']), test_size=0.2)    

    #step 2: clean up data (each review becomes a list of tokens)
    x_train = [ cleanUpText(x) for x in x_train ]
    x_test = [ cleanUpText(x) for x in  x_test ]
    x_unlabeled = [ cleanUpText(x) for x in unlabeled_train['review'] ]

    #labelize the data so as to feed them into the doc2vec model
    #the difference with the old doc2vec is that the new one incorporates binary sentiment
    #classification into the unsupervised learning 
    #only the training split is given its sentiments; TEST and UNLABELED carry none
    x_train = labelizeData(x_train, "TRAIN", y_train)
    x_test = labelizeData(x_test, "TEST")
    x_unlabeled = labelizeData(x_unlabeled, "UNLABELED")

    #sanity check on the expected dataset sizes (25000 labeled reviews split 4:1,
    #plus 50000 unlabeled reviews)
    assert(len(x_train) == 20000 and len(x_test) == 5000 \
        and len(x_unlabeled) == 50000)
    #step 3: init doc2vec models; the pre-trained unlabeled models must exist on disk
    if os.path.isfile(dmUnlabeled) and os.path.isfile(dbowUnlabeled):
        #this is a complete hack: first load the gensim.models.doc2vec models from disk
        model_dm0 = Doc2Vec.load(dmUnlabeled)   
        model_dbow0 = Doc2Vec.load(dbowUnlabeled)
        # then copy the weights to my own implemented doc2vec
        # (dm=0 on the second model selects DBOW -- presumably as in gensim; the
        # first model relies on the constructor's default for `dm`)
        model_dm = Doc2Vec(min_count=3, window = 10, size=size,\
         workers=4, alpha = 0.025, train_lbls = False)
        model_dbow = Doc2Vec(min_count=3, window=10, size=size,\
         dm=0, workers=4, alpha = 0.025, train_lbls = False)
        model_dm.copyWeights(model_dm0)
        model_dbow.copyWeights(model_dbow0)
        #drop the references to the loaded source models so that they can be
        #garbage-collected
        del model_dm0
        del model_dbow0
    else:
        print 'unlabeled trained model is not found'
        sys.exit(1)

    #step 4: reuse previously saved labeled models when both files exist; in that
    #case the models initialised in step 3 are replaced by the loaded ones
    #NOTE(review): document labels are index-based ("TRAIN i" / "TEST i") while the
    #split above is drawn anew on every run, so vectors loaded here may belong to
    #different reviews than the current x_train / x_test -- verify
    if os.path.isfile(expDmLabeled) and os.path.isfile(expDbowLabeled):
        model_dm = Doc2Vec.load(expDmLabeled) 
        model_dbow = Doc2Vec.load(expDbowLabeled)
    else:    
        #step 5: train on the labeled data, with label training switched on
        model_dm.train_lbls = True
        model_dbow.train_lbls = True
        #We pass through the data set multiple times, shuffling the training reviews each time to improve accuracy.
        for epoch in range(10):
            #shuffle the input data; both models see the same permutation
            perm = np.random.permutation(x_train.shape[0])
            model_dm.train(x_train[perm])
            model_dbow.train(x_train[perm])

            #reset the learning rate to its initial value after every even-numbered
            #epoch (0, 2, 4, ...); presumably train() changes alpha -- TODO confirm
            if epoch % 2 == 0:
                model_dm.alpha = 0.025
                model_dbow.alpha = 0.025
        
        #x_test is already the array returned by labelizeData; it is re-wrapped here
        x_test = np.array(x_test)
        
        #step 6: train on the test data
        #reset the learning rate for test set
        model_dm.alpha = 0.025
        model_dbow.alpha = 0.025
        #don't train over hidden weights nor word vectors
        model_dm.train_words = False
        model_dbow.train_words = False
        
        #train over test set
        for epoch in range(10):
            print '################test set %s' % epoch
            #shuffle the input data; both models see the same permutation
            perm = np.random.permutation(x_test.shape[0])
            model_dm.train(x_test[perm])
            model_dbow.train(x_test[perm])
            #reset the learning rate to its initial value after epochs 0 and 5
            if epoch % 5 == 0:
                model_dbow.alpha = 0.025
                model_dm.alpha = 0.025
        
        #step 7: save the models so that later runs can skip the training above
        model_dm.save(expDmLabeled)
        model_dbow.save(expDbowLabeled)

    #step 8: get the feature vectors by concatenating the doc vectors from model_dm and model_dbow
    train_vecs = getPredictionFeature(model_dm, model_dbow, x_train, size)     
    test_vecs = getPredictionFeature(model_dm, model_dbow, x_test, size)

    
    #step 9: use a stochastic logistic regression model with l1 regularization
    lr = SGDClassifier(loss='log', penalty='l1')
    
    #the vectors are fitted as they are; scaling is optional and not applied here
    lr.fit(train_vecs, y_train)
    
    #report the accuracy on the held-out split
    print 'Test Accuracy: %.2f'%lr.score(test_vecs, y_test)