**Spanish Data**

In [1]:
from xml.dom.minidom import parse as pr
# Parse the tagged tweets XML file into a DOM document.
tree = pr('general-tweets-train-tagged.xml')
# Root element of the document; the <tweet> elements are looked up under it.
tweetsTree = tree.documentElement

In [2]:
# Collect [content, polarity] pairs for every tweet that has readable content.
tweets = tweetsTree.getElementsByTagName("tweet")
data = []
for tweet in tweets:
    try:
        content = tweet.getElementsByTagName("content")[0].childNodes[0].data
    except (IndexError, AttributeError):
        # IndexError: no <content> element, or the element has no children.
        # AttributeError: the first child is not a text node (no .data).
        # Such tweets are skipped; any other error is allowed to surface
        # instead of being silently swallowed.
        continue
    # Polarity is the first <value> found under the first <sentiments> element.
    polarity = tweet.getElementsByTagName("sentiments")[0].getElementsByTagName("value")[0].childNodes[0].data
    data.append([content, polarity])

In [3]:
# Number of [content, polarity] pairs collected above.
print(len(data))

7218


**Generic Functions (run first)**

In [None]:
#get accuracy and F score
def print_accuracy_zscore(Y_pred, Y_test):
    """
    Print the confusion counts, the accuracy and the F score of binary predictions.

    Label 1 is the positive class; every other label counts as negative.
    The F score (harmonic mean of precision and recall) is printed under the
    label "Z score".

    Y_pred - indexable sequence of predicted labels.
    Y_test - indexable sequence of true labels, at least as long as Y_pred.

    Returns None; everything is written to stdout. A ratio whose denominator
    is zero (empty input, no predicted positives, no actual positives) is
    reported as 0.0 instead of raising ZeroDivisionError.
    """
    def ratio(numerator, denominator):
        # float() forces true division even when this cell is compiled
        # without `from __future__ import division` under Python 2.
        if not denominator:
            return 0.0
        return float(numerator) / denominator

    total = len(Y_pred)
    correct = 0
    scores = {'tp':0,'tn':0,'fp':0,'fn':0}

    for i in range(total):
        if Y_pred[i] == Y_test[i]:
            correct += 1

        # Confusion-matrix bucket for this sample.
        if Y_test[i] == 1:
            if Y_pred[i] == 1:
                scores['tp'] += 1
            else:
                scores['fn'] += 1
        else:
            if Y_pred[i] == 1:
                scores['fp'] += 1
            else:
                scores['tn'] += 1

    print(scores)
    test_acc = ratio(correct, total)
    precision = ratio(scores['tp'], scores['tp'] + scores['fp'])
    recall = ratio(scores['tp'], scores['tp'] + scores['fn'])
    z_score = ratio(2*precision*recall, precision+recall)
    # Single-argument print(...) produces the same text under Python 2 and 3.
    print("accuracy = %s" % test_acc)
    print("Z score = %s" % z_score)


**Model 1 - bow en-en**

In [None]:
from __future__ import division

import math
import numpy as np
import os
from itertools import chain, count
from collections import defaultdict
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords


# Global class labels.
POS_LABEL = 'pos'
NEG_LABEL = 'neg'
# Numeric target value used for each class label.
label_to_dict = {POS_LABEL:1, NEG_LABEL:0}

# Path to dataset. The ACLIMDB_DIR environment variable overrides the
# default location, so the notebook can run on another machine unedited.
PATH_TO_DATA = os.environ.get("ACLIMDB_DIR", "/Users/shamya/Documents/multiLingRep/aclImdb")
TRAIN_DIR = os.path.join(PATH_TO_DATA, "train")
TEST_DIR = os.path.join(PATH_TO_DATA, "test")

def word_to_index_util(vocab_path=None):
    """
    Build the word -> index mapping from a vocabulary file.

    vocab_path - path of a text file with one word per line; when omitted,
                 "imdb.vocab" inside PATH_TO_DATA is used.
    returns a dict mapping each distinct whitespace-stripped line to a unique
    index in range(number of distinct words). Indices follow the iteration
    order of a set of the words, so the word/index pairing is arbitrary.
    """
    if vocab_path is None:
        vocab_path = os.path.join(PATH_TO_DATA, "imdb.vocab")
    # The with-block closes the file handle once the vocabulary is read.
    with open(vocab_path) as vocab_file:
        vocab = [line.strip() for line in vocab_file]
    index_to_word = list(set(vocab))
    return dict((word, index) for index, word in enumerate(index_to_word))

def tokenize_doc(doc):
    """
    Build the bag-of-words representation of a document.

    doc - a string representing a document.
    returns a defaultdict(float) mapping each lower-cased, purely alphabetic
    token to the number of times it appears in doc.
    """
    word_splitter = RegexpTokenizer(r'\w+')
    counts = defaultdict(float)
    for raw_token in word_splitter.tokenize(doc):
        word = raw_token.lower()
        # Tokens containing digits or underscores are left out of the counts.
        if not word.isalpha():
            continue
        counts[word] += 1.0
    return counts

def string_to_vector(bow, word_to_index):
    """
    Convert a bag-of-words into a feature vector (count of words)

    bow           - mapping from word to its count in one document.
    word_to_index - mapping from vocabulary word to its position in the vector.
    returns a list of len(word_to_index) entries: position word_to_index[w]
    holds bow[w] for every word w known to the vocabulary, all other positions
    hold 0. Words missing from word_to_index are ignored.
    """
    feat_vec = [0] * len(word_to_index)
    for word in bow:
        # Explicit out-of-vocabulary check instead of a bare except, so that
        # unrelated errors are no longer hidden.
        index = word_to_index.get(word)
        if index is not None:
            feat_vec[index] = bow[word]
    return feat_vec
        
def process_data_np_array(dir_path, docs_per_class=701):
    """
    Converts input data into an np array for sklearn use

    dir_path       - directory containing one sub-directory of documents per
                     class label (POS_LABEL and NEG_LABEL).
    docs_per_class - maximum number of files read from each class directory.
                     The default of 701 matches the number of files the
                     previous `count > 700` cutoff let through; None reads
                     every file.
    returns a 2-D float array with one row per document. Column 0 holds the
    numeric class from label_to_dict, the remaining columns hold the word
    counts at the positions given by word_to_index_util(). Positive documents
    come first, then negative ones.
    """
    word_to_index = word_to_index_util()
    pos_path = os.path.join(dir_path, POS_LABEL)
    neg_path = os.path.join(dir_path, NEG_LABEL)
    print("Starting training with paths %s and %s" % (pos_path, neg_path))

    # First pass: decide which files are used, so that the matrix can be
    # allocated once instead of being re-copied by np.append per document.
    labelled_files = []
    for (p, label) in [ (pos_path, POS_LABEL), (neg_path, NEG_LABEL) ]:
        # os.listdir gives no ordering guarantee, so which files fall inside
        # the cutoff depends on the filesystem.
        for f in os.listdir(p)[:docs_per_class]:
            labelled_files.append((os.path.join(p, f), label_to_dict[label]))

    data = np.zeros((len(labelled_files), len(word_to_index)+1))
    for row, (file_path, target) in enumerate(labelled_files):
        with open(file_path, 'r') as doc:
            content = doc.read()
        data[row, 0] = target
        data[row, 1:] = string_to_vector(tokenize_doc(content), word_to_index)
    return data

train_data = process_data_np_array(TRAIN_DIR)
test_data = process_data_np_array(TEST_DIR)
print "%positive cases in training", (np.sum(train_data[:,0])/train_data.shape[0])



In [None]:
#test correctness of feature vectors
# word_to_index_util() is already defined in the Model 1 cell, which must have
# run for test_data to exist; it is reused here rather than redefined, so the
# two copies cannot drift apart.
word_to_index = word_to_index_util()
# Column 0 of test_data is the label, hence the +1 offset into the counts.
print(test_data[0,word_to_index["ashton"]+1]) #enter test word

In [None]:
#save data
np.save('train.npy',train_data)
# The test file must hold the test matrix, not a second copy of train_data.
np.save('test.npy',test_data)

In [None]:
#get in sklearn variables
#train_data = np.load('train.npy')
#test_data = np.load('test.npy')
# A seeded generator makes the row order, and so the fitted model, repeatable
# after Restart & Run All. Rows are still shuffled in place.
SHUFFLE_SEED = 0
shuffle_rng = np.random.RandomState(SHUFFLE_SEED)
shuffle_rng.shuffle(train_data)
shuffle_rng.shuffle(test_data)
# Column 0 is the class label; the remaining columns are the features.
X_train = train_data[:,1:]
Y_train = train_data[:,0]
X_test = test_data[:,1:]
Y_test = test_data[:,0]

In [None]:
#logistic regression
from sklearn import linear_model
# fit() returns the estimator itself, so construction and training are chained.
lr = linear_model.LogisticRegression().fit(X_train, Y_train)
Y_pred = lr.predict(X_test)
print_accuracy_zscore(Y_pred, Y_test)

**Model 2 - Embedding word2vec -> en-en**

In [None]:
#word2vec
import gensim
import numpy as np
import math

#Download https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
def w2vec_model(path='GoogleNews-vectors-negative300.bin'):
    """
    Load pre-trained word vectors stored in the binary word2vec format.

    path - location of the vectors file; defaults to the GoogleNews file
           expected in the working directory.
    returns the loaded gensim object.
    """
    print("LOADING WORD2VEC MODEL")
    # Newer gensim releases expose load_word2vec_format on KeyedVectors and no
    # longer on Word2Vec; fall back to Word2Vec for releases without
    # KeyedVectors.
    loader = getattr(gensim.models, 'KeyedVectors', gensim.models.Word2Vec)
    model = loader.load_word2vec_format(path, binary=True)
    print("LOADED WORD2VEC MODEL")
    return model
w2v_model = w2vec_model()

In [None]:
from __future__ import division
from sklearn import svm
#from sklearn.linear_models import LinearRegression as lr
import numpy as np
import os
#from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# Path to dataset. The ACLIMDB_DIR environment variable overrides the
# default location, so the notebook can run on another machine unedited.
PATH_TO_DATA = os.environ.get("ACLIMDB_DIR", "/Users/shamya/Documents/multiLingRep/aclImdb")
TRAIN_DIR = os.path.join(PATH_TO_DATA, "train")
TEST_DIR = os.path.join(PATH_TO_DATA, "test")
# Names of the per-class sub-directories inside TRAIN_DIR and TEST_DIR.
POS_LABEL = 'pos'
NEG_LABEL = 'neg'


def tokensOfDocument(doc):
    """
    Split a document into lower-cased tokens.

    doc - a string representing a document.
    returns the list of tokens matched by the word-character pattern, each
    converted to lower case, in document order.
    """
    word_splitter = RegexpTokenizer(r'\w+')
    return [piece.lower() for piece in word_splitter.tokenize(doc)]

def convertDocumentToVector(w2v_model, document, dimensionOfVector):
    """
    Turn a document into one fixed-length vector of word embeddings.

    w2v_model         - object supporting `token in w2v_model` and
                        `w2v_model[token]`, the latter giving a vector of
                        length dimensionOfVector.
    document          - a string representing a document.
    dimensionOfVector - length of the embedding vectors.
    returns a 1-D float array: the sum of the mean-centred embeddings of the
    tokens known to the model, divided by the total number of tokens (known
    or not). A document without any token gives the zero vector.
    """
    tokens = tokensOfDocument(document)
    Vector = np.zeros((dimensionOfVector,))
    if len(tokens) == 0:
        # Dividing by zero tokens would turn the whole vector into NaN.
        return Vector
    for token in tokens:
        if token in w2v_model:
            vec_temp = w2v_model[token]
            # Each embedding is centred on its own mean before being summed.
            Vector += vec_temp - np.mean(vec_temp)
    return Vector/len(tokens)

def processInputSet(inputDoc, w2v_model, inputAnswers, dimensionOfVector):
    """
    Pair the vector of one document with its label.

    returns (vector of inputDoc from convertDocumentToVector, inputAnswers);
    the label is passed through untouched.
    """
    return convertDocumentToVector(w2v_model, inputDoc, dimensionOfVector), inputAnswers

def process_data_np_array(dir_path, docs_per_class=701, dimension=300):
    """
    Converts input data into an np array for sklearn use

    dir_path       - directory containing one sub-directory of documents per
                     class (POS_LABEL and NEG_LABEL).
    docs_per_class - maximum number of files read from each class directory.
                     The default of 701 matches the number of files the
                     previous `count > 700` cutoff let through; None reads
                     every file.
    dimension      - length of the document vectors; defaults to 300.
    returns (data, output): data is a 2-D float array with one row of length
    `dimension` per document, output is the list of labels (1 for documents
    under POS_LABEL, 0 for those under NEG_LABEL) in the same order.

    Uses the module-level w2v_model to embed the documents.
    """
    rows = []
    output = []
    pos_path = os.path.join(dir_path, POS_LABEL)
    neg_path = os.path.join(dir_path, NEG_LABEL)
    print("Starting training with paths %s and %s" % (pos_path, neg_path))
    for (p, label) in [ (pos_path, 1), (neg_path, 0) ]:
        # os.listdir gives no ordering guarantee, so which files fall inside
        # the cutoff depends on the filesystem.
        for f in os.listdir(p)[:docs_per_class]:
            with open(os.path.join(p,f),'r') as doc:
                content = doc.read()
            X, Y = processInputSet(content, w2v_model, label, dimension)
            rows.append(X)
            output.append(Y)

    # Rows are stacked once at the end instead of re-copying the whole matrix
    # with np.append for every document. The reshape keeps the result 2-D even
    # when no file was read.
    data = np.array(rows, dtype=float).reshape(len(rows), dimension)
    print(data.shape)
    return data, output


trainX, trainY = process_data_np_array(TRAIN_DIR)
testX, testY = process_data_np_array(TEST_DIR)

clf = svm.SVC() #lr
clf.fit(trainX, trainY) 
y_pred = clf.predict(testX)
# Accuracy: labels are 0/1, so the mean absolute difference is the error rate.
# The earlier expression divided only the error count by len(testY) and
# subtracted that from len(testY), which is not a fraction.
print(1.0 - np.mean(np.abs(np.asarray(testY) - y_pred)))
print_accuracy_zscore(y_pred, testY)

**Model 3 - Embedding self-trained -> en-en**