In [1]:
import pandas as pd

In [2]:
data_path = "leading_reviewer_data/"

In [3]:
from glob import glob
reviewer_path_list = glob(data_path+"*/")

In [4]:
import re
import os
import numpy as np

In [5]:
SYSTEM_IS_WINDOW = False

In [6]:
# only use 10 reviews for each reviewer (pick out the 10 largest size of reviews)
reviewers = []
reviewers_review = []
reviewers_literature = []
for reviewer_path in reviewer_path_list:
    if SYSTEM_IS_WINDOW:
        reviewers.append(re.search(r'leading_reviewer_data\\([a-zA-Z_-]*)\\',reviewer_path).group(1))
    else:
        reviewers.append(re.search(r'leading_reviewer_data/([a-zA-Z_-]*)/',reviewer_path).group(1))
    current_reviewer_review = [f for f in os.listdir(reviewer_path) if '.txt' in f]
    # select the 10 largest size of reviews
    current_selected_review_index = np.argsort([os.stat(reviewer_path+f).st_size for f in os.listdir(reviewer_path) if '.txt' in f])[::-1][:10]
    current_reviewer_review_selected = list(np.array(current_reviewer_review))
    # get the review texts
    review_texts = []
    for nm in current_reviewer_review_selected:
        with open(reviewer_path+nm, encoding="utf8") as f:
            text_to_append = f.read().replace('\n',' ')
            review_texts.append(text_to_append)
    reviewers_review.append(review_texts)
    
    if SYSTEM_IS_WINDOW:
        current_reviewer_literature = [f for f in os.listdir(reviewer_path+"training_data\\") if '.txt' in f]
    else:
        current_reviewer_literature = [f for f in os.listdir(reviewer_path+"training_data/") if '.txt' in f]
    # get the literature texts
    literature_texts = []
    for nm in current_reviewer_literature:
        if SYSTEM_IS_WINDOW:
            text_nm = reviewer_path+"training_data\\"+nm
        else:
            text_nm = reviewer_path+"training_data/"+nm
        with open(text_nm, encoding="utf8") as f:
            text_to_append = f.read().replace('\n',' ')
            literature_texts.append(text_to_append)
    reviewers_literature.append(literature_texts)

# Build training set, development set and test set
## Randomly picking the test data from the whole dataset
total: 25 reviewers

each reviewer has 10 reviews and 10 literatures.

Test set: half of the reviews (each reviewer has 5 reviews)

Development set: the other half of the reviews

Training set: all the literatures

In [7]:
np.random.seed(100)
dev_set = []
test_set = []
for reviewer in reviewers_review:
    # copy() is important here, or the reviewers_review will be shuffled
    reviewer_cp = reviewer.copy()
    np.random.shuffle(reviewer_cp)
    dev_set.append(reviewer_cp[:5])
    test_set.append(reviewer_cp[5:])

# Features

In [8]:
import nltk
from scipy.cluster.vq import whiten
from sklearn.feature_extraction.text import CountVectorizer
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

## Bag of words feature

bag of words feature needs to build the corpus from review data

In [9]:
all_review_texts = [' '.join(sublib) for sublib in reviewers_review]
all_review_texts = ' '.join(all_review_texts)
all_tokens = nltk.word_tokenize(all_review_texts)

# word_to_ix maps each word in the vocab to a unique integer, which will be its index into the Bag of words vector
word_to_ix = {}
for word in all_tokens:
    if word not in word_to_ix:
        word_to_ix[word] = len(word_to_ix)
        
VOCAB_SIZE = len(word_to_ix)

NUM_TOP_WORDS = 1000
# get most common words in the whole review set
fdist = nltk.FreqDist(all_tokens)
vocab_top = fdist.most_common(NUM_TOP_WORDS)
vocab_selected = [word_to_ix[voc[0]] for voc in vocab_top]

In [10]:
# the input_text can only be one literature
def BagOfWords(input_text, word_to_ix, vocab_selected):
    """
    Compute the bag of words feature vectors, based on the most common words
     in reviews
    """
    vec = torch.zeros(len(word_to_ix))
    tokens = nltk.word_tokenize(input_text)
    for word in tokens:
        if word in word_to_ix:
            vec[word_to_ix[word]] += 1
    # the bag of word vector
    vec = vec.view(1, -1)
    # the bow vector for the selected top words
    fvs_bow = torch.index_select(vec,1,torch.tensor(vocab_selected))
    fvs_bow /= torch.sum(vec)
    return fvs_bow

## Bag of characters features (n-gram)

In [11]:
GRAM_N = 3

import nltk
from nltk.util import ngrams
tokenizer_with_tail_pukt = nltk.tokenize.RegexpTokenizer(r'[a-zA-Z0-9-.()\[\]{}]+[,.:?!\"\']?')

def NGramCharacter(input_text,GRAM_N):
    all_tokens = tokenizer_with_tail_pukt.tokenize(input_text)
    N_gram = []
    for token in all_tokens:
        grams = list(ngrams(token,GRAM_N))
        for gm in grams:
            N_gram.append(''.join(gm))
    return N_gram

Build N-gram character vocabulary from reivews

In [12]:
all_review_texts = [' '.join(sublib) for sublib in reviewers_review]
all_review_texts = ' '.join(all_review_texts)
N_gram_char = NGramCharacter(all_review_texts,GRAM_N)

In [13]:
len(N_gram_char)

559367

In [14]:
# ngram_to_ix maps each ngram in the vocab to a unique integer, which will be its index into the Bag of characters vector
ngram_to_ix = {}
for ngram in N_gram_char:
    if ngram not in ngram_to_ix:
        ngram_to_ix[ngram] = len(ngram_to_ix)
        
VOCAB_CHAR_SIZE = len(ngram_to_ix)

NUM_TOP_NGRAMS = 1000
# get most common ngrams in the whole review set
fdist = nltk.FreqDist(N_gram_char)
vocab_char_top = fdist.most_common(NUM_TOP_NGRAMS)
vocab_char_selected = [ngram_to_ix[voc[0]] for voc in vocab_char_top]


In [15]:
# the input_text can only be one literature
def BagOfCharacters(input_text, ngram_to_ix, vocab_char_selected):
    """
    Compute the bag of character feature vectors, based on the most common ngrams
     in reviews
    """
    GRAM_N = len(list(ngram_to_ix.keys())[0])
    
    vec = torch.zeros(len(ngram_to_ix))
    N_gram_char = NGramCharacter(input_text,GRAM_N)
    
    for ngram in N_gram_char:
        if ngram in ngram_to_ix:
            vec[ngram_to_ix[ngram]] += 1
    # the bag of chars vector
    vec = vec.view(1, -1)
    # the boc vector for the selected top chars
    fvs_boc = torch.index_select(vec,1,torch.tensor(vocab_char_selected))
    fvs_boc /= torch.sum(vec)
    return fvs_boc

## Syntactic features

In [16]:
tagdict = nltk.data.load('help/tagsets/upenn_tagset.pickle')
pos_list = list(tagdict.keys())

In [17]:
def SyntacticFeatures(reviews_texts, pos_list):
    """
    Extract feature vector for part of speech frequencies
    """
    # get part of speech for each token in each review
    def token_to_pos(ch):
        tokens = nltk.word_tokenize(ch)
        return [p[1] for p in nltk.pos_tag(tokens)]
    review_pos = [token_to_pos(ch) for ch in reviews_texts]
 
    # count frequencies for all POS types
    fvs_syntax = np.array([[ch.count(pos) for pos in pos_list] for ch in review_pos]).astype(np.float64)
 
    # normalise by dividing each row by number of tokens in the review
    fvs_syntax /= np.c_[np.array([len(ch) for ch in review_pos])]
    
    return fvs_syntax

# Classifier

## Baseline: logistic regression

In [56]:
import sklearn

In [18]:
import torch
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable

torch.manual_seed(100)

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

### training data

In [19]:
fvs_boc_train = np.array([], dtype=np.int64).reshape(0,NUM_TOP_NGRAMS)
fvs_bow_train = np.array([], dtype=np.int64).reshape(0,NUM_TOP_WORDS)
fvs_syntax_train = np.array([], dtype=np.int64).reshape(0,len(pos_list))
true_label_train = np.array([], dtype=np.int64).reshape(0,1)
for lit in np.arange(len(reviewers_literature)):
    # bag of words
    for per_lit in reviewers_literature[lit]:
        fvs_bow_train_cur = BagOfWords(per_lit, word_to_ix, vocab_selected)
        fvs_bow_train = np.concatenate((fvs_bow_train,fvs_bow_train_cur))
    # bag of chars (n-grams)
    for per_lit in reviewers_literature[lit]:
        fvs_boc_train_cur = BagOfCharacters(per_lit, ngram_to_ix, vocab_char_selected)
        fvs_boc_train = np.concatenate((fvs_boc_train,fvs_boc_train_cur))
    # syntax
    fvs_syntax_train_cur = SyntacticFeatures(reviewers_literature[lit], pos_list)
    fvs_syntax_train = np.concatenate((fvs_syntax_train,fvs_syntax_train_cur))
    # label
    true_label_train = np.concatenate((true_label_train, lit*np.ones(( len(reviewers_literature[lit]) ,1))))

In [58]:
fvs_boc_train = sklearn.preprocessing.scale(fvs_boc_train)
fvs_bow_train = sklearn.preprocessing.scale(fvs_bow_train)
fvs_syntax_train = sklearn.preprocessing.scale(fvs_syntax_train)

In [59]:
fvs_train = np.concatenate((fvs_bow_train,fvs_syntax_train,fvs_boc_train),axis=1)

## Logistic Regression classifier

In [21]:
class LRClassifier(nn.Module):  # inheriting from nn.Module!

    def __init__(self, input_size, num_labels):
        # calls the init function of nn.Module.  Dont get confused by syntax,
        # just always do it in an nn.Module
        super(LRClassifier, self).__init__()

        # Define the parameters that you will need.  In this case, we need A and b,
        # the parameters of the affine mapping.
        # Torch defines nn.Linear(), which provides the affine map.
        # Make sure you understand why the input dimension is fvs_dim
        # and the output is num_labels!
        self.linear = nn.Linear(input_size, num_labels)

        # NOTE! The non-linearity log softmax does not have parameters! So we don't need
        # to worry about that here

    def forward(self, x):
        # Pass the input through the linear layer,
        # then pass that through log_softmax.
        # Many non-linearities and other functions are in torch.nn.functional
        out = self.linear(x)
        return out

In [86]:
NUM_LABELS = len(reviewers_literature)
INPUT_SIZE = fvs_train.shape[1]
input_vector = torch.from_numpy(fvs_train)
input_vector = input_vector.float()
input_vector = input_vector.to(device)
labels = torch.from_numpy(true_label_train)
labels = labels.view(-1)
labels = labels.long()
labels = labels.to(device)
model = LRClassifier(INPUT_SIZE, NUM_LABELS)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

num_epochs = 1

# Training the Model
for epoch in range(num_epochs):    
    # Forward + Backward + Optimize
    optimizer.zero_grad()
    outputs = model(input_vector)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
        
    if epoch % 1 == 0:
        print ('Epoch: [%d/%d], Loss: %.4f' %(epoch+1, num_epochs, loss.data))

Epoch: [1/1], Loss: 3.3435


### Score use training data

In [87]:
# Test the Model
correct = 0
total = 0

outputs = model(input_vector)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum()
    
print('Accuracy of the model on the %d training sets: %f %%' % (total , 100 * correct / total))

Accuracy of the model on the 251 training sets: 99.000000 %


### development data

In [50]:
fvs_boc_dev = np.array([], dtype=np.int64).reshape(0,NUM_TOP_NGRAMS)
fvs_bow_dev = np.array([], dtype=np.int64).reshape(0,NUM_TOP_WORDS)
fvs_syntax_dev = np.array([], dtype=np.int64).reshape(0,len(pos_list))
true_label_dev = np.array([], dtype=np.int64).reshape(0,1)
for index in np.arange(len(dev_set)):
    # bag of words
    for per_review in dev_set[index]:
        fvs_bow_dev_cur = BagOfWords(per_review, word_to_ix, vocab_selected)
        fvs_bow_dev = np.concatenate((fvs_bow_dev,fvs_bow_dev_cur))
    # bag of chars (n-grams)
    for per_review in dev_set[index]:
        fvs_boc_dev_cur = BagOfCharacters(per_review, ngram_to_ix, vocab_char_selected)
        fvs_boc_dev = np.concatenate((fvs_boc_dev,fvs_boc_dev_cur))
    # syntax
    fvs_syntax_dev_cur = SyntacticFeatures(dev_set[index], pos_list)
    fvs_syntax_dev = np.concatenate((fvs_syntax_dev,fvs_syntax_dev_cur))
    # label
    true_label_dev = np.concatenate((true_label_dev, index*np.ones(( len(dev_set[index]) ,1))))

In [60]:
fvs_boc_dev = sklearn.preprocessing.scale(fvs_boc_dev)
fvs_bow_dev = sklearn.preprocessing.scale(fvs_bow_dev)
fvs_syntax_dev = sklearn.preprocessing.scale(fvs_syntax_dev)

In [61]:
fvs_dev = np.concatenate((fvs_bow_dev,fvs_syntax_dev,fvs_boc_dev),axis=1)

### Score use dev data

In [88]:
input_vector = torch.from_numpy(fvs_dev)
input_vector = input_vector.float()
input_vector = input_vector.to(device)
labels = torch.from_numpy(true_label_dev)
labels = labels.view(-1)
labels = labels.long()
labels = labels.to(device)

# Test the Model
correct = 0
total = 0

outputs = model(input_vector)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum()
    
print('Accuracy of the model on the %d development sets: %f %%' % (total , 100 * correct / total))

Accuracy of the model on the 125 development sets: 56.000000 %


### Test data and score

In [199]:
fvs_bow_test = np.array([], dtype=np.int64).reshape(0,NUM_TOP_WORDS)
fvs_syntax_test = np.array([], dtype=np.int64).reshape(0,len(pos_list))
true_label_test = np.array([], dtype=np.int64).reshape(0,1)
for index in np.arange(len(test_set)):
    # bag of words
    for per_review in test_set[index]:
        fvs_bow_test_cur = BagOfWords(per_review, word_to_ix, vocab_selected)
        fvs_bow_test = np.concatenate((fvs_bow_test,fvs_bow_test_cur))
    # syntax
    fvs_syntax_test_cur = SyntacticFeatures(test_set[index], pos_list)
    fvs_syntax_test = np.concatenate((fvs_syntax_test,fvs_syntax_test_cur))
    # label
    true_label_test = np.concatenate((true_label_test, index*np.ones(( len(test_set[index]) ,1))))

In [200]:
fvs_test = np.concatenate((fvs_bow_test,fvs_syntax_test),axis=1)

In [201]:
input_vector = torch.from_numpy(fvs_test)
input_vector = input_vector.float()
input_vector = input_vector.to(device)
labels = torch.from_numpy(true_label_test)
labels = labels.view(-1)
labels = labels.long()
labels = labels.to(device)

# Test the Model
correct = 0
total = 0

outputs = model(input_vector)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum()
print('Accuracy of the model on the %d test sets: %f %%' % (total , 100 * correct / total))

Accuracy of the model on the 199 test sets: 25.000000 %


## SVM

In [89]:
from sklearn import svm

In [97]:
clf = svm.LinearSVC(multi_class="ovr",C=0.01)
X = fvs_train
y = true_label_train.flatten()

In [98]:
clf.fit(X,y)

LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

### Score use training data

In [99]:
correct = 0
labels = true_label_train.flatten()

predicted = clf.predict(fvs_train)
total = labels.size
correct = (predicted == labels).sum()
print('Accuracy of the model on the %d training sets: %f %%' % (total , 100 * correct / total))

Accuracy of the model on the 251 training sets: 100.000000 %


### Score use dev data

In [100]:
# Test the Model
correct = 0
labels = true_label_dev.flatten()

predicted = clf.predict(fvs_dev)
total = labels.size
correct = (predicted == labels).sum()
print('Accuracy of the model on the %d development sets: %f %%' % (total , 100 * correct / total))

Accuracy of the model on the 125 development sets: 59.200000 %
