In [1]:
import pandas as pd

In [1]:
data_path = "leading_reviewer_data/"

In [2]:
from glob import glob
reviewer_path_list = glob(data_path+"*/")

In [79]:
import re
import os
import numpy as np

In [382]:
# only use 10 reviews for each reviewer (pick out the 10 largest size of reviews)
reviewers = []
reviewers_review = []
reviewers_literature = []
for reviewer_path in reviewer_path_list:
    reviewers.append(re.search(r'leading_reviewer_data\\([a-zA-Z_-]*)\\',reviewer_path).group(1))
    current_reviewer_review = [f for f in os.listdir(reviewer_path) if '.txt' in f]
    # select the 10 largest size of reviews
    current_selected_review_index = np.argsort([os.stat(reviewer_path+f).st_size for f in os.listdir(reviewer_path) if '.txt' in f])[::-1][:10]
    current_reviewer_review_selected = list(np.array(current_reviewer_review)[ind])
    # get the review texts
    review_texts = []
    for nm in current_reviewer_review_selected:
        with open(reviewer_path+nm, encoding="utf8") as f:
            text_to_append = f.read().replace('\n',' ')
            review_texts.append(text_to_append)
    reviewers_review.append(review_texts)
    
    current_reviewer_literature = [f for f in os.listdir(reviewer_path+"training_data\\") if '.txt' in f]
    # get the literature texts
    literature_texts = []
    for nm in current_reviewer_literature:
        with open(reviewer_path+"training_data\\"+nm, encoding="utf8") as f:
            text_to_append = f.read().replace('\n',' ')
            literature_texts.append(text_to_append)
    reviewers_literature.append(literature_texts)

# Build training set, development set and test set
## Randomly picking the test data from the whole dataset
total: 25 reviewers

each reviewer has 10 reviews and 10 literatures.

Test set: half of the reviews (each reviewer has 5 reviews)

Development set: the other half of the reviews

Training set: all the literatures

In [396]:
np.random.seed(100)
dev_set = []
test_set = []
for reviewer in reviewers_review:
    # copy() is important here, or the reviewers_review will be shuffled
    reviewer_cp = reviewer.copy()
    np.random.shuffle(reviewer_cp)
    dev_set.append(reviewer_cp[:5])
    test_set.append(reviewer_cp[5:])

# Features

### Lexical and punctuation features

Lexical features:

The average number of words per sentence

Sentence length variation

Lexical diversity, which is a measure of the richness of the author’s vocabulary

Punctuation features:

Average number of commas, semicolons and colons per sentence

In [153]:
# create feature vectors
def LexicalFeatures(reviews_texts):
    """
    Compute feature vectors for word and punctuation features
    """
    num_reviews = len(reviews_texts)
    fvs_lexical = np.zeros((len(reviews_texts), 3), np.float64)
    fvs_punct = np.zeros((len(reviews_texts), 3), np.float64)
    for e, ch_text in enumerate(reviews_texts):
        # note: the nltk.word_tokenize includes punctuation
        tokens = nltk.word_tokenize(ch_text.lower())
        words = word_tokenizer.tokenize(ch_text.lower())
        sentences = sentence_tokenizer.tokenize(ch_text)
        vocab = set(words)
        words_per_sentence = np.array([len(word_tokenizer.tokenize(s))
                                   for s in sentences])
 
        # average number of words per sentence
        fvs_lexical[e, 0] = words_per_sentence.mean()
        # sentence length variation
        fvs_lexical[e, 1] = words_per_sentence.std()
        # Lexical diversity
        fvs_lexical[e, 2] = len(vocab) / float(len(words))
 
        # Commas per sentence
        fvs_punct[e, 0] = tokens.count(',') / float(len(sentences))
        # Semicolons per sentence
        fvs_punct[e, 1] = tokens.count(';') / float(len(sentences))
        # Colons per sentence
        fvs_punct[e, 2] = tokens.count(':') / float(len(sentences))

    # apply whitening to decorrelate the features
    fvs_lexical = whiten(fvs_lexical)
    fvs_punct = whiten(fvs_punct)
    
    return fvs_lexical, fvs_punct

## bag of words feature

bag of words feature needs to build the corpus from training data

In [763]:
all_training_texts = [' '.join(sublib) for sublib in reviewers_literature]
all_training_texts = ' '.join(all_training_texts)
all_tokens = nltk.word_tokenize(all_training_texts)
# get rid of punctuations or other special markers
all_tokens = [token.lower() for token in all_tokens if token.isalpha()]

# word_to_ix maps each word in the vocab to a unique integer, which will be its index into the Bag of words vector
word_to_ix = {}
for word in all_tokens:
    if word not in word_to_ix:
        word_to_ix[word] = len(word_to_ix)
        
VOCAB_SIZE = len(word_to_ix)

NUM_TOP_WORDS = 20
# get most common words in the whole training set
fdist = nltk.FreqDist(all_tokens)
vocab_top = fdist.most_common(NUM_TOP_WORDS)
vocab_selected = [word_to_ix[voc[0]] for voc in vocab_top]

In [764]:
# the input_text can only be one literature
def BagOfWords(input_text, word_to_ix, vocab_selected):
    """
    Compute the bag of words feature vectors, based on the most common words
     in the whole training set
    """
    vec = torch.zeros(len(word_to_ix))
    tokens = nltk.word_tokenize(input_text)
    # get rid of punctuations or other special markers
    tokens = [token.lower() for token in tokens if token.isalpha()]
    for word in tokens:
        if word in word_to_ix:
            vec[word_to_ix[word]] += 1
    # the bag of word vector
    vec = vec.view(1, -1)
    # the bow vector for the selected top words
    fvs_bow = torch.index_select(vec,1,torch.tensor(vocab_selected))
    fvs_bow /= torch.sum(vec)
    return fvs_bow

## Syntactic features

In [765]:
def SyntacticFeatures(reviews_texts):
    """
    Extract feature vector for part of speech frequencies
    """
    # get part of speech for each token in each review
    def token_to_pos(ch):
        tokens = nltk.word_tokenize(ch)
        return [p[1] for p in nltk.pos_tag(tokens)]
    review_pos = [token_to_pos(ch) for ch in reviews_texts]
 
    # count frequencies for common POS types
    pos_list = ['NN', 'NNP', 'DT', 'IN', 'JJ', 'NNS']
    fvs_syntax = np.array([[ch.count(pos) for pos in pos_list] for ch in review_pos]).astype(np.float64)
 
    # normalise by dividing each row by number of tokens in the review
    fvs_syntax /= np.c_[np.array([len(ch) for ch in review_pos])]
    
    return fvs_syntax

# Classifier

## Baseline: logistic regression

In [766]:
import nltk
from scipy.cluster.vq import whiten
from sklearn.feature_extraction.text import CountVectorizer
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

### training data

In [767]:
fvs_lexical_train = np.array([], dtype=np.int64).reshape(0,3)
fvs_punct_train = np.array([], dtype=np.int64).reshape(0,3)
fvs_bow_train = np.array([], dtype=np.int64).reshape(0,NUM_TOP_WORDS)
fvs_syntax_train = np.array([], dtype=np.int64).reshape(0,6)
true_label_train = np.array([], dtype=np.int64).reshape(0,1)
for lit in np.arange(len(reviewers_literature)):
    # lexical and punct
    fvs_lexical_train_cur, fvs_punct_train_cur = LexicalFeatures(reviewers_literature[lit])
    fvs_lexical_train = np.concatenate((fvs_lexical_train,fvs_lexical_train_cur))
    fvs_punct_train = np.concatenate((fvs_punct_train,fvs_punct_train_cur))
    # bag of words
    for per_lit in reviewers_literature[lit]:
        fvs_bow_train_cur = BagOfWords(per_lit, word_to_ix, vocab_selected)
        fvs_bow_train = np.concatenate((fvs_bow_train,fvs_bow_train_cur))
    # syntax
    fvs_syntax_train_cur = SyntacticFeatures(reviewers_literature[lit])
    fvs_syntax_train = np.concatenate((fvs_syntax_train,fvs_syntax_train_cur))
    # label
    true_label_train = np.concatenate((true_label_train, lit*np.ones(( len(reviewers_literature[lit]) ,1))))

In [768]:
fvs_train = np.concatenate((fvs_bow_train,fvs_syntax_train),axis=1)

## Logistic Regression classifier

In [769]:
import torch
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable

torch.manual_seed(100)
cuda = torch.device('cuda')

In [770]:
class LRClassifier(nn.Module):  # inheriting from nn.Module!

    def __init__(self, input_size, num_labels):
        # calls the init function of nn.Module.  Dont get confused by syntax,
        # just always do it in an nn.Module
        super(LRClassifier, self).__init__()

        # Define the parameters that you will need.  In this case, we need A and b,
        # the parameters of the affine mapping.
        # Torch defines nn.Linear(), which provides the affine map.
        # Make sure you understand why the input dimension is fvs_dim
        # and the output is num_labels!
        self.linear = nn.Linear(input_size, num_labels)

        # NOTE! The non-linearity log softmax does not have parameters! So we don't need
        # to worry about that here

    def forward(self, x):
        # Pass the input through the linear layer,
        # then pass that through log_softmax.
        # Many non-linearities and other functions are in torch.nn.functional
        out = self.linear(x)
        return out

In [783]:
NUM_LABELS = len(reviewers_literature)
INPUT_SIZE = fvs_train.shape[1]
input_vector = torch.from_numpy(fvs_train)
input_vector = input_vector.float()
input_vector = input_vector.to(cuda)
labels = torch.from_numpy(true_label_train)
labels = labels.view(-1)
labels = labels.long()
labels = labels.to(cuda)
model = LRClassifier(INPUT_SIZE, NUM_LABELS)
model = model.to(cuda)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

num_epochs = 500

# Training the Model
for epoch in range(num_epochs):    
    # Forward + Backward + Optimize
    optimizer.zero_grad()
    outputs = model(input_vector)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
        
    if epoch % 50 == 0:
        print ('Epoch: [%d/%d], Loss: %.4f' %(epoch+1, num_epochs, loss.data))

Epoch: [1/500], Loss: 3.2290
Epoch: [51/500], Loss: 3.1808
Epoch: [101/500], Loss: 3.1401
Epoch: [151/500], Loss: 3.0998
Epoch: [201/500], Loss: 3.0603
Epoch: [251/500], Loss: 3.0217
Epoch: [301/500], Loss: 2.9841
Epoch: [351/500], Loss: 2.9474
Epoch: [401/500], Loss: 2.9116
Epoch: [451/500], Loss: 2.8767


### development data

In [772]:
fvs_lexical_dev = np.array([], dtype=np.int64).reshape(0,3)
fvs_punct_dev = np.array([], dtype=np.int64).reshape(0,3)
fvs_bow_dev = np.array([], dtype=np.int64).reshape(0,NUM_TOP_WORDS)
fvs_syntax_dev = np.array([], dtype=np.int64).reshape(0,6)
true_label_dev = np.array([], dtype=np.int64).reshape(0,1)
for index in np.arange(len(dev_set)):
    # lexical and punct
    fvs_lexical_dev_cur, fvs_punct_dev_cur = LexicalFeatures(dev_set[index])
    fvs_lexical_dev = np.concatenate((fvs_lexical_dev,fvs_lexical_dev_cur))
    fvs_punct_dev = np.concatenate((fvs_punct_dev,fvs_punct_dev_cur))
    # bag of words
    for per_review in dev_set[index]:
        fvs_bow_dev_cur = BagOfWords(per_review, word_to_ix, vocab_selected)
        fvs_bow_dev = np.concatenate((fvs_bow_dev,fvs_bow_dev_cur))
    # syntax
    fvs_syntax_dev_cur = SyntacticFeatures(dev_set[index])
    fvs_syntax_dev = np.concatenate((fvs_syntax_dev,fvs_syntax_dev_cur))
    # label
    true_label_dev = np.concatenate((true_label_dev, index*np.ones(( len(dev_set[index]) ,1))))

In [773]:
fvs_dev = np.concatenate((fvs_bow_dev,fvs_syntax_dev),axis=1)

### Score use dev data

In [784]:
input_vector = torch.from_numpy(fvs_dev)
input_vector = input_vector.float()
input_vector = input_vector.to(cuda)
labels = torch.from_numpy(true_label_dev)
labels = labels.view(-1)
labels = labels.long()
labels = labels.to(cuda)

# Test the Model
correct = 0
total = 0

outputs = model(input_vector)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum()
    
print('Accuracy of the model on the %d development sets: %f %%' % (total , 100 * correct / total))

Accuracy of the model on the 125 development sets: 24.000000 %


### Test data and score

In [786]:
fvs_lexical_test = np.array([], dtype=np.int64).reshape(0,3)
fvs_punct_test = np.array([], dtype=np.int64).reshape(0,3)
fvs_bow_test = np.array([], dtype=np.int64).reshape(0,NUM_TOP_WORDS)
fvs_syntax_test = np.array([], dtype=np.int64).reshape(0,6)
true_label_test = np.array([], dtype=np.int64).reshape(0,1)
for index in np.arange(len(test_set)):
    # lexical and punct
    fvs_lexical_test_cur, fvs_punct_test_cur = LexicalFeatures(test_set[index])
    fvs_lexical_test = np.concatenate((fvs_lexical_test,fvs_lexical_test_cur))
    fvs_punct_test = np.concatenate((fvs_punct_test,fvs_punct_test_cur))
    # bag of words
    for per_review in test_set[index]:
        fvs_bow_test_cur = BagOfWords(per_review, word_to_ix, vocab_selected)
        fvs_bow_test = np.concatenate((fvs_bow_test,fvs_bow_test_cur))
    # syntax
    fvs_syntax_test_cur = SyntacticFeatures(test_set[index])
    fvs_syntax_test = np.concatenate((fvs_syntax_test,fvs_syntax_test_cur))
    # label
    true_label_test = np.concatenate((true_label_test, index*np.ones(( len(test_set[index]) ,1))))

In [787]:
fvs_test = np.concatenate((fvs_bow_test,fvs_syntax_test),axis=1)

In [795]:
input_vector = torch.from_numpy(fvs_test)
input_vector = input_vector.float()
input_vector = input_vector.to(cuda)
labels = torch.from_numpy(true_label_test)
labels = labels.view(-1)
labels = labels.long()
labels = labels.to(cuda)

# Test the Model
correct = 0
total = 0

outputs = model(input_vector)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum()
print('Accuracy of the model on the %d test sets: %f %%' % (total , 100 * correct / total))

Accuracy of the model on the 125 test sets: 20.000000 %


## SVM