In [1]:
import pandas as pd
import numpy as np
import glob
import os
import json
import csv
import math
import scipy.sparse
from scipy.sparse import csr_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from bs4 import BeautifulSoup
import requests
from sklearn.utils import shuffle
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.estimator_checks import check_estimator
from sklearn.base import clone
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from collections import defaultdict
from sklearn.model_selection import PredefinedSplit
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.estimator_checks import check_estimator
from scipy.sparse import hstack
import warnings
from joblib import dump, load
from sklearn.decomposition import LatentDirichletAllocation
from scipy.spatial.distance import cosine
import subprocess
import ast

In [4]:
# Show each distinct warning only the first time it is raised.
warnings.filterwarnings(action='once')

In [3]:
# Never truncate cell contents when DataFrames are displayed.
# `None` is the supported "no limit" value; the legacy `-1` sentinel was
# deprecated in pandas 1.0 and is rejected by later releases.
pd.set_option('display.max_colwidth', None)

## Base Model

In [4]:
class UnigramClassifier(BaseEstimator):
    """Logistic-regression classifier exposed as a scikit-learn estimator.

    Every constructor argument except ``vocabulary`` is forwarded unchanged
    to ``LogisticRegression`` when ``fit`` is called.

    Parameters
    ----------
    verbose, random_state, C, penalty, solver, class_weight, max_iter
        Passed straight through to ``LogisticRegression``.
    vocabulary : object, optional
        Stored on the estimator as-is; neither ``fit`` nor ``predict``
        reads it. Defaults to ``None`` so instances never share one mutable
        default object.

    Attributes
    ----------
    X_, y_
        The training data handed to ``fit``.
    clf_
        The fitted ``LogisticRegression``.
    """

    def __init__(self, verbose=10, random_state=0, C=10, penalty='l1',
                 solver='liblinear', class_weight='balanced', max_iter=500,
                 vocabulary=None
                ):
        # Parameters are stored untouched so that get_params / clone keep
        # working the way BaseEstimator expects.
        self.verbose = verbose
        self.random_state = random_state
        self.C = C
        self.penalty = penalty
        self.solver = solver
        self.class_weight = class_weight
        self.max_iter = max_iter
        self.vocabulary = vocabulary

    def fit(self, X, y):
        """Fit a fresh ``LogisticRegression`` on ``X`` / ``y`` and return ``self``."""
        self.X_, self.y_ = X, y

        self.clf_ = LogisticRegression(
            verbose=self.verbose,
            random_state=self.random_state,
            C=self.C,
            penalty=self.penalty,
            solver=self.solver,
            class_weight=self.class_weight,
            max_iter=self.max_iter
        )
        self.clf_.fit(self.X_, self.y_)

        return self

    def predict(self, X):
        """Return the fitted model's predictions for ``X``.

        Raises ``NotFittedError`` (a subclass of ``AttributeError``) when
        called before ``fit``.
        """
        check_is_fitted(self, 'clf_')
        return self.clf_.predict(X)

In [2]:
def preprocess_data(
    words, vocabulary=None,
    want_lc=False,
    want_gt=False, gt_vocab=None,
    want_gwc=False,
    want_ac=False
):
    """Turn a words table into model inputs.

    Parameters
    ----------
    words : DataFrame
        Word-level rows; the genre is read from the ``genre`` column of the
        first row.
    vocabulary : DataFrame, optional
        Frame indexed by word. When ``None`` it is built from the distinct
        content words of ``words``.
    want_lc, want_gt, want_gwc, want_ac : bool
        Switches for the optional feature groups, forwarded to
        ``build_features``.
    gt_vocab : DataFrame, optional
        Lemma vocabulary forwarded to ``build_features``.

    Returns
    -------
    (X, test_fold_ids, vocabulary, gt_vocab, y)
        Feature matrix, per-row fold ids, the word vocabulary, the lemma
        vocabulary returned by ``build_features``, and the boolean labels.
    """
    genre = words.iloc[0]['genre']
    print(genre)

    # sparse_featurize and build_lc_feature both emit rows in word_id order,
    # whereas labels and fold ids follow the order of this frame. Sorting once
    # up front keeps every output row-aligned even when the input is not
    # already ordered by word_id (a stable sort leaves sorted input untouched).
    content_words = build_content_words(words).sort_values(
        'word_id', kind='mergesort')

    test_fold_ids = get_test_fold_ids(content_words, genre)
    assert len(content_words) == len(test_fold_ids)

    y = build_labels(content_words)

    if vocabulary is None:
        vocabulary = content_words[['word']].drop_duplicates().set_index('word')

    X, gt_vocab = build_features(
        content_words, vocabulary, genre,
        want_lc,
        want_gt, gt_vocab,
        want_gwc,
        want_ac
    )

    # The former `shuffle(X, y, random_state=0)` call was removed: shuffle()
    # returns shuffled copies and its result was discarded, so it only cost a
    # copy. Rows deliberately stay in dataset order; permuting X and y alone
    # would break their alignment with test_fold_ids.
    return X, test_fold_ids, vocabulary, gt_vocab, y

def build_content_words(all_words):
    """Return the rows of ``all_words`` whose ``word_type`` is a content-word tag.

    Tag inventory: http://ucrel.lancs.ac.uk/bnc2/bnc2guide.htm#m2adv
    Forms of 'be', 'do', 'have' and modal verbs are left out, as suggested in
    https://github.com/EducationalTestingService/metaphor/tree/master/content-words
    """
    content_tags = {
        # Plain tags.
        'NN0', 'NN1', 'NN2', 'NP0',
        'VVB', 'VVD', 'VVG', 'VVI', 'VVN', 'VVZ',
        'AJ0', 'AJC', 'AJS',
        'AV0', 'AVP', 'AVQ',
        # Hyphenated tags.
        'NN1-NP0', 'NN1-VVB', 'NN1-VVG', 'NN2-VVZ', 'NP0-NN1',
        'VVB-NN1', 'VVD-AJ0', 'VVD-VVN', 'VVG-AJ0', 'VVG-NN1', 'VVZ-NN2',
        'AJ0-NN1', 'AJ0-VVG', 'AJ0-VVN',
        'AV0-AJ0',
    }

    is_content_word = all_words['word_type'].isin(content_tags)
    return all_words[is_content_word]

def get_test_fold_ids(words, genre):
    """Give every row the cross-validation fold of the text it belongs to.

    Texts are enumerated in ``groupby('text_id')`` order and dealt out
    round-robin over the genre's fold count, so all rows of one text share a
    fold. Returns a Series aligned with ``words``.
    """
    folds_per_genre = {
        'newspapers': 10,
        'conversation': 10,
        'academic': 10,
        'fiction': 11
    }

    # A defaultdict so that ids which never appear as a group key map to fold 0.
    fold_of_text = defaultdict(int)
    fold_of_text.update(
        (text_id, position % folds_per_genre[genre])
        for position, (text_id, _group) in enumerate(words.groupby('text_id'))
    )

    return words['text_id'].apply(lambda text_id: fold_of_text[text_id])

def build_labels(words):
    """Return a boolean Series that is True where ``function`` equals 'mrw'."""
    return words['function'].eq('mrw')

def build_features(
    dataset, vocabulary, genre,
    want_lc,
    want_gt, gt_vocab,
    want_gwc,
    want_ac
):
    """Assemble the sparse feature matrix for ``dataset``.

    The context-unigram features from ``sparse_featurize`` always come first;
    each enabled switch appends further columns on the right, in the order
    lexical chain, global topic, GWC, AC.

    Returns
    -------
    (X, gt_vocab)
        The CSR feature matrix and the lemma vocabulary. ``gt_vocab`` is
        passed through unchanged unless ``want_gt`` is set, in which case it
        is whatever ``build_gt_feature`` returns.
    """
    X, _ = sparse_featurize(dataset, vocabulary)
    print(X.shape)

    if want_lc:
        lc_feature = (
            build_lc_feature(dataset, genre)
            .merge(dataset)['is_in_longest_chain']
            # Words without a chain come out of the left merge as NaN.
            .fillna(False)
            # Pin the dtype: fillna on an object column is not guaranteed to
            # yield bool in every pandas version, and csr_matrix rejects
            # object arrays.
            .astype(bool)
        )
        print('Words in a longest chain:', int(lc_feature.sum()))
        X = _append_columns(X, lc_feature)

    if want_gt:
        gt_feature, gt_vocab = build_gt_feature(dataset, genre, gt_vocab)
        X = _append_columns(X, gt_feature)

    if want_gwc:
        X = _append_columns(X, build_gwc_feature(dataset))

    if want_ac:
        X = _append_columns(X, build_ac_feature(dataset))

    print(X.shape)
    return X, gt_vocab


def _append_columns(X, values):
    """Stack ``values`` to the right of ``X``; 1-D input becomes one column."""
    block = np.asarray(values)
    if block.ndim == 1:
        block = block[:, None]
    return hstack([X, csr_matrix(block)], format='csr')

## Context Unigram

In [4]:
def sparse_featurize(words, vocabulary):
    """Build binary context-unigram features, one row per word.

    Rows follow ``words`` sorted by ``word_id``. A row has a 1 in the column
    of every vocabulary entry that occurs in the same sentence, excluding
    tokens equal to the row's own word. Context words missing from
    ``vocabulary`` share the extra last column.

    Parameters
    ----------
    words : DataFrame
        Needs ``word_id``, ``sentence_id`` and ``word`` columns.
    vocabulary : DataFrame
        Indexed by word; the index is expected to hold no duplicates.

    Returns
    -------
    (matrix, sentences)
        ``matrix`` is a CSR matrix of shape
        ``(len(words), len(vocabulary) + 1)``; ``sentences`` is a Series
        mapping each ``sentence_id`` to the list of its words.
    """
    words = words.sort_values('word_id')

    iterable_sentences = words.groupby('sentence_id')['word'].apply(list)

    # Resolve each sentence token to its column once, instead of once per
    # (word, context word) pair inside the loop below.
    unknown_column = len(vocabulary)
    column_of = {token: position for position, token in enumerate(vocabulary.index)}
    sentence_columns = {
        sentence_id: [(token, column_of.get(token, unknown_column)) for token in tokens]
        for sentence_id, tokens in iterable_sentences.items()
    }

    indices = []
    indptr = [0]
    for i, word, sentence_id in words[['word', 'sentence_id']].itertuples():
        # Distinct columns of the other words in the sentence; sorted so the
        # CSR row is in canonical order.
        row_columns = sorted({
            column
            for token, column in sentence_columns[sentence_id]
            if token != word
        })
        indices.extend(row_columns)
        indptr.append(len(indices))

        if not i % 2000:
            print('-', end='', flush=True)

    data = np.ones(len(indices), dtype=int)
    return (
        csr_matrix((data, indices, indptr), dtype=int, shape=(len(words), len(vocabulary)+1)),
        iterable_sentences
    )

## Lexical Chain

In [16]:
def build_paragraphs(dataset):
    """Collect the sentences and paragraph texts that ``dataset`` touches.

    Reads ``data/raw/sentences.csv`` and keeps the sentences that merge with
    ``dataset`` on their shared columns.

    Returns
    -------
    (sentences, paragraphs)
        ``sentences`` holds the distinct
        ``sentence_id`` / ``paragraph_id`` / ``sentence`` rows. ``paragraphs``
        is a Series indexed by ``paragraph_id`` whose values are that
        paragraph's sentences joined with single spaces; sentences whose
        ``paragraph_id`` is -1 are left out of it.
    """
    corpus_sentences = pd.read_csv('data/raw/sentences.csv', encoding='ISO-8859-1')

    wanted_columns = ['sentence_id', 'paragraph_id', 'sentence']
    sentences = corpus_sentences.merge(dataset)[wanted_columns].drop_duplicates()

    has_paragraph = sentences['paragraph_id'] != -1
    paragraphs = (
        sentences[has_paragraph]
        .groupby('paragraph_id')['sentence']
        .agg(list)
        .apply(' '.join)
    )
    return sentences, paragraphs


def save_paragraphs(paragraphs):
    """Write each paragraph to ``data/paragraphs/paragraph_<id>.txt``.

    The directory is created when missing, and files left over from an
    earlier run are removed first so it only holds the current paragraphs.

    Parameters
    ----------
    paragraphs : Series
        Paragraph text keyed by paragraph id.
    """
    # glob.glob() returns [] for a missing directory rather than raising, so
    # the directory has to be created explicitly.
    os.makedirs('data/paragraphs', exist_ok=True)

    for stale_file in glob.glob('data/paragraphs/*'):
        if os.path.isfile(stale_file):
            os.remove(stale_file)

    # Series.items() replaces iteritems(), which pandas 2.0 removed.
    for i, paragraph in paragraphs.items():
        file_name = "data/paragraphs/paragraph_{}.txt".format(i)
        with open(file_name, 'w', encoding='utf-8') as f:
            print('Writing {}'.format(file_name))
            f.write(paragraph)


def get_longest_chains(genre):
    """Run the ELKB LexicalChain tool on every saved paragraph.

    Each ``data/paragraphs/paragraph*.txt`` file is passed to
    ``java applications/LexicalChain`` (run from the ``ELKB/`` directory).
    For every paragraph whose output contains at least one chain, a
    ``paragraph_id,longest_chain`` row is written to
    ``data/longest_chains.csv``; ``longest_chain`` is the space-joined words
    of all chains tied for the greatest length.

    Parameters
    ----------
    genre : str
        Not used; the input and output paths do not depend on it.
    """
    paragraph_files = sorted(glob.glob('data/paragraphs/paragraph*.txt'))

    # newline='' lets the csv module control line endings itself.
    with open('data/longest_chains.csv', 'w', encoding='utf-8', newline='') as f:
        csv_writer = csv.writer(f, delimiter=',')
        csv_writer.writerow(('paragraph_id', 'longest_chain'))

        for file_name in paragraph_files:
            paragraph_id = os.path.splitext(os.path.basename(file_name))[
                0].split('_')[-1]

            # An argument list works on every platform; a single command
            # string without shell=True is taken as the program name on POSIX.
            result = subprocess.run(
                ['java', 'applications/LexicalChain', '-f', '../{}'.format(file_name)],
                cwd='ELKB/',
                stdout=subprocess.PIPE)

            output_lines = result.stdout.decode('utf-8').strip().split('\n')

            # Does not contain a lexical chain.
            # First 3 lines are descriptives by the script.
            if len(output_lines) < 4:
                continue

            longest_chain = _longest_chain_words(output_lines[3:])

            print('Writing longest chain for {} with {} words!\n'
                  .format(file_name, len(longest_chain)))
            csv_writer.writerow([paragraph_id, ' '.join(longest_chain)])


def _longest_chain_words(chain_lines):
    """Return the words of every chain tied for the greatest length, in order."""
    # A chain line is a comma-separated word list, optionally followed by a
    # '[' section that is dropped.
    chains = [
        [word.strip() for word in line.split('[')[0].split(',')]
        for line in chain_lines
    ]
    greatest_length = max((len(chain) for chain in chains), default=0)
    return [
        word
        for chain in chains
        if len(chain) == greatest_length
        for word in chain
    ]

In [18]:
def get_lexical_chains(dataset, sentences, paragraphs):
    """Flag, per word, whether it occurs in its paragraph's longest chain.

    Parameters
    ----------
    dataset : DataFrame
        Needs ``sentence_id``, ``word_id`` and ``word`` columns.
    sentences : DataFrame
        Maps ``sentence_id`` to ``paragraph_id``.
    paragraphs : Series
        Indexed by ``paragraph_id``; only its index is used, to restrict the
        chains to paragraphs of this dataset.

    Returns
    -------
    DataFrame
        Columns ``word_id``, ``longest_chain_list`` (the chain's words as a
        set) and ``is_in_longest_chain``. Only words from paragraphs listed
        in ``data/longest_chains.csv`` appear.
    """
    longest_chains = pd.read_csv('data/longest_chains.csv')  # Paragraphs with longest chains != ''

    chains_in_dataset = longest_chains[
        longest_chains['paragraph_id'].isin(paragraphs.index)
    ]

    # Join on the ids only. Merging in the paragraph text (a column that is
    # also called 'sentence') and then merging with `sentences` on all shared
    # columns required paragraph text == sentence text, which silently
    # dropped every multi-sentence paragraph.
    sentence_paragraphs = sentences[['sentence_id', 'paragraph_id']].drop_duplicates()
    words_and_longest_chains = (
        chains_in_dataset
        .merge(sentence_paragraphs, on='paragraph_id')
        .merge(dataset[['sentence_id', 'word_id', 'word']], on='sentence_id')
    )

    words_and_longest_chains['longest_chain_list'] = (
        words_and_longest_chains['longest_chain']
        .apply(lambda x: set(x.split(' ')))
    )

    words_and_longest_chains['is_in_longest_chain'] = [
        word in chain for (word, chain) in zip(
            words_and_longest_chains['word'],
            words_and_longest_chains['longest_chain_list']
        )
    ]
    return words_and_longest_chains[['word_id', 'longest_chain_list', 'is_in_longest_chain']]

def build_lc_feature(dataset, genre):
    """Attach the lexical-chain columns to ``dataset``.

    Left-merges the output of ``get_lexical_chains`` onto ``dataset``, so
    words with no chain information keep their row with missing values, and
    returns the result ordered by ``word_id``. ``genre`` is not used.
    """
    sentences, paragraphs = build_paragraphs(dataset)
    chain_membership = get_lexical_chains(dataset, sentences, paragraphs)

    return (
        dataset
        .merge(chain_membership, how='left')
        .sort_values('word_id')
    )

## Global Topic Distribution

In [7]:
# n_topics
def calc_t_given_d(doc_topic_dist):
    """Return each column's share of the grand total of ``doc_topic_dist``.

    ``doc_topic_dist`` is iterated row by row; the result is a list with one
    entry per column.
    """
    grand_total = sum(map(sum, doc_topic_dist))

    topic_shares = []
    for topic_column in zip(*doc_topic_dist):
        topic_shares.append(sum(topic_column) / grand_total)
    return topic_shares

# n_topics X n_vocabulary
def calc_w_given_t(components):
    """Normalize every row of ``components`` so that it sums to one."""
    topic_totals = components.sum(axis=1)
    return components / topic_totals.reshape(-1, 1)

# n_vocabulary
def calc_w(components):
    """Return each column's share of the total mass of ``components``."""
    total_mass = sum(map(sum, components))
    word_mass = components.sum(axis=0)
    return word_mass / total_mass

# n_topics
def calc_t(doc_topic_dist):
    """Return each column's share of the total mass of ``doc_topic_dist``."""
    total_mass = sum(map(sum, doc_topic_dist))
    topic_mass = doc_topic_dist.sum(axis=0)
    return topic_mass / total_mass

# n_vocabulary X n_topics
def calc_t_given_w(components, doc_topic_dist):
    """Apply Bayes' rule, P(t|w) = P(w|t) * P(t) / P(w), for every word.

    Returns a list with one tuple per column of ``components``; each tuple
    holds one value per row of ``components``.
    """
    word_given_topic = calc_w_given_t(components)
    word_prior = calc_w(components)
    topic_prior = calc_t(doc_topic_dist)

    # Rows are topics at this point; zip(*...) flips the result to one tuple
    # per word.
    topic_by_word = (word_given_topic / word_prior) * topic_prior[:, None]
    return list(zip(*topic_by_word))

# Turn lemmas of document to a document word matrix.
def document_word_matrix(words, vocabulary):
    """One-hot encode the ``lemma`` of every row of ``words``.

    Parameters
    ----------
    words : DataFrame
        Needs a ``lemma`` column.
    vocabulary : DataFrame
        Indexed by lemma; the index must hold no duplicates.

    Returns
    -------
    csr_matrix
        Shape ``(len(words), len(vocabulary) + 1)`` with exactly one 1 per
        row, in the lemma's vocabulary position. Lemmas missing from
        ``vocabulary`` use the extra last column.
    """
    n_rows = len(words)
    unknown_column = len(vocabulary)

    # get_indexer reports lemmas missing from the vocabulary as -1.
    columns = vocabulary.index.get_indexer(words['lemma'])
    columns = np.where(columns == -1, unknown_column, columns)

    # One stored value per row, so row r spans indptr[r]..indptr[r + 1].
    data = np.ones(n_rows, dtype=int)
    indptr = np.arange(n_rows + 1)

    return csr_matrix((data, columns, indptr), dtype=int, shape=(n_rows, unknown_column + 1))

In [None]:
def build_gt_feature(content_words, genre, vocab):
    """Compute the global-topic (GT) feature for every content word.

    For each paragraph, the genre's stored LDA model gives a topic
    distribution per word; every word is then scored with the cosine distance
    between its P(topic | word) vector and the paragraph's P(topic | document)
    vector.

    Parameters
    ----------
    content_words : DataFrame
        Needs ``sentence_id``, ``word_id`` and ``lemma`` columns.
    genre : str
        Selects the model file ``models/lda_<genre>.joblib``.
    vocab : DataFrame or None
        Lemma-indexed vocabulary; built from the paragraph words when ``None``.

    Returns
    -------
    (GT, vocab)
        ``GT`` is a Series aligned with ``content_words``; words outside any
        paragraph (``paragraph_id == -1``) get 0.
    """
    sentences = pd.read_csv('data/raw/sentences.csv', encoding='ISO-8859-1')[['sentence_id', 'paragraph_id']]
    dataset = (
        content_words
         .merge(sentences[sentences['paragraph_id'] != -1])
    )

    # NOTE(review): joblib.load unpickles arbitrary objects; only point this
    # at model files from a trusted source.
    lda_model = load('models/lda_{}.joblib'.format(genre))

    if vocab is None:
        vocab = dataset[['lemma']].drop_duplicates().set_index('lemma')

    unknown_index = len(vocab)  # Row used for lemmas missing from vocab.

    gt_feature = []
    for paragraph, group in dataset.groupby('paragraph_id'):
        sparse_matrix = document_word_matrix(group, vocab)
        paragraph_topic_dist = lda_model.transform(sparse_matrix)

        t_given_w = calc_t_given_w(lda_model.components_, paragraph_topic_dist)
        t_given_d = calc_t_given_d(paragraph_topic_dist)
        for word in group.itertuples():
            t_given_w_index = vocab.index.get_loc(word.lemma) if word.lemma in vocab.index else unknown_index
            # Compare the word's topic vector with the paragraph's; the
            # integer index itself used to be passed to cosine() here.
            # scipy's cosine() is a distance: 1 - cosine similarity.
            cos_dist = cosine(
                t_given_w[t_given_w_index],
                t_given_d
            )
            gt_feature.append([word.word_id, cos_dist])

        print('Calculating for paragraph {}'.format(paragraph))

    gt_feature = content_words.merge(
        pd.DataFrame(gt_feature, columns=['word_id', 'GT']),
        how='left'
    )
    print('Nulls:', len(gt_feature), len(gt_feature[gt_feature['GT'].isnull()]))
    return gt_feature['GT'].fillna(0), vocab

## GWC

In [2]:
def build_gwc_feature(dataset):
    """Look up the ``gwc`` value of every word in ``dataset``.

    Values come from ``data/words_with_frames.csv``, joined on ``word_id``.

    Raises
    ------
    AssertionError
        When the join does not yield exactly one row per dataset row.
    """
    words_with_frames = pd.read_csv('data/words_with_frames.csv')

    # Merge once; the full-frame debug display repeated this join.
    gwc_feature = dataset.merge(words_with_frames, on='word_id')['gwc']

    print(len(dataset), len(gwc_feature))
    assert len(dataset) == len(gwc_feature), \
        'words_with_frames.csv must hold exactly one row per word_id in the dataset'
    return gwc_feature

## AC

In [None]:
def build_ac_feature(dataset):
    """Build the AC feature block for every word in ``dataset``.

    Joins ``dataset`` on ``word_id`` with ``data/words_with_ac_ratings.csv``
    (column ``ac``) and with ``data/words_with_dep_acs.csv`` (one column per
    dependency type listed below).

    Returns
    -------
    DataFrame
        ``ac`` followed by the 43 dependency-type columns, one row per
        dataset row.

    Raises
    ------
    AssertionError
        When either join does not yield exactly one row per dataset row.
    """
    dep_types = ['amod', 'nsubj', 'obj', 'appos', 'flat', 'aux',
                 'acl_relcl', 'det', 'nummod', 'compound', 'case',
                 'nmod', 'mark', 'acl', 'obl', 'ccomp', 'nmod_poss',
                 'cc', 'conj', 'punct', 'advmod', 'nsubj_pass',
                 'aux_pass', 'cop', 'parataxis', 'advcl', 'compound_prt',
                 'expl', 'obl_tmod', 'det_predet', 'csubj', 'xcomp',
                 'iobj', 'discourse', 'list', 'vocative', 'nmod_tmod',
                 'obl_npmod', 'nmod_npmod', 'cc_preconj', 'fixed',
                 'goeswith', 'reparandum']

    assert len(dep_types) == 43

    words_with_ac_ratings = pd.read_csv('data/words_with_ac_ratings.csv')
    ac_feature = dataset.merge(words_with_ac_ratings, on='word_id')['ac']

    words_with_dep_acs = pd.read_csv('data/words_with_dep_acs.csv')
    dep_ac_feature = dataset.merge(words_with_dep_acs, on='word_id')[dep_types]
    # The comparison belongs outside len(); `len(columns == 43)` measured a
    # boolean array and was always truthy.
    assert len(dep_ac_feature.columns) == 43

    print(len(dataset), len(ac_feature))
    assert len(dataset) == len(ac_feature)
    # concat aligns on the index, so both joins must cover the same rows.
    assert len(dataset) == len(dep_ac_feature)
    return pd.concat([ac_feature, dep_ac_feature], axis=1)

## Get Results

In [None]:
def do_experiment(
    name,
    genre,
    want_lc=False,
    want_gt=False,
    want_gwc=False,
    want_ac=False
):
    """Train, tune and evaluate the classifier for one genre.

    Grid-searches ``C`` and ``penalty`` of ``UnigramClassifier`` on
    ``data/train/<genre>/words.csv`` using the predefined folds (scored by
    F1), evaluates the refit best model on ``data/test/<genre>/words.csv``,
    prints the classification report and writes it as JSON to
    ``results/<name>_<genre>.txt``.

    Parameters
    ----------
    name : str
        Experiment label used in the results file name.
    genre : str
        Selects the train/test files.
    want_lc, want_gt, want_gwc, want_ac : bool
        Optional feature groups, forwarded to ``preprocess_data``.
    """
    results_path = 'results/{}_{}.txt'.format(name, genre)
    # Fail early on an unusable output location, but do not open (and
    # truncate) the results file until there is a result to write.
    os.makedirs('results', exist_ok=True)

    feature_switches = dict(
        want_lc=want_lc, want_gt=want_gt, want_gwc=want_gwc, want_ac=want_ac)

    train_set = pd.read_csv('data/train/{}/words.csv'.format(genre), encoding='ISO-8859-1', na_filter=False)
    X_train, test_fold_ids, vocabulary, gt_vocab, y_train = preprocess_data(
        train_set, **feature_switches)

    unigram_classifier = UnigramClassifier(
        verbose=10, random_state=0, C=10, penalty='l1',
        solver='liblinear', class_weight='balanced', max_iter=500,
        vocabulary=vocabulary
    )

    C = np.logspace(-2,4,20)
    penalty = ['l1', 'l2']
    hyperparameters = {'C':C, 'penalty':penalty}
    clf = GridSearchCV(unigram_classifier, hyperparameters,
                       cv=PredefinedSplit(test_fold_ids),
                       verbose=10, scoring='f1', n_jobs=-1)

    clf.fit(X_train, y_train)

    # The test set reuses the vocabularies built on the training set so its
    # feature columns line up with the trained model.
    test_set = pd.read_csv('data/test/{}/words.csv'.format(genre), encoding='ISO-8859-1', na_filter=False)
    X_test, _, _, _, y_test = preprocess_data(
        test_set, vocabulary=vocabulary, gt_vocab=gt_vocab,
        **feature_switches)
    preds = clf.predict(X_test)

    result = classification_report(y_test, preds, output_dict=True)
    print(result)

    with open(results_path, 'w') as f:
        f.write(json.dumps(result))