In [2]:
import csv
import glob
import json
import math
import os
import subprocess
from collections import defaultdict

import numpy as np
import pandas as pd
import requests
import scipy.sparse
from bs4 import BeautifulSoup
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from sklearn.base import BaseEstimator
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.utils.estimator_checks import check_estimator
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

In [3]:
# Never truncate cell contents when a DataFrame is displayed.
# `None` is the supported "no limit" value; the legacy `-1` sentinel is
# deprecated since pandas 1.0 and rejected by newer releases.
pd.set_option('display.max_colwidth', None)

Model

In [4]:
class UnigramClassifier(BaseEstimator):
    """Logistic-regression classifier exposed as a scikit-learn estimator.

    Every constructor argument except ``vocabulary`` is forwarded unchanged to
    ``LogisticRegression`` when ``fit`` is called.  ``vocabulary`` is only
    stored on the instance; nothing in this class reads it.

    Parameters
    ----------
    verbose, random_state, C, penalty, solver, class_weight, max_iter
        Passed straight through to ``LogisticRegression``.
    vocabulary : object, default=None
        Stored as-is.  The default is ``None`` rather than a ``{}`` literal so
        that instances do not share one mutable default object.

    Attributes
    ----------
    X_, y_
        The training inputs exactly as passed to ``fit``.
    clf_ : LogisticRegression
        The fitted underlying model.
    """

    def __init__(self, verbose=10, random_state=0, C=10, penalty='l1',
                 solver='liblinear', class_weight='balanced', max_iter=500,
                 vocabulary=None
                ):
        # Parameters are stored untouched so get_params / set_params / clone
        # round-trip them unchanged.
        self.verbose = verbose
        self.random_state = random_state
        self.C = C
        self.penalty = penalty
        self.solver = solver
        self.class_weight = class_weight
        self.max_iter = max_iter
        self.vocabulary = vocabulary


    def fit(self, X, y):
        """Fit a fresh ``LogisticRegression`` on ``X`` / ``y`` and return ``self``."""
        self.X_, self.y_ = X, y

        self.clf_ = LogisticRegression(
            verbose=self.verbose,
            random_state=self.random_state,
            C=self.C,
            penalty=self.penalty,
            solver=self.solver,
            class_weight=self.class_weight,
            max_iter=self.max_iter
        )
        self.clf_.fit(self.X_, self.y_)

        return self


    def predict(self, X):
        """Return the fitted model's predictions for ``X``.

        Raises ``NotFittedError`` (a subclass of both ``ValueError`` and
        ``AttributeError``) when called before ``fit``.
        """
        check_is_fitted(self, 'clf_')
        return self.clf_.predict(X)

In [None]:
def preprocess_data(words, want_lc=False, run_elkb=True):
    """Turn a table of annotated words into model inputs.

    Parameters
    ----------
    words : pandas.DataFrame
        Word table; the genre is read from the ``genre`` value of its first row.
    want_lc : bool, default=False
        Forwarded to ``build_features``; when true a lexical-chain column is
        appended to the unigram features.
    run_elkb : bool, default=True
        Forwarded to ``build_features``.

    Returns
    -------
    X, test_fold_ids, vocabulary, y
        Feature matrix, per-row test-fold ids, the vocabulary frame (indexed by
        ``word``) and the boolean labels.  ``X``, ``y`` and ``test_fold_ids``
        are shuffled with the same permutation, so row ``k`` of each still
        describes the same word.
    """
    genre = words.iloc[0]['genre']

    content_words = build_content_words(words)

    test_fold_ids = get_test_fold_ids(content_words, genre)
    assert len(content_words) == len(test_fold_ids)

    y = build_labels(content_words)

    vocabulary = content_words[['word']].drop_duplicates().set_index('word')

    # NOTE(review): sparse_featurize orders its rows by `word_id`, while `y`
    # and `test_fold_ids` follow the row order of `content_words`; this
    # assumes `words` already arrives sorted by `word_id` -- confirm.
    X = build_features(content_words, vocabulary, genre, want_lc, run_elkb)

    # `shuffle` returns shuffled copies and leaves its inputs untouched, so the
    # result has to be captured.  All three row-aligned objects go through the
    # same call so they receive the same permutation.
    X, y, test_fold_ids = shuffle(X, y, test_fold_ids, random_state=0)
    return X, test_fold_ids, vocabulary, y

def build_content_words(all_words):
    """Return the rows of ``all_words`` whose ``word_type`` is a content-word tag.

    Tag inventory follows http://ucrel.lancs.ac.uk/bnc2/bnc2guide.htm#m2adv.
    Forms of 'be', 'do', 'have' and modal verbs are deliberately left out, as
    suggested in
    https://github.com/EducationalTestingService/metaphor/tree/master/content-words
    """
    # Tags are grouped by their leading prefix purely for readability.
    nn_np_tags = {
        'NN1', 'NN2', 'NN0', 'NP0',
        'NN1-NP0', 'NN1-VVB', 'NN1-VVG', 'NN2-VVZ', 'NP0-NN1',
    }
    vv_tags = {
        'VVB', 'VVD', 'VVG', 'VVI', 'VVN', 'VVZ',
        'VVB-NN1', 'VVD-VVN', 'VVD-AJ0', 'VVG-AJ0', 'VVG-NN1', 'VVZ-NN2',
    }
    aj_tags = {'AJ0', 'AJC', 'AJS', 'AJ0-NN1', 'AJ0-VVG', 'AJ0-VVN'}
    av_tags = {'AV0', 'AVQ', 'AVP', 'AV0-AJ0'}

    kept_tags = nn_np_tags | vv_tags | aj_tags | av_tags
    is_content_word = all_words['word_type'].isin(kept_tags)
    return all_words[is_content_word]

def get_test_fold_ids(words, genre):
    """Give every row of ``words`` the test-fold number of its text.

    Distinct ``text_id`` values are visited in ``groupby`` order and dealt
    round-robin into the number of folds configured for ``genre``; each row
    then receives the fold of its own ``text_id``.  Returns a Series aligned
    with ``words``.
    """
    folds_per_genre = {
        'newspapers': 10,
        'conversation': 10,
        'academic': 10,
        'fiction': 11
    }

    fold_of_text = defaultdict(int)
    text_ids = (text_id for text_id, _ in words.groupby('text_id'))
    for position, text_id in enumerate(text_ids):
        fold_of_text[text_id] = position % folds_per_genre[genre]

    def fold_for(text_id):
        return fold_of_text[text_id]

    return words['text_id'].apply(fold_for)

def build_labels(words):
    """Return a boolean Series: True where ``function`` equals ``'mrw'``."""
    is_mrw = words['function'].eq('mrw')
    return is_mrw

def build_features(dataset, vocabulary, genre, want_lc, run_elkb):
    """Assemble the feature matrix for ``dataset``.

    The sparse context-unigram matrix is always built.  When ``want_lc`` is
    true, the lexical-chain feature is appended as one extra column and the
    result is returned in CSR format.
    """
    unigram_matrix = sparse_featurize(dataset, vocabulary)[0]

    if not want_lc:
        return unigram_matrix

    # Turn the 1-D feature into a single-column matrix so it can be stacked.
    lc_column = np.array(build_lc_feature(dataset, genre, run_elkb)).reshape(-1, 1)
    return hstack([unigram_matrix, csr_matrix(lc_column)], format='csr')

Context Unigram

In [4]:
def sparse_featurize(words, vocabulary):
    """Build a binary sentence-context matrix with one row per word.

    Rows follow ``words`` sorted by ``word_id``.  For each word, the columns
    set to 1 are the vocabulary positions of the other words in the same
    sentence; every sentence word equal to the target word itself is skipped.
    Context words missing from ``vocabulary`` all share one extra
    out-of-vocabulary column at position ``len(vocabulary)``.

    Parameters
    ----------
    words : pandas.DataFrame
        Needs ``word_id``, ``sentence_id`` and ``word`` columns.
    vocabulary : pandas.DataFrame
        Frame whose index holds the vocabulary words; a word's position in the
        index is its column in the matrix.

    Returns
    -------
    (scipy.sparse.csr_matrix, pandas.Series)
        The ``(len(words), len(vocabulary) + 1)`` integer matrix, and the
        per-sentence word lists indexed by ``sentence_id``.
    """
    words = words.sort_values('word_id')

    iterable_sentences = words.groupby('sentence_id')['word'].apply(list)
    sentence_words = iterable_sentences.to_dict()

    # Loop-invariant lookups hoisted out of the inner loop: one dict access per
    # context word instead of an Index membership test plus `get_loc`.
    column_of = {token: column for column, token in enumerate(vocabulary.index)}
    oov_column = len(vocabulary)

    indices = []
    indptr = [0]
    rows = words[['word', 'sentence_id']].itertuples(index=False, name=None)
    for position, (word, sentence_id) in enumerate(rows):
        # A set, because each context column is either present (1) or absent.
        context_columns = {
            column_of.get(context_word, oov_column)
            for context_word in sentence_words[sentence_id]
            if context_word != word
        }

        # Sorted so the CSR column indices are deterministic and canonical.
        indices.extend(sorted(context_columns))
        indptr.append(len(indices))

        # Progress marker keyed on row position, not on the index label, so it
        # also works when the frame's index is not made of integers.
        if not position % 2000:
            print('-', end='', flush=True)

    data = np.ones(len(indices), dtype=int)
    return (
        csr_matrix((data, indices, indptr), dtype=int, shape=(len(words), len(vocabulary) + 1)),
        iterable_sentences
    )

Lexical Chain

In [5]:
def build_paragraphs(dataset):
    """Return paragraph texts for the sentences that occur in ``dataset``.

    Reads ``data/raw/sentences.csv``, keeps the sentences that merge with
    ``dataset`` (on their shared columns), discards those whose
    ``paragraph_id`` is -1, and joins each paragraph's sentences with single
    spaces.  The result is a Series of strings indexed by ``paragraph_id``.
    """
    sentence_table = pd.read_csv('data/raw/sentences.csv', encoding='ISO-8859-1')

    # One row per distinct sentence the given dataset contains.
    wanted_columns = ['sentence_id', 'paragraph_id', 'sentence']
    dataset_sentences = sentence_table.merge(dataset)[wanted_columns].drop_duplicates()

    has_paragraph = dataset_sentences['paragraph_id'] != -1
    sentences_by_paragraph = (
        dataset_sentences[has_paragraph]
        .groupby('paragraph_id')['sentence']
        .agg(list)
    )

    def join_with_spaces(parts):
        return ' '.join(parts)

    return sentences_by_paragraph.apply(join_with_spaces)


def save_paragraphs(paragraphs):
    """Write every paragraph to ``data/tmp/paragraphs/paragraph_<id>.txt``.

    The output directory is created when missing and any files already in it
    are removed first, so it only ever holds the paragraphs of the latest call.

    Parameters
    ----------
    paragraphs : pandas.Series
        Paragraph texts; each index label becomes the ``<id>`` in the file name.
    """
    # `glob.glob` returns an empty list for a missing directory instead of
    # raising, so the directory has to be created explicitly.
    os.makedirs('data/tmp/paragraphs', exist_ok=True)

    for stale_path in glob.glob('data/tmp/paragraphs/*'):
        if os.path.isfile(stale_path):
            os.remove(stale_path)

    # `Series.items()` replaces `iteritems()`, which pandas 2.0 removed.
    for i, paragraph in paragraphs.items():
        file_name = "data/tmp/paragraphs/paragraph_{}.txt".format(i)
        with open(file_name, 'w', encoding='utf-8') as f:
            print('Writing {}'.format(file_name))
            f.write(paragraph)


def get_longest_chains(genre):
    """Run the ELKB lexical chainer on every saved paragraph and record the result.

    For each ``data/tmp/paragraphs/paragraph*.txt`` file the Java program
    ``applications/LexicalChain`` is run from the ``ELKB/`` directory.  The
    words of the longest chain it prints (all chains tied for the maximum
    length, concatenated) are written as one row of
    ``data/tmp/longest_chains.csv`` with columns ``paragraph_id`` and
    ``longest_chain`` (space-separated words).  Paragraphs whose output holds
    no chain are skipped.

    Parameters
    ----------
    genre
        Unused; kept so existing callers keep working.
    """
    # newline='' is what the csv module requires of files it writes to;
    # without it rows end in '\r\r\n' on Windows.
    with open('data/tmp/longest_chains.csv', 'w', encoding='utf-8', newline='') as f:
        csv_writer = csv.writer(f, delimiter=',')
        csv_writer.writerow(('paragraph_id', 'longest_chain'))

        # Sorted so the row order of the CSV is reproducible.
        for file_name in sorted(glob.glob('data/tmp/paragraphs/paragraph*.txt')):
            # 'paragraph_<id>.txt' -> '<id>'
            paragraph_id = os.path.splitext(os.path.basename(file_name))[
                0].split('_')[-1]

            # Argument list instead of one command string: a plain string
            # without shell=True is treated as the program name on POSIX.
            result = subprocess.run(
                ['java', 'applications/LexicalChain', '-f', '../{}'.format(file_name)],
                cwd='ELKB/',
                stdout=subprocess.PIPE)

            output_lines = result.stdout.decode('utf-8').strip().split('\n')

            # Does not contain a lexical chain.
            # First 3 lines are descriptives by the script.
            if len(output_lines) < 4:
                continue

            longest_chain = _collect_longest_chains(output_lines[3:])
            longest_chain_str = ' '.join(longest_chain)

            row = [paragraph_id, longest_chain_str]
            print('Writing longest chain for {} with {} words!\n'
                  .format(file_name, len(longest_chain)))
            csv_writer.writerow(row)


def _collect_longest_chains(chain_lines):
    """Return the words of every chain tied for the greatest length, in input order.

    Each line is read as a comma-separated word list that ends at the first '['.
    """
    longest_words = []
    longest_length = 0
    for line in chain_lines:
        chain = [word.strip() for word in line.split('[')[0].split(',')]
        if len(chain) > longest_length:
            # Strictly longer chain found: it replaces everything kept so far.
            longest_words = list(chain)
            longest_length = len(chain)
        elif len(chain) == longest_length:
            longest_words.extend(chain)
    return longest_words

def get_lexical_chains(dataset):
    """Flag each word of ``dataset`` that occurs in its paragraph's longest chain.

    Reads ``data/tmp/longest_chains.csv`` and ``data/raw/sentences.csv``,
    links every word to its paragraph through ``sentence_id``, and checks
    whether the word is one of the space-separated words of that paragraph's
    longest chain.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Needs ``sentence_id``, ``word_id`` and ``word`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``word_id`` and ``is_in_longest_chain``; only words whose
        paragraph has a row in the chains file are included.
    """
    longest_chains = pd.read_csv('data/tmp/longest_chains.csv')
    # An empty chain string is read back as NaN; keep it a string so the
    # split below cannot fail.
    longest_chains['longest_chain'] = longest_chains['longest_chain'].fillna('')

    # NOTE(review): the previous version read `paragraphs` and `sentences`,
    # which are not defined in this function and only resolve if matching
    # notebook globals happen to exist.  The sentence -> paragraph mapping is
    # rebuilt here the same way build_paragraphs derives it -- confirm this
    # matches the intended join.
    all_sentences = pd.read_csv('data/raw/sentences.csv', encoding='ISO-8859-1')
    sentence_paragraphs = (
        all_sentences.merge(dataset)[['sentence_id', 'paragraph_id']]
        .drop_duplicates()
    )
    sentence_paragraphs = sentence_paragraphs[sentence_paragraphs['paragraph_id'] != -1]

    words_and_longest_chains = (
        longest_chains
        .merge(sentence_paragraphs, on='paragraph_id')
        .merge(dataset[['sentence_id', 'word_id', 'word']], on='sentence_id')
    )

    words_and_longest_chains['longest_chain_list'] = (
        words_and_longest_chains['longest_chain']
        .apply(lambda x: set(x.split(' ')))
    )

    words_and_longest_chains['is_in_longest_chain'] = [
        word in chain for (word, chain) in zip(
            words_and_longest_chains['word'],
            words_and_longest_chains['longest_chain_list']
        )
    ]

    return words_and_longest_chains[['word_id', 'is_in_longest_chain']]

def build_lc_feature(dataset, genre, run_elkb):
    """Return the boolean lexical-chain feature for ``dataset``, ordered by ``word_id``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Word table; must contain ``word_id``.
    genre
        Forwarded to ``get_longest_chains``.
    run_elkb : bool
        When true, the paragraphs are rebuilt, written to disk and re-chained
        before the feature is read; when false the existing chains file is
        used as it is.

    Returns
    -------
    pandas.Series
        Boolean Series: True where the word is in its paragraph's longest
        chain, False otherwise (including words with no chain information).
    """
    if run_elkb:
        # The paragraph texts are only needed to (re)generate the chains file.
        paragraphs = build_paragraphs(dataset)
        save_paragraphs(paragraphs)

        get_longest_chains(genre)

    lc_feature = get_lexical_chains(dataset)

    dataset_with_lc_feature = dataset.merge(lc_feature, how='left')

    # After the left merge the column mixes True/False with NaN and is
    # object-typed.  Cast explicitly instead of relying on fillna's implicit
    # downcast, so callers always receive a real boolean column.
    return (
        dataset_with_lc_feature
        .sort_values('word_id')['is_in_longest_chain']
        .fillna(False)
        .astype(bool)
    )

Topic modeling