In [1]:
import pandas as pd
import numpy as np
import glob
import os
import json
import csv
import math
import scipy.sparse
from scipy.sparse import csr_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from bs4 import BeautifulSoup
import requests
from sklearn.utils import shuffle

In [2]:
pd.set_option('display.max_colwidth', -1)

Start by exploring academic training dataset.

In [3]:
all_words = pd.read_csv('data/train/academic/words.csv', encoding='ISO--8859-1', na_filter=False)
display(len(all_words))
all_words.head()

48964

Unnamed: 0.1,Unnamed: 0,word_id,word,lemma,word_type,function,seg_type,sentence_id,text_id,text_tag,genre
0,25070,25070,HER,she,DPS,,,1267,a6u-fragment02,a6u,academic
1,25071,25071,DRESS,dress,NN1,,,1267,a6u-fragment02,a6u,academic
2,25072,25072,HANGS,hang,VVZ,,,1267,a6u-fragment02,a6u,academic
3,25073,25073,HERE',here',NP0,,,1267,a6u-fragment02,a6u,academic
4,25074,25074,DE-FROCKING,de-frock,VVG,mrw,met,1267,a6u-fragment02,a6u,academic


## Keep only content words and filter stop verbs!

In [4]:
# Keeping content words: http://ucrel.lancs.ac.uk/bnc2/bnc2guide.htm#m2adv
# Does not include forms of 'be', 'do', 'have', or modal verbs,
# as suggested in https://github.com/EducationalTestingService/metaphor/tree/master/content-words
content_word_pos = set([
    'NN1', 'NN2', 'NN0',
    'NP0',
    'NN1-NP0', 'NN1-VVB', 'NN1-VVG', 'NN2-VVZ',
    'NP0-NN1',
    'VVB', 'VVD', 'VVG', 'VVI', 'VVN', 'VVZ',
    'VVB-NN1', 'VVD-VVN', 'VVD-AJ0', 'VVG-AJ0', 'VVG-NN1', 'VVZ-NN2',
    'AJ0', 'AJC', 'AJS',
    'AJ0-NN1', 'AJ0-VVG', 'AJ0-VVN',
    'AV0', 'AVQ', 'AVP',
    'AV0-AJ0'
])

In [5]:
content_words = all_words[all_words['word_type'].isin(content_word_pos)]
display(len(content_words))
content_words.head(10)

25360

Unnamed: 0.1,Unnamed: 0,word_id,word,lemma,word_type,function,seg_type,sentence_id,text_id,text_tag,genre
1,25071,25071,DRESS,dress,NN1,,,1267,a6u-fragment02,a6u,academic
2,25072,25072,HANGS,hang,VVZ,,,1267,a6u-fragment02,a6u,academic
3,25073,25073,HERE',here',NP0,,,1267,a6u-fragment02,a6u,academic
4,25074,25074,DE-FROCKING,de-frock,VVG,mrw,met,1267,a6u-fragment02,a6u,academic
6,25076,25076,KAHLO,kahlo,NP0,,,1267,a6u-fragment02,a6u,academic
7,25077,25077,CULT,cult,NN1,,,1267,a6u-fragment02,a6u,academic
8,25078,25078,Oriana,oriana,NP0-NN1,,,1268,a6u-fragment02,a6u,academic
9,25079,25079,Baddeley,baddeley,NP0,,,1268,a6u-fragment02,a6u,academic
13,25083,25083,witnessed,witness,VVN,mrw,met,1269,a6u-fragment02,a6u,academic
15,25085,25085,shift,shift,NN1-VVB,,,1269,a6u-fragment02,a6u,academic


## Sparsification

Create a vocabulary using content words. This will be used as the unigram features.

In [6]:
vocabulary = content_words[['word']].drop_duplicates().set_index('word')

display(len(vocabulary))
vocabulary.sample(10)

7327

operated
Jackson
underpin
Flowerdew
out
Rennes
patterns
warrant
promoted
Code


In [12]:
def sparse_featurize(words, vocabulary):
    print('Build iterable sentences.')
    iterable_sentences = words.groupby('sentence_id')['word'].apply(list)
    display(len(iterable_sentences))
    display(iterable_sentences[:5])

    indices = []
    data = []
    indptr = [0]
    for i, word, sentence_id in words[['word', 'sentence_id']].itertuples():
        sentence = iterable_sentences[sentence_id]

        # Used to generate a row of 0's and 1's for sparse matrix
        unique_unigram_indices = set()
        for context_word in sentence:
            if context_word == word:
                continue
            unique_unigram_indices.add(vocabulary.index.get_loc(context_word))

        # Data = 1 at column index of sparse matrix = row index of word in vocabulary.
        indices.extend(unique_unigram_indices)  # Indices of the data values in this row.
        data.extend(np.ones(len(unique_unigram_indices), dtype=int))  # Data of all one's.
        indptr.append(indptr[-1]+len(unique_unigram_indices))

        if not i % 2000:
            print('-', end='', flush=True)

    return (
        csr_matrix((data, indices, indptr), dtype=int, shape=(len(words), len(vocabulary))),
        iterable_sentences
    )

In [13]:
words = content_words
sparse_unigram_features, iterable_sentences = sparse_featurize(words, vocabulary)

scipy.sparse.save_npz('data/sparse_unigram_features.npz', sparse_unigram_features)
display(sparse_unigram_features)
sparse_unigram_features.shape

'Build iterable sentences.'

1961

sentence_id
1267    [DRESS, HANGS, HERE', DE-FROCKING, KAHLO, CULT]                                                                               
1268    [Oriana, Baddeley]                                                                                                            
1269    [witnessed, shift, art, establishment, attitudes, art, produced, traditional, parameters]                                     
1270    [work, previously, marginalised, artists, become, area, rich, speculation, art, dealers, priced, modern, masters, market]     
1271    [Almost, year, witnessed, discovery, new, artistic, terrain, graffiti, art, Soviet, art, Australian, art, art, Latin, America]
Name: word, dtype: object

-----------

<25360x7327 sparse matrix of type '<class 'numpy.int32'>'
	with 386398 stored elements in Compressed Sparse Row format>

(25360, 7327)

Inspect if sparse features are valid.

In [9]:
sparse_unigram_features = scipy.sparse.load_npz('data/sparse_unigram_features.npz')

In [11]:
select_word_i = 210

selected_word = words.iloc[select_word_i]
display(selected_word)
sentence_of_selected_word = sorted(list(set(iterable_sentences.loc[selected_word.sentence_id])))
display(sentence_of_selected_word)

unigram_select_indices = np.nonzero(sparse_unigram_features[select_word_i])[:50][1]  # Nonzero indices indicating word.
display(unigram_select_indices)
sparse_matrix_sentence_selections = sorted(list(vocabulary.iloc[unigram_select_indices].index))
display(sparse_matrix_sentence_selections)

Unnamed: 0     25449         
word_id        25449         
word           passionate    
lemma          passionate    
word_type      AJ0           
function                     
seg_type                     
sentence_id    1282          
text_id        a6u-fragment02
text_tag       a6u           
genre          academic      
Name: 379, dtype: object

['Diego',
 'Rivera',
 'appearance',
 'come',
 'dominate',
 'emotional',
 'flamboyant',
 'husband',
 'obsession',
 'pain',
 'passionate',
 'physical',
 'responses',
 'work']

array([ 70,  71, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180,  16])

['Diego',
 'Rivera',
 'appearance',
 'come',
 'dominate',
 'emotional',
 'flamboyant',
 'husband',
 'obsession',
 'pain',
 'physical',
 'responses',
 'work']

Looks good!

## Train

In [11]:
labels = words['function'] == 'mrw'
display(len(labels))
display(labels.sample(10))
words[labels].head()

30545

26579    False
38668    True 
2617     True 
38976    False
12019    False
16848    False
42138    False
40767    False
58870    True 
24636    False
Name: function, dtype: bool

Unnamed: 0,word_id,word,lemma,word_type,function,seg_type,sentence_id,text_id,text_types
4,25074,DE-FROCKING,de-frock,VVG,mrw,met,1267,a6u-fragment02,a6u
13,25083,witnessed,witness,VVN,mrw,met,1269,a6u-fragment02,a6u
20,25090,attitudes,attitude,NN2,mrw,met,1269,a6u-fragment02,a6u
37,25107,area,area,NN1,mrw,met,1270,a6u-fragment02,a6u
39,25109,rich,rich,AJ0,mrw,met,1270,a6u-fragment02,a6u


In [12]:
sum(labels)/len(labels)

0.15252905549189721

In [13]:
X, y = shuffle(sparse_unigram_features, labels, random_state=0)
X

<30545x8329 sparse matrix of type '<class 'numpy.int32'>'
	with 460517 stored elements in Compressed Sparse Row format>

In [14]:
clf = LogisticRegression(verbose=10, random_state=0, C=10, penalty='l1',
                         solver='liblinear', class_weight='balanced', max_iter=500)

In [15]:
# hyperparameters = {'C':[1, 5, 10, 20, 100], 'penalty':['l1', 'l2']}
# from sklearn.model_selection import GridSearchCV
# clf = GridSearchCV(LogisticRegression(class_weight='balanced', verbose=10), hyperparameters,
#                                       verbose=10, cv=10, scoring='f1')
# clf.fit(X_train, y_train)

In [16]:
scoring = ['accuracy', 'precision', 'recall', 'f1']
scores = cross_validate(clf, scoring=scoring, X=X, y=y, cv=10, verbose=10)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[LibLinear][CV]  , accuracy=0.8693944353518822, precision=0.5607985480943739, recall=0.6630901287553648, f1=0.6076696165191741, total=   9.9s
[CV]  ................................................................
[LibLinear]

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.0s remaining:    0.0s


[CV]  , accuracy=0.872013093289689, precision=0.5688073394495413, recall=0.6652360515021459, f1=0.6132542037586548, total=   7.9s
[CV]  ................................................................
[LibLinear]

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   18.0s remaining:    0.0s


[CV]  , accuracy=0.8648117839607201, precision=0.5454545454545454, recall=0.6824034334763949, f1=0.6062917063870352, total=   7.2s
[CV]  ................................................................
[LibLinear]

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   25.4s remaining:    0.0s


[CV]  , accuracy=0.8628477905073649, precision=0.5435992578849722, recall=0.628755364806867, f1=0.5830845771144278, total=   4.8s
[CV]  ................................................................
[LibLinear]

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   30.4s remaining:    0.0s


[CV]  , accuracy=0.8677577741407528, precision=0.5607843137254902, recall=0.6137339055793991, f1=0.5860655737704917, total=   4.9s
[CV]  ................................................................
[LibLinear]

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   35.4s remaining:    0.0s


[CV]  , accuracy=0.8664484451718494, precision=0.5516014234875445, recall=0.6652360515021459, f1=0.603112840466926, total=   4.5s
[CV]  ................................................................
[LibLinear]

[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   40.0s remaining:    0.0s


[CV]  , accuracy=0.8650949574328749, precision=0.5490909090909091, recall=0.648068669527897, f1=0.594488188976378, total=   5.1s
[CV]  ................................................................
[LibLinear]

[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   45.3s remaining:    0.0s


[CV]  , accuracy=0.8755730189914865, precision=0.5817490494296578, recall=0.6566523605150214, f1=0.6169354838709677, total=   5.0s
[CV]  ................................................................
[LibLinear]

[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   50.4s remaining:    0.0s


[CV]  , accuracy=0.8654223968565815, precision=0.5491949910554562, recall=0.6587982832618026, f1=0.5990243902439025, total=   4.1s
[CV]  ................................................................
[LibLinear]

[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   54.6s remaining:    0.0s


[CV]  , accuracy=0.860792662954471, precision=0.5332225913621262, recall=0.6903225806451613, f1=0.6016869728209934, total=   4.3s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   59.1s finished


In [17]:
clf.fit(X, y)

[LibLinear]

LogisticRegression(C=10, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=500,
          multi_class='warn', n_jobs=None, penalty='l1', random_state=0,
          solver='liblinear', tol=0.0001, verbose=10, warm_start=False)

In [18]:
scores



{'fit_time': array([9.99348378, 8.00513864, 7.31442618, 4.92084098, 4.95672727,
        4.58477211, 5.24496722, 5.07560992, 4.16706419, 4.43465042]),
 'score_time': array([0.01197028, 0.00997376, 0.00602484, 0.00797224, 0.00598478,
        0.00498891, 0.00502777, 0.00698352, 0.00698185, 0.00702095]),
 'test_accuracy': array([0.86939444, 0.87201309, 0.86481178, 0.86284779, 0.86775777,
        0.86644845, 0.86509496, 0.87557302, 0.8654224 , 0.86079266]),
 'train_accuracy': array([0.96991633, 0.96722445, 0.96762459, 0.96995271, 0.96755184,
        0.96784285, 0.96744389, 0.96744389, 0.96999018, 0.96857268]),
 'test_precision': array([0.56079855, 0.56880734, 0.54545455, 0.54359926, 0.56078431,
        0.55160142, 0.54909091, 0.58174905, 0.54919499, 0.53322259]),
 'train_precision': array([0.83958838, 0.82762739, 0.82931206, 0.84030726, 0.82911266,
        0.83109866, 0.82861698, 0.82901038, 0.84006462, 0.83326661]),
 'test_recall': array([0.66309013, 0.66523605, 0.68240343, 0.62875536, 0.6

In [19]:
sum(scores['test_f1']/len(scores['test_f1']))

0.6011613553928952

In [20]:
clf.predict(X)

from sklearn.metrics import classification_report
print(classification_report(y, clf.predict(X)))

              precision    recall  f1-score   support

       False       1.00      0.96      0.98     25886
        True       0.82      0.99      0.90      4659

   micro avg       0.97      0.97      0.97     30545
   macro avg       0.91      0.98      0.94     30545
weighted avg       0.97      0.97      0.97     30545

