In [1]:
import pandas as pd

# Load the main labelled dataset and keep only rows with a usable label.
# The raw frame keeps its own name so this cell can be re-run safely.
oleh_raw = pd.read_csv('dataset.csv')

# .copy() detaches the filtered rows from the raw frame; without it, the later
# in-place assignment to the 'sentence' column triggers SettingWithCopyWarning.
oleh_dataset = oleh_raw[oleh_raw['label'] != 'unknown'].copy()

# Class balance of the remaining rows (displayed as the cell output).
oleh_dataset['label'].value_counts()

tell    922
show    405
Name: label, dtype: int64

In [2]:
# Build the "scraped" evaluation set from two plain-text files: every line of
# show-validation.txt becomes a row labelled 'show', every line of
# tell-validation.txt a row labelled 'tell'.
with open('show-validation.txt', 'r') as f:
    show_sents = f.readlines()
    
with open('tell-validation.txt', 'r') as f:
    tell_sents = f.readlines()
    
scraped_dataset = pd.DataFrame({'sentence': show_sents + tell_sents,
                                'label': ['show'] * len(show_sents) + ['tell'] * len(tell_sents)})

# readlines() keeps the line terminators, so strip surrounding whitespace here.
scraped_dataset['sentence'] = scraped_dataset['sentence'].str.strip()

# Class balance of this set (displayed as the cell output).
scraped_dataset['label'].value_counts()

show    40
tell    26
Name: label, dtype: int64

In [3]:
# Build Katia's evaluation set the same way: each line of katia-show.txt is a
# 'show' row, each line of katia-tell.txt a 'tell' row.
# Note: show_sents / tell_sents are rebound here and no longer hold the
# validation-file lines read earlier.
with open('katia-show.txt', 'r') as f:
    show_sents = f.readlines()
    
with open('katia-tell.txt', 'r') as f:
    tell_sents = f.readlines()

katia_dataset = pd.DataFrame({'sentence': show_sents + tell_sents,
                              'label': ['show'] * len(show_sents) + ['tell'] * len(tell_sents)})

# readlines() keeps the line terminators, so strip surrounding whitespace here.
katia_dataset['sentence'] = katia_dataset['sentence'].str.strip()

# Class balance of this set (displayed as the cell output).
katia_dataset['label'].value_counts()

tell    118
show     15
Name: label, dtype: int64

# Катя надіслала розмічені речення, тому перерахую бейзлайн

In [4]:
import spacy
# Load the spaCy pipeline "en_core_web_md"; `nlp` is applied to every sentence below.
nlp = spacy.load("en_core_web_md")

In [5]:
# Run every sentence through spaCy, replacing the raw strings in the 'sentence'
# column with whatever nlp(...) returns.
# NOTE(review): the column is overwritten in place, so re-running this cell feeds
# the already-parsed objects back into nlp — run it once per fresh kernel.
oleh_dataset['sentence'] = oleh_dataset['sentence'].apply(nlp)
scraped_dataset['sentence'] = scraped_dataset['sentence'].apply(nlp)
katia_dataset['sentence'] = katia_dataset['sentence'].apply(nlp)

In [6]:
from sklearn.model_selection import train_test_split
# Split the main dataset into train and test parts. No test_size is given, so the
# library default applies; random_state=42 makes the split reproducible.
X_oleh_train, X_oleh_test, y_oleh_train, y_oleh_test = train_test_split(oleh_dataset['sentence'], oleh_dataset['label'], random_state=42)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

def tokenize(model):
    """Return the verbatim ``.text`` of every token in ``model``, in order.

    ``model`` is any iterable of token-like objects; in this notebook the
    function is passed to CountVectorizer as its tokenizer.
    """
    texts = []
    for token in model:
        texts.append(token.text)
    return texts

def lemmatize(model):
    """Return the ``.lemma_`` of every token in ``model``, in order.

    Alternative tokenizer for CountVectorizer that counts lemmas instead of
    surface forms.
    """
    lemmas = []
    for token in model:
        lemmas.append(token.lemma_)
    return lemmas

def make_baseline_clf():
    """Build the untrained baseline: token counts fed into Multinomial Naive Bayes.

    The vectorizer is created with ``lowercase=False`` and ``token_pattern=None``
    and without a tokenizer; callers are expected to supply one afterwards, e.g.
    ``clf.set_params(vect__tokenizer=tokenize)`` or through a grid-search grid.
    """
    vectorizer = CountVectorizer(lowercase=False, token_pattern=None)
    naive_bayes = MultinomialNB()
    steps = [('vect', vectorizer), ('nb', naive_bayes)]
    return Pipeline(steps)

def validation_report(clf):
    """Print a classification report for ``clf`` on every evaluation set.

    Reports are printed, in order, for the held-out part of the main dataset
    ("Oleh"), the scraped set, Katia's set and finally the concatenation of all
    three ("All"). The data is read from the notebook globals ``X_oleh_test``,
    ``y_oleh_test``, ``scraped_dataset`` and ``katia_dataset``.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    """
    named_sets = [
        ('Oleh:', X_oleh_test, y_oleh_test),
        ('Scraped:', scraped_dataset['sentence'], scraped_dataset['label']),
        ('Katia:', katia_dataset['sentence'], katia_dataset['label']),
    ]

    for heading, sentences, labels in named_sets:
        print(heading)
        print(classification_report(labels, clf.predict(sentences)))
        # Three empty lines visually separate consecutive reports.
        for _ in range(3):
            print('')

    # Overall score: labels and sentences are concatenated in the same set order.
    combined_labels = pd.concat([entry[2] for entry in named_sets])
    combined_sentences = pd.concat([entry[1] for entry in named_sets])
    print('All:')
    print(classification_report(combined_labels, clf.predict(combined_sentences)))

In [8]:
# Baseline with default hyper-parameters: counts of raw token texts + Naive Bayes.
clf = make_baseline_clf()
clf.set_params(vect__tokenizer=tokenize)

# Train on the training split only, then print reports for all evaluation sets.
clf.fit(X_oleh_train, y_oleh_train)
validation_report(clf)

Oleh:
              precision    recall  f1-score   support

        show       0.74      0.35      0.48       110
        tell       0.75      0.94      0.83       222

    accuracy                           0.74       332
   macro avg       0.74      0.65      0.65       332
weighted avg       0.74      0.74      0.71       332




Scraped:
              precision    recall  f1-score   support

        show       0.75      0.15      0.25        40
        tell       0.41      0.92      0.57        26

    accuracy                           0.45        66
   macro avg       0.58      0.54      0.41        66
weighted avg       0.62      0.45      0.38        66




Katia:
              precision    recall  f1-score   support

        show       0.11      0.13      0.12        15
        tell       0.89      0.86      0.88       118

    accuracy                           0.78       133
   macro avg       0.50      0.50      0.50       133
weighted avg       0.80      0.78      0.79   

In [9]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the baseline pipeline:
#   vect__ngram_range - which token n-gram orders the vectorizer counts;
#   vect__tokenizer   - raw token texts vs. lemmas;
#   nb__alpha         - Naive Bayes smoothing strength;
#   nb__fit_prior     - whether class priors are learned from the data.
parameter_grid = [{'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)],
                   'vect__tokenizer': [tokenize, lemmatize],
                   'nb__alpha': [1e-10, 0.001, 0.01, 0.05, 0.1, 0.3, 0.5, 1],
                   'nb__fit_prior': [True, False],
                  }]

# Pick the combination with the best macro-averaged F1; cross-validation settings
# are left at the library defaults and only the training split is used.
gs_clf = GridSearchCV(make_baseline_clf(), parameter_grid, scoring='f1_macro')
gs_clf.fit(X_oleh_train, y_oleh_train)
# Displayed as the cell output: the winning parameter combination.
gs_clf.best_params_

{'nb__alpha': 0.3,
 'nb__fit_prior': False,
 'vect__ngram_range': (1, 1),
 'vect__tokenizer': <function __main__.tokenize(model)>}

In [10]:
# Evaluate the grid-search result on all evaluation sets.
validation_report(gs_clf)

Oleh:
              precision    recall  f1-score   support

        show       0.56      0.58      0.57       110
        tell       0.79      0.77      0.78       222

    accuracy                           0.71       332
   macro avg       0.67      0.68      0.67       332
weighted avg       0.71      0.71      0.71       332




Scraped:
              precision    recall  f1-score   support

        show       0.81      0.33      0.46        40
        tell       0.46      0.88      0.61        26

    accuracy                           0.55        66
   macro avg       0.64      0.60      0.53        66
weighted avg       0.67      0.55      0.52        66




Katia:
              precision    recall  f1-score   support

        show       0.16      0.40      0.23        15
        tell       0.91      0.73      0.81       118

    accuracy                           0.69       133
   macro avg       0.53      0.56      0.52       133
weighted avg       0.82      0.69      0.74   

# Покращена версія

In [11]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

def ds_func(f):
    """Lift a per-sample function ``f`` into one that maps over a whole dataset.

    The returned callable takes an iterable ``X`` and returns a list holding
    ``f(x)`` for every element, in order.
    """
    def apply_to_dataset(X):
        return list(map(f, X))

    return apply_to_dataset

def combine_extractors(funcs):
    """Merge several feature extractors into a single one.

    Each extractor maps one sample to a dict of features. The combined extractor
    calls them in the given order and merges the dicts; when two extractors emit
    the same key, the later one wins.
    """
    def combined(x):
        merged = dict()
        for extractor in funcs:
            extracted = extractor(x)
            merged.update(extracted)
        return merged

    return combined

def make_rfc_classifier(*feature_extractors):
    """Build an untrained pipeline: feature dicts -> DictVectorizer -> Random Forest.

    Parameters
    ----------
    *feature_extractors : callables mapping one sample to a dict of features.
        Their outputs are merged per sample (later extractors win on key clashes).

    Returns
    -------
    Pipeline with steps 'extractor', 'dict_vect' and 'rfc' (seeded with 42).
    """
    dataset_extractor = ds_func(combine_extractors(feature_extractors))

    steps = [
        ('extractor', FunctionTransformer()),
        ('dict_vect', DictVectorizer()),
        ('rfc', RandomForestClassifier(random_state=42)),
    ]
    pipeline = Pipeline(steps)
    # The extraction function is injected through the pipeline's parameter API.
    pipeline.set_params(extractor__func=dataset_extractor)

    return pipeline

def make_lrc_classifier(*feature_extractors):
    """Build an untrained pipeline: feature dicts -> DictVectorizer -> Logistic Regression.

    Parameters
    ----------
    *feature_extractors : callables mapping one sample to a dict of features.
        Their outputs are merged per sample (later extractors win on key clashes).

    Returns
    -------
    Pipeline with steps 'extractor', 'dict_vect' and 'lrc'. The regression uses the
    'sag' solver, the 'multinomial' setting, seed 42 and up to 5000 iterations.

    NOTE(review): ``multi_class`` is reported as deprecated in recent scikit-learn
    releases — confirm against the installed version.
    """
    dataset_extractor = ds_func(combine_extractors(feature_extractors))

    steps = [
        ('extractor', FunctionTransformer()),
        ('dict_vect', DictVectorizer()),
        ('lrc', LogisticRegression()),
    ]
    pipeline = Pipeline(steps)
    pipeline.set_params(
        lrc__random_state=42,
        lrc__solver='sag',
        lrc__multi_class='multinomial',
        lrc__max_iter=5000,
        extractor__func=dataset_extractor,
    )

    return pipeline

## Витягую частотність POS- і DEP-тегів
### Тут використовую Random Forest, бо логістична регресія видає всюди нулі. Певно, ці значення не fit-яться лінійною моделлю 🤔

In [12]:
from collections import Counter

def extract_pos_freqs(doc):
    """Return the share of tokens carrying each part-of-speech tag.

    Keys are ``'<POS>_num'``; values are the tag count divided by the number of
    tokens in ``doc``. Tags that do not occur are absent, and an empty ``doc``
    yields an empty dict.
    """
    n_tokens = len(doc)
    tag_counts = Counter(token.pos_ for token in doc)

    shares = {}
    for tag, count in tag_counts.items():
        shares[tag + '_num'] = count / n_tokens
    return shares

# Quick check on a short sentence; each value is count / number of tokens.
extract_pos_freqs(nlp('I like cats very much.'))

{'PRON_num': 0.16666666666666666,
 'VERB_num': 0.16666666666666666,
 'NOUN_num': 0.16666666666666666,
 'ADV_num': 0.3333333333333333,
 'PUNCT_num': 0.16666666666666666}

In [13]:
# Random Forest on part-of-speech tag shares only.
clf = make_rfc_classifier(extract_pos_freqs)
clf.fit(X_oleh_train, y_oleh_train)
validation_report(clf)

Oleh:
              precision    recall  f1-score   support

        show       0.61      0.25      0.35       110
        tell       0.71      0.92      0.80       222

    accuracy                           0.70       332
   macro avg       0.66      0.58      0.58       332
weighted avg       0.68      0.70      0.65       332




Scraped:
              precision    recall  f1-score   support

        show       1.00      0.17      0.30        40
        tell       0.44      1.00      0.61        26

    accuracy                           0.50        66
   macro avg       0.72      0.59      0.45        66
weighted avg       0.78      0.50      0.42        66




Katia:
              precision    recall  f1-score   support

        show       0.06      0.07      0.06        15
        tell       0.88      0.87      0.88       118

    accuracy                           0.78       133
   macro avg       0.47      0.47      0.47       133
weighted avg       0.79      0.78      0.79   

In [14]:
def extract_dep_freqs(doc):
    """Return the share of tokens carrying each dependency label.

    Keys are ``'<dep>_num'``; values are the label count divided by the number
    of tokens in ``doc``. Labels that do not occur are absent, and an empty
    ``doc`` yields an empty dict.
    """
    n_tokens = len(doc)
    label_counts = Counter(token.dep_ for token in doc)

    shares = {}
    for label, count in label_counts.items():
        shares[label + '_num'] = count / n_tokens
    return shares

In [15]:
# Random Forest on dependency-label shares only.
clf = make_rfc_classifier(extract_dep_freqs)
clf.fit(X_oleh_train, y_oleh_train)
validation_report(clf)

Oleh:
              precision    recall  f1-score   support

        show       0.50      0.25      0.34       110
        tell       0.70      0.87      0.78       222

    accuracy                           0.67       332
   macro avg       0.60      0.56      0.56       332
weighted avg       0.64      0.67      0.63       332




Scraped:
              precision    recall  f1-score   support

        show       1.00      0.17      0.30        40
        tell       0.44      1.00      0.61        26

    accuracy                           0.50        66
   macro avg       0.72      0.59      0.45        66
weighted avg       0.78      0.50      0.42        66




Katia:
              precision    recall  f1-score   support

        show       0.14      0.13      0.14        15
        tell       0.89      0.90      0.89       118

    accuracy                           0.81       133
   macro avg       0.52      0.52      0.52       133
weighted avg       0.81      0.81      0.81   

## Від комбінування цих фіч якість особливо не покращується :(

In [17]:
# Random Forest on both feature groups: part-of-speech shares and dependency-label shares.
clf = make_rfc_classifier(extract_pos_freqs, extract_dep_freqs)
clf.fit(X_oleh_train, y_oleh_train)
validation_report(clf)

Oleh:
              precision    recall  f1-score   support

        show       0.56      0.26      0.36       110
        tell       0.71      0.90      0.79       222

    accuracy                           0.69       332
   macro avg       0.63      0.58      0.58       332
weighted avg       0.66      0.69      0.65       332




Scraped:
              precision    recall  f1-score   support

        show       1.00      0.15      0.26        40
        tell       0.43      1.00      0.60        26

    accuracy                           0.48        66
   macro avg       0.72      0.57      0.43        66
weighted avg       0.78      0.48      0.40        66




Katia:
              precision    recall  f1-score   support

        show       0.07      0.07      0.07        15
        tell       0.88      0.88      0.88       118

    accuracy                           0.79       133
   macro avg       0.47      0.47      0.47       133
weighted avg       0.79      0.79      0.79   

### Збираю інформацію про головні підмет і присудок.

In [18]:
# Emotion lexicon: one entry per line of the file, surrounding whitespace removed.
with open('emotions-dict.txt', 'r') as f:
    emotions = {line.strip() for line in f}

In [19]:
# Abstract-word lexicon: one entry per line of the file, surrounding whitespace removed.
with open('abstract.txt', 'r') as f:
    abstracts = {line.strip() for line in f}

In [590]:
# Number of distinct entries loaded from abstract.txt.
len(abstracts)

573

In [20]:
def find_main_token(doc):
    """Return the first token in ``doc`` whose dependency label is ``'ROOT'``.

    Raises
    ------
    IndexError
        If no token is labelled ``'ROOT'`` (for example, ``doc`` is empty).
    """
    roots = list(filter(lambda token: token.dep_ == 'ROOT', doc))
    return roots[0]

def extract_subj_verb(doc):
    """Describe the root verb of ``doc`` and its nominal subject as a feature dict.

    When the ROOT token is not tagged ``'VERB'`` an empty dict is returned.
    Otherwise the dict holds the root's text, POS tag and lemma under the
    ``main-*`` keys. If some token is an ``nsubj`` attached directly to the root,
    the first such token adds the matching ``subj-*`` keys.

    Note: ``main-pos`` is always ``'VERB'`` here because of the guard.
    """
    root = find_main_token(doc)
    if root.pos_ != 'VERB':
        return {}

    feats = {
        'main-word': root.text,
        'main-pos': root.pos_,
        'main-lemma': root.lemma_,
    }

    # First token that is a nominal subject whose head is the ROOT token.
    subj = next(
        (tok for tok in doc if tok.head.dep_ == 'ROOT' and tok.dep_ == 'nsubj'),
        None,
    )
    if subj:
        feats['subj-word'] = subj.text
        feats['subj-pos'] = subj.pos_
        feats['subj-lemma'] = subj.lemma_

    return feats

### Якість на рівні з попередніми класифікаторами

In [21]:
# Logistic regression on the root-verb / subject features only.
clf = make_lrc_classifier(extract_subj_verb)
clf.fit(X_oleh_train, y_oleh_train)
validation_report(clf)

Oleh:
              precision    recall  f1-score   support

        show       0.63      0.35      0.45       110
        tell       0.74      0.90      0.81       222

    accuracy                           0.72       332
   macro avg       0.68      0.62      0.63       332
weighted avg       0.70      0.72      0.69       332




Scraped:
              precision    recall  f1-score   support

        show       1.00      0.07      0.14        40
        tell       0.41      1.00      0.58        26

    accuracy                           0.44        66
   macro avg       0.71      0.54      0.36        66
weighted avg       0.77      0.44      0.31        66




Katia:
              precision    recall  f1-score   support

        show       0.11      0.13      0.12        15
        tell       0.89      0.86      0.88       118

    accuracy                           0.78       133
   macro avg       0.50      0.50      0.50       133
weighted avg       0.80      0.78      0.79   

### Контекст підмету і присудка. Якість в порівнянні з RF трішки покращилася

In [81]:
def ctx(x, size, check_important=True):
    """Collect a fixed-size window of context tokens on each side of token ``x``.

    Parameters
    ----------
    x : token-like object exposing ``.doc`` (the sliceable parent sequence) and
        ``.i`` (its index in that sequence).
    size : int
        Number of context slots on each side; must be non-negative.
    check_important : bool, default True
        When true, only tokens for which ``is_important(tok)`` holds are eligible;
        other tokens are skipped and do not consume a slot.

    Returns
    -------
    (left_ctx, right_ctx) : tuple of two lists, each of length ``size``.
        Both are ordered from nearest to farthest from ``x``; missing positions
        are filled with ``None``.

    Raises
    ------
    ValueError
        If ``size`` is negative.
    """
    if size < 0:
        raise ValueError('size must be non-negative')

    def eligible(tok):
        # Short-circuits, so is_important is never called when the check is disabled.
        return not check_important or is_important(tok)

    lefts = [tok for tok in x.doc[:x.i] if eligible(tok)]
    rights = [tok for tok in x.doc[x.i + 1:] if eligible(tok)]

    # lefts[-size:] would return the WHOLE list for size == 0 (because -0 == 0),
    # so the start index is computed explicitly.
    left_ctx = lefts[max(len(lefts) - size, 0):]
    right_ctx = rights[:size]

    # Pad to exactly `size` entries; multiplying a list by a value <= 0 gives [].
    left_ctx = [None] * (size - len(left_ctx)) + left_ctx
    right_ctx = right_ctx + [None] * (size - len(right_ctx))

    # Reverse the left side so index 0 is the token closest to x on both sides.
    return list(reversed(left_ctx)), right_ctx

# Quick check: the context window around the third token ('likes'), four slots per side.
ctx(nlp('My mom likes cats very much')[2], 4)

([mom, None, None, None], [cats, None, None, None])

In [502]:
# from sklearn.preprocessing import KBinsDiscretizer

# def retrieve_unigrams(ngrams):
#     return {k:v for k, v in ngrams.items() if len(k.split(' ')) == 1}

# unigrams = retrieve_unigrams(ngrams)
# discretizer = KBinsDiscretizer(encode='ordinal', strategy='uniform', n_bins=8)
# discretizer.fit([[sum(val)] for val in unigrams.values()])

# def unigram_freq_discr(word):
#     return discretizer.transform([[sum(get_freqs([word]))]])[0][0]

In [574]:
# Exploration: synsets for 'nice' restricted to the 'a' part of speech.
# NOTE(review): `wn` is not defined in the visible cells — presumably WordNet
# imported elsewhere; confirm before a fresh-kernel run.
wn.synsets('nice', 'a')

[Synset('nice.a.01'),
 Synset('decent.s.01'),
 Synset('nice.s.03'),
 Synset('dainty.s.04'),
 Synset('courteous.s.01')]

In [579]:
def pos_synsets(lemma, pos):
    """Look up synsets for ``lemma``, restricted to the given part of speech.

    ``pos`` is a tag such as ``'NOUN'``, ``'VERB'``, ``'ADV'`` or ``'ADJ'``; it is
    translated to the single-letter code passed to ``wn.synsets``. Any other tag
    yields an empty list without touching ``wn``.
    """
    wordnet_codes = {'NOUN': 'n', 'VERB': 'v', 'ADV': 'r', 'ADJ': 'a'}
    code = wordnet_codes.get(pos)
    if code is None:
        return []
    return wn.synsets(lemma, code)

# Quick check: the 'ADJ' tag should give the same result as querying with 'a' directly.
pos_synsets('nice', 'ADJ')

[Synset('nice.a.01'),
 Synset('decent.s.01'),
 Synset('nice.s.03'),
 Synset('dainty.s.04'),
 Synset('courteous.s.01')]

In [82]:
def extract_subj_verb_ctx(doc):
    feats = {}
    main = find_main_token(doc)
    
    if main.pos_ == 'VERB':
        left_ctx, right_ctx = ctx(main, 3)

#         feats['main-4-word'] = left_ctx[3].text if left_ctx[3] else '<<<none>>>'
#         feats['main-4-pos'] = left_ctx[3].pos_ if left_ctx[3] else '<<<none>>>'
#         feats['main-4-lemma'] = left_ctx[3].lemma_ if left_ctx[3] else '<<<none>>>'
        feats['main-3-word'] = left_ctx[2].lower_ if left_ctx[2] else '<<<none>>>'
        feats['main-2-word'] = left_ctx[1].lower_ if left_ctx[1] else '<<<none>>>'
        feats['main-1-word'] = left_ctx[0].lower_ if left_ctx[0] else '<<<none>>>'
        feats['main+1-word'] = right_ctx[0].lower_ if right_ctx[0] else '<<<none>>>'
        feats['main+2-word'] = right_ctx[1].lower_ if right_ctx[1] else '<<<none>>>'
        feats['main+3-word'] = right_ctx[2].lower_ if right_ctx[2] else '<<<none>>>'
        
        
        feats['main-3-pos'] = left_ctx[2].pos_ if left_ctx[2] else '<<<none>>>'
        feats['main-2-pos'] = left_ctx[1].pos_ if left_ctx[1] else '<<<none>>>'
        feats['main-1-pos'] = left_ctx[0].pos_ if left_ctx[0] else '<<<none>>>'
        feats['main+1-pos'] = right_ctx[0].pos_ if right_ctx[0] else '<<<none>>>'
        feats['main+2-pos'] = right_ctx[1].pos_ if right_ctx[1] else '<<<none>>>'
        feats['main+3-pos'] = right_ctx[2].pos_ if right_ctx[2] else '<<<none>>>'
        
        feats['main-3-lemma'] = left_ctx[2].lemma_ if left_ctx[2] else '<<<none>>>'
        feats['main-2-lemma'] = left_ctx[1].lemma_ if left_ctx[1] else '<<<none>>>'
        feats['main-1-lemma'] = left_ctx[0].lemma_ if left_ctx[0] else '<<<none>>>'
        feats['main+1-lemma'] = right_ctx[0].lemma_ if right_ctx[0] else '<<<none>>>'
        feats['main+2-lemma'] = right_ctx[1].lemma_ if right_ctx[1] else '<<<none>>>'
        feats['main+3-lemma'] = right_ctx[2].lemma_ if right_ctx[2] else '<<<none>>>'

        feats['main-3-is-emotion'] = left_ctx[2].lower_ in emotions if left_ctx[2] else False
        feats['main-2-is-emotion'] = left_ctx[1].lower_ in emotions if left_ctx[1] else False
        feats['main-1-is-emotion'] = left_ctx[0].lower_ in emotions if left_ctx[0] else False
        feats['main+1-is-emotion'] = right_ctx[0].lower_ in emotions if right_ctx[0] else False
        feats['main+2-is-emotion'] = right_ctx[1].lower_ in emotions if right_ctx[1] else False
        feats['main+3-is-emotion'] = right_ctx[2].lower_ in emotions if right_ctx[2] else False

        feats['main-3-is-emotion'] = left_ctx[2].lower_ in emotions if left_ctx[2] else False
        feats['main-2-is-emotion'] = left_ctx[1].lower_ in emotions if left_ctx[1] else False
        feats['main-1-is-emotion'] = left_ctx[0].lower_ in emotions if left_ctx[0] else False
        feats['main+1-is-emotion'] = right_ctx[0].lower_ in emotions if right_ctx[0] else False
        feats['main+2-is-emotion'] = right_ctx[1].lower_ in emotions if right_ctx[1] else False
        feats['main+3-is-emotion'] = right_ctx[2].lower_ in emotions if right_ctx[2] else False

#         feats['main-3-n-synonyms'] = len(pos_synsets(left_ctx[2].lemma_, left_ctx[2].pos_)) if left_ctx[2] else 0
#         feats['main-2-n-synonyms'] = len(pos_synsets(left_ctx[1].lemma_, left_ctx[1].pos_)) if left_ctx[1] else 0
#         feats['main-1-n-synonyms'] = len(pos_synsets(left_ctx[0].lemma_, left_ctx[0].pos_)) if left_ctx[0] else 0
#         feats['main+1-n-synonyms'] = len(pos_synsets(right_ctx[0].lemma_, right_ctx[0].pos_)) if right_ctx[0] else False
#         feats['main+2-n-synonyms'] = len(pos_synsets(right_ctx[1].lemma_, right_ctx[1].pos_)) if right_ctx[1] else False
#         feats['main+3-n-synonyms'] = len(pos_synsets(right_ctx[2].lemma_, right_ctx[2].pos_)) if right_ctx[2] else False

        feats['main-3-abstract'] = left_ctx[2].lower_ in abstracts if left_ctx[2] else False
        feats['main-2-abstract'] = left_ctx[1].lower_ in abstracts if left_ctx[1] else False
        feats['main-1-abstract'] = left_ctx[0].lower_ in abstracts if left_ctx[0] else False
        feats['main+1-abstract'] = right_ctx[0].lower_ in abstracts if right_ctx[0] else False
        feats['main+2-abstract'] = right_ctx[1].lower_ in abstracts if right_ctx[1] else False
        feats['main+3-abstract'] = right_ctx[2].lower_ in abstracts if right_ctx[2] else False


        mrc_word = mrc.get(left_ctx[2].text.upper()) if left_ctx[2] else None
        if mrc_word:
            feats['main-3-kf_freq'] = mrc_word.kf_freq if mrc_word.kf_freq else 0
            feats['main-3-kf_ncats'] = mrc_word.kf_ncats if mrc_word.kf_ncats else 0
            feats['main-3-kf_nsamp'] = mrc_word.kf_nsamp if mrc_word.kf_nsamp else 0
            feats['main-3-tl_freq'] = mrc_word.tl_freq if mrc_word.tl_freq else 0
            feats['main-3-brown_freq'] = mrc_word.brown_freq if mrc_word.brown_freq else 0
            feats['main-3-fam'] = mrc_word.fam if mrc_word.fam else 0
            feats['main-3-conc'] = mrc_word.conc if mrc_word.conc else 0
            feats['main-3-imag'] = mrc_word.imag if mrc_word.imag else 0
            feats['main-3-meanc'] = mrc_word.meanc if mrc_word.meanc else 0
            feats['main-3-meanp'] = mrc_word.meanp if mrc_word.meanp else 0

        mrc_word = mrc.get(left_ctx[1].text.upper()) if left_ctx[1] else None
        if mrc_word:
            feats['main-2-kf_freq'] = mrc_word.kf_freq if mrc_word.kf_freq else 0
            feats['main-2-kf_ncats'] = mrc_word.kf_ncats if mrc_word.kf_ncats else 0
            feats['main-2-kf_nsamp'] = mrc_word.kf_nsamp if mrc_word.kf_nsamp else 0
            feats['main-2-tl_freq'] = mrc_word.tl_freq if mrc_word.tl_freq else 0
            feats['main-2-brown_freq'] = mrc_word.brown_freq if mrc_word.brown_freq else 0
            feats['main-2-fam'] = mrc_word.fam if mrc_word.fam else 0
            feats['main-2-conc'] = mrc_word.conc if mrc_word.conc else 0
            feats['main-2-imag'] = mrc_word.imag if mrc_word.imag else 0
            feats['main-2-meanc'] = mrc_word.meanc if mrc_word.meanc else 0
            feats['main-2-meanp'] = mrc_word.meanp if mrc_word.meanp else 0

        mrc_word = mrc.get(left_ctx[0].text.upper()) if left_ctx[0] else None
        if mrc_word:
            feats['main-1-kf_freq'] = mrc_word.kf_freq if mrc_word.kf_freq else 0
            feats['main-1-kf_ncats'] = mrc_word.kf_ncats if mrc_word.kf_ncats else 0
            feats['main-1-kf_nsamp'] = mrc_word.kf_nsamp if mrc_word.kf_nsamp else 0
            feats['main-1-tl_freq'] = mrc_word.tl_freq if mrc_word.tl_freq else 0
            feats['main-1-brown_freq'] = mrc_word.brown_freq if mrc_word.brown_freq else 0
            feats['main-1-fam'] = mrc_word.fam if mrc_word.fam else 0
            feats['main-1-conc'] = mrc_word.conc if mrc_word.conc else 0
            feats['main-1-imag'] = mrc_word.imag if mrc_word.imag else 0
            feats['main-1-meanc'] = mrc_word.meanc if mrc_word.meanc else 0
            feats['main-1-meanp'] = mrc_word.meanp if mrc_word.meanp else 0

        mrc_word = mrc.get(right_ctx[0].text.upper()) if right_ctx[0] else None
        if mrc_word:
            feats['main+1-kf_freq'] = mrc_word.kf_freq if mrc_word.kf_freq else 0
            feats['main+1-kf_ncats'] = mrc_word.kf_ncats if mrc_word.kf_ncats else 0
            feats['main+1-kf_nsamp'] = mrc_word.kf_nsamp if mrc_word.kf_nsamp else 0
            feats['main+1-tl_freq'] = mrc_word.tl_freq if mrc_word.tl_freq else 0
            feats['main+1-brown_freq'] = mrc_word.brown_freq if mrc_word.brown_freq else 0
            feats['main+1-fam'] = mrc_word.fam if mrc_word.fam else 0
            feats['main+1-conc'] = mrc_word.conc if mrc_word.conc else 0
            feats['main+1-imag'] = mrc_word.imag if mrc_word.imag else 0
            feats['main+1-meanc'] = mrc_word.meanc if mrc_word.meanc else 0
            feats['main+1-meanp'] = mrc_word.meanp if mrc_word.meanp else 0

        mrc_word = mrc.get(right_ctx[1].text.upper()) if right_ctx[1] else None
        if mrc_word:
            feats['main+2-kf_freq'] = mrc_word.kf_freq if mrc_word.kf_freq else 0
            feats['main+2-kf_ncats'] = mrc_word.kf_ncats if mrc_word.kf_ncats else 0
            feats['main+2-kf_nsamp'] = mrc_word.kf_nsamp if mrc_word.kf_nsamp else 0
            feats['main+2-tl_freq'] = mrc_word.tl_freq if mrc_word.tl_freq else 0
            feats['main+2-brown_freq'] = mrc_word.brown_freq if mrc_word.brown_freq else 0
            feats['main+2-fam'] = mrc_word.fam if mrc_word.fam else 0
            feats['main+2-conc'] = mrc_word.conc if mrc_word.conc else 0
            feats['main+2-imag'] = mrc_word.imag if mrc_word.imag else 0
            feats['main+2-meanc'] = mrc_word.meanc if mrc_word.meanc else 0
            feats['main+2-meanp'] = mrc_word.meanp if mrc_word.meanp else 0

        mrc_word = mrc.get(right_ctx[2].text.upper()) if right_ctx[2] else None
        if mrc_word:
            feats['main+3-kf_freq'] = mrc_word.kf_freq if mrc_word.kf_freq else 0
            feats['main+3-kf_ncats'] = mrc_word.kf_ncats if mrc_word.kf_ncats else 0
            feats['main+3-kf_nsamp'] = mrc_word.kf_nsamp if mrc_word.kf_nsamp else 0
            feats['main+3-tl_freq'] = mrc_word.tl_freq if mrc_word.tl_freq else 0
            feats['main+3-brown_freq'] = mrc_word.brown_freq if mrc_word.brown_freq else 0
            feats['main+3-fam'] = mrc_word.fam if mrc_word.fam else 0
            feats['main+3-conc'] = mrc_word.conc if mrc_word.conc else 0
            feats['main+3-imag'] = mrc_word.imag if mrc_word.imag else 0
            feats['main+3-meanc'] = mrc_word.meanc if mrc_word.meanc else 0
            feats['main+3-meanp'] = mrc_word.meanp if mrc_word.meanp else 0



#         feats['main-3-freq-dicr'] = unigram_freq_discr(left_ctx[2].text) if left_ctx[2] else 0
#         feats['main-2-freq-dicr'] = unigram_freq_discr(left_ctx[1].text) if left_ctx[1] else 0
#         feats['main-1-freq-dicr'] = unigram_freq_discr(left_ctx[0].text) if left_ctx[0] else 0
#         feats['main+1-freq-dicr'] = unigram_freq_discr(right_ctx[0].text) if right_ctx[0] else 0
#         feats['main+2-freq-dicr'] = unigram_freq_discr(right_ctx[1].text) if right_ctx[1] else 0
#         feats['main+3-freq-dicr'] = unigram_freq_discr(right_ctx[2].text) if right_ctx[2] else 0

#         feats['main+4-word'] = right_ctx[3].text if right_ctx[3] else '<<<none>>>'
#         feats['main+4-pos'] = right_ctx[3].pos_ if right_ctx[3] else '<<<none>>>'
#         feats['main+4-lemma'] = right_ctx[3].lemma_ if right_ctx[3] else '<<<none>>>'
        
        subj = None
        for tok in doc:
            if tok.head.dep_ == 'ROOT' and tok.dep_ == 'nsubj':
                subj = tok
                break
        if subj:
            left_ctx, right_ctx = ctx(subj, 3)
#             feats['subj-4-word'] = left_ctx[3].text if left_ctx[3] else '<<<none>>>'
#             feats['subj-4-pos'] = left_ctx[3].pos_ if left_ctx[3] else '<<<none>>>'
#             feats['subj-4-lemma'] = left_ctx[3].lemma_ if left_ctx[3] else '<<<none>>>'
            
            feats['subj-3-word'] = left_ctx[2].lower_ if left_ctx[2] else '<<<none>>>'
            feats['subj-2-word'] = left_ctx[1].lower_ if left_ctx[1] else '<<<none>>>'
            feats['subj-1-word'] = left_ctx[0].lower_ if left_ctx[0] else '<<<none>>>'
            feats['subj+1-word'] = right_ctx[0].lower_ if right_ctx[0] else '<<<none>>>'
            feats['subj+2-word'] = right_ctx[1].lower_ if right_ctx[1] else '<<<none>>>'
            feats['subj+3-word'] = right_ctx[2].lower_ if right_ctx[2] else '<<<none>>>'
            
            feats['subj-3-pos'] = left_ctx[2].pos_ if left_ctx[2] else '<<<none>>>'
            feats['subj-2-pos'] = left_ctx[1].pos_ if left_ctx[1] else '<<<none>>>'
            feats['subj-1-pos'] = left_ctx[0].pos_ if left_ctx[0] else '<<<none>>>'
            feats['subj+1-pos'] = right_ctx[0].pos_ if right_ctx[0] else '<<<none>>>'
            feats['subj+2-pos'] = right_ctx[1].pos_ if right_ctx[1] else '<<<none>>>'
            feats['subj+3-pos'] = right_ctx[2].pos_ if right_ctx[2] else '<<<none>>>'
            
            feats['subj-3-lemma'] = left_ctx[2].lemma_ if left_ctx[2] else '<<<none>>>'
            feats['subj-2-lemma'] = left_ctx[1].lemma_ if left_ctx[1] else '<<<none>>>'
            feats['subj-1-lemma'] = left_ctx[0].lemma_ if left_ctx[0] else '<<<none>>>'
            feats['subj+1-lemma'] = right_ctx[0].lemma_ if right_ctx[0] else '<<<none>>>'
            feats['subj+2-lemma'] = right_ctx[1].lemma_ if right_ctx[1] else '<<<none>>>'
            feats['subj+3-lemma'] = right_ctx[2].lemma_ if right_ctx[2] else '<<<none>>>'

            feats['subj-3-is-emotion'] = left_ctx[2].lower_ in emotions if left_ctx[2] else False
            feats['subj-2-is-emotion'] = left_ctx[1].lower_ in emotions if left_ctx[1] else False
            feats['subj-1-is-emotion'] = left_ctx[0].lower_ in emotions if left_ctx[0] else False
            feats['subj+1-is-emotion'] = right_ctx[0].lower_ in emotions if right_ctx[0] else False
            feats['subj+2-is-emotion'] = right_ctx[1].lower_ in emotions if right_ctx[1] else False
            feats['subj+3-is-emotion'] = right_ctx[2].lower_ in emotions if right_ctx[2] else False

            feats['subj-3-abstract'] = left_ctx[2].lower_ in abstracts if left_ctx[2] else False
            feats['subj-2-abstract'] = left_ctx[1].lower_ in abstracts if left_ctx[1] else False
            feats['subj-1-abstract'] = left_ctx[0].lower_ in abstracts if left_ctx[0] else False
            feats['subj+1-abstract'] = right_ctx[0].lower_ in abstracts if right_ctx[0] else False
            feats['subj+2-abstract'] = right_ctx[1].lower_ in abstracts if right_ctx[1] else False
            feats['subj+3-abstract'] = right_ctx[2].lower_ in abstracts if right_ctx[2] else False

            mrc_word = mrc.get(left_ctx[2].text.upper()) if left_ctx[2] else None
            if mrc_word:
                feats['subj-3-kf_freq'] = mrc_word.kf_freq if mrc_word.kf_freq else 0
                feats['subj-3-kf_ncats'] = mrc_word.kf_ncats if mrc_word.kf_ncats else 0
                feats['subj-3-kf_nsamp'] = mrc_word.kf_nsamp if mrc_word.kf_nsamp else 0
                feats['subj-3-tl_freq'] = mrc_word.tl_freq if mrc_word.tl_freq else 0
                feats['subj-3-brown_freq'] = mrc_word.brown_freq if mrc_word.brown_freq else 0
                feats['subj-3-fam'] = mrc_word.fam if mrc_word.fam else 0
                feats['subj-3-conc'] = mrc_word.conc if mrc_word.conc else 0
                feats['subj-3-imag'] = mrc_word.imag if mrc_word.imag else 0
                feats['subj-3-meanc'] = mrc_word.meanc if mrc_word.meanc else 0
                feats['subj-3-meanp'] = mrc_word.meanp if mrc_word.meanp else 0

            mrc_word = mrc.get(left_ctx[1].text.upper()) if left_ctx[1] else None
            if mrc_word:
                feats['subj-2-kf_freq'] = mrc_word.kf_freq if mrc_word.kf_freq else 0
                feats['subj-2-kf_ncats'] = mrc_word.kf_ncats if mrc_word.kf_ncats else 0
                feats['subj-2-kf_nsamp'] = mrc_word.kf_nsamp if mrc_word.kf_nsamp else 0
                feats['subj-2-tl_freq'] = mrc_word.tl_freq if mrc_word.tl_freq else 0
                feats['subj-2-brown_freq'] = mrc_word.brown_freq if mrc_word.brown_freq else 0
                feats['subj-2-fam'] = mrc_word.fam if mrc_word.fam else 0
                feats['subj-2-conc'] = mrc_word.conc if mrc_word.conc else 0
                feats['subj-2-imag'] = mrc_word.imag if mrc_word.imag else 0
                feats['subj-2-meanc'] = mrc_word.meanc if mrc_word.meanc else 0
                feats['subj-2-meanp'] = mrc_word.meanp if mrc_word.meanp else 0

            mrc_word = mrc.get(left_ctx[0].text.upper()) if left_ctx[0] else None
            if mrc_word:
                feats['subj-1-kf_freq'] = mrc_word.kf_freq if mrc_word.kf_freq else 0
                feats['subj-1-kf_ncats'] = mrc_word.kf_ncats if mrc_word.kf_ncats else 0
                feats['subj-1-kf_nsamp'] = mrc_word.kf_nsamp if mrc_word.kf_nsamp else 0
                feats['subj-1-tl_freq'] = mrc_word.tl_freq if mrc_word.tl_freq else 0
                feats['subj-1-brown_freq'] = mrc_word.brown_freq if mrc_word.brown_freq else 0
                feats['subj-1-fam'] = mrc_word.fam if mrc_word.fam else 0
                feats['subj-1-conc'] = mrc_word.conc if mrc_word.conc else 0
                feats['subj-1-imag'] = mrc_word.imag if mrc_word.imag else 0
                feats['subj-1-meanc'] = mrc_word.meanc if mrc_word.meanc else 0
                feats['subj-1-meanp'] = mrc_word.meanp if mrc_word.meanp else 0

            mrc_word = mrc.get(right_ctx[0].text.upper()) if right_ctx[0] else None
            if mrc_word:
                feats['subj+1-kf_freq'] = mrc_word.kf_freq if mrc_word.kf_freq else 0
                feats['subj+1-kf_ncats'] = mrc_word.kf_ncats if mrc_word.kf_ncats else 0
                feats['subj+1-kf_nsamp'] = mrc_word.kf_nsamp if mrc_word.kf_nsamp else 0
                feats['subj+1-tl_freq'] = mrc_word.tl_freq if mrc_word.tl_freq else 0
                feats['subj+1-brown_freq'] = mrc_word.brown_freq if mrc_word.brown_freq else 0
                feats['subj+1-fam'] = mrc_word.fam if mrc_word.fam else 0
                feats['subj+1-conc'] = mrc_word.conc if mrc_word.conc else 0
                feats['subj+1-imag'] = mrc_word.imag if mrc_word.imag else 0
                feats['subj+1-meanc'] = mrc_word.meanc if mrc_word.meanc else 0
                feats['subj+1-meanp'] = mrc_word.meanp if mrc_word.meanp else 0

            mrc_word = mrc.get(right_ctx[1].text.upper()) if right_ctx[1] else None
            if mrc_word:
                feats['subj+2-kf_freq'] = mrc_word.kf_freq if mrc_word.kf_freq else 0
                feats['subj+2-kf_ncats'] = mrc_word.kf_ncats if mrc_word.kf_ncats else 0
                feats['subj+2-kf_nsamp'] = mrc_word.kf_nsamp if mrc_word.kf_nsamp else 0
                feats['subj+2-tl_freq'] = mrc_word.tl_freq if mrc_word.tl_freq else 0
                feats['subj+2-brown_freq'] = mrc_word.brown_freq if mrc_word.brown_freq else 0
                feats['subj+2-fam'] = mrc_word.fam if mrc_word.fam else 0
                feats['subj+2-conc'] = mrc_word.conc if mrc_word.conc else 0
                feats['subj+2-imag'] = mrc_word.imag if mrc_word.imag else 0
                feats['subj+2-meanc'] = mrc_word.meanc if mrc_word.meanc else 0
                feats['subj+2-meanp'] = mrc_word.meanp if mrc_word.meanp else 0

            mrc_word = mrc.get(right_ctx[2].text.upper()) if right_ctx[2] else None
            if mrc_word:
                feats['subj+3-kf_freq'] = mrc_word.kf_freq if mrc_word.kf_freq else 0
                feats['subj+3-kf_ncats'] = mrc_word.kf_ncats if mrc_word.kf_ncats else 0
                feats['subj+3-kf_nsamp'] = mrc_word.kf_nsamp if mrc_word.kf_nsamp else 0
                feats['subj+3-tl_freq'] = mrc_word.tl_freq if mrc_word.tl_freq else 0
                feats['subj+3-brown_freq'] = mrc_word.brown_freq if mrc_word.brown_freq else 0
                feats['subj+3-fam'] = mrc_word.fam if mrc_word.fam else 0
                feats['subj+3-conc'] = mrc_word.conc if mrc_word.conc else 0
                feats['subj+3-imag'] = mrc_word.imag if mrc_word.imag else 0
                feats['subj+3-meanc'] = mrc_word.meanc if mrc_word.meanc else 0
                feats['subj+3-meanp'] = mrc_word.meanp if mrc_word.meanp else 0


            
#             feats['subj-3-n-synonyms'] = len(pos_synsets(left_ctx[2].lemma_, left_ctx[2].pos_)) if left_ctx[2] else 0
#             feats['subj-2-n-synonyms'] = len(pos_synsets(left_ctx[1].lemma_, left_ctx[1].pos_)) if left_ctx[1] else 0
#             feats['subj-1-n-synonyms'] = len(pos_synsets(left_ctx[0].lemma_, left_ctx[0].pos_)) if left_ctx[0] else 0
#             feats['subj+1-n-synonyms'] = len(pos_synsets(right_ctx[0].lemma_, right_ctx[0].pos_)) if right_ctx[0] else False
#             feats['subj+2-n-synonyms'] = len(pos_synsets(right_ctx[1].lemma_, right_ctx[1].pos_)) if right_ctx[1] else False
#             feats['subj+3-n-synonyms'] = len(pos_synsets(right_ctx[2].lemma_, right_ctx[2].pos_)) if right_ctx[2] else False

#             feats['subj-3-freq-dicr'] = unigram_freq_discr(left_ctx[2].text) if left_ctx[2] else 0
#             feats['subj-2-freq-dicr'] = unigram_freq_discr(left_ctx[1].text) if left_ctx[1] else 0
#             feats['subj+1-freq-dicr'] = unigram_freq_discr(right_ctx[0].text) if right_ctx[0] else 0
#             feats['subj+2-freq-dicr'] = unigram_freq_discr(right_ctx[1].text) if right_ctx[1] else 0
#             feats['subj+3-freq-dicr'] = unigram_freq_discr(right_ctx[2].text) if right_ctx[2] else 0

#             feats['subj+4-word'] = right_ctx[3].text if right_ctx[3] else '<<<none>>>'
#             feats['subj+4-pos'] = right_ctx[3].pos_ if right_ctx[3] else '<<<none>>>'
#             feats['subj+4-lemma'] = right_ctx[3].lemma_ if right_ctx[3] else '<<<none>>>'


    return feats

In [74]:
# Scratch check of str.upper(); the MRC lookups in the feature extractor above
# upper-case token text the same way before calling mrc.get().
'foo'.upper()

'FOO'

In [24]:
# Experiment: main-verb/subject features plus their context-window features.
# Fit on the Oleh training split; validation_report prints one classification
# report per dataset (Oleh, Scraped, Katia), as shown in the output below.
clf = make_lrc_classifier(extract_subj_verb, extract_subj_verb_ctx)
clf.fit(X_oleh_train, y_oleh_train)
validation_report(clf)

Oleh:
              precision    recall  f1-score   support

        show       0.55      0.35      0.42       110
        tell       0.73      0.86      0.79       222

    accuracy                           0.69       332
   macro avg       0.64      0.60      0.61       332
weighted avg       0.67      0.69      0.67       332




Scraped:
              precision    recall  f1-score   support

        show       1.00      0.12      0.22        40
        tell       0.43      1.00      0.60        26

    accuracy                           0.47        66
   macro avg       0.71      0.56      0.41        66
weighted avg       0.77      0.47      0.37        66




Katia:
              precision    recall  f1-score   support

        show       0.20      0.20      0.20        15
        tell       0.90      0.90      0.90       118

    accuracy                           0.82       133
   macro avg       0.55      0.55      0.55       133
weighted avg       0.82      0.82      0.82   

In [25]:
def extract_subj_verb_dependants(doc):
    """Describe the tokens attached to the main verb and to its subject.

    For every token whose head is the main token (as returned by
    ``find_main_token``) three features are emitted, keyed by the token's
    dependency label: ``main-<dep>-word``, ``main-<dep>-pos`` and
    ``main-<dep>-lemma``. The same is done with a ``subj-`` prefix for tokens
    whose head is the first ``nsubj`` token attached to the ROOT.

    Returns an empty dict when the main token is not tagged ``VERB``.
    When several tokens share a dependency label, the last one in document
    order wins because they write to the same keys.
    """
    root = find_main_token(doc)
    if root.pos_ != 'VERB':
        return {}

    def describe_children(prefix, head):
        # Collect word/pos/lemma of every token whose head index equals head.i.
        described = {}
        for child in doc:
            if child.head.i != head.i:
                continue
            described[f'{prefix}-{child.dep_}-word'] = child.text
            described[f'{prefix}-{child.dep_}-pos'] = child.pos_
            described[f'{prefix}-{child.dep_}-lemma'] = child.lemma_
        return described

    features = describe_children('main', root)

    # First token that is an nsubj hanging directly off the ROOT, if any.
    subject = next(
        (cand for cand in doc if cand.head.dep_ == 'ROOT' and cand.dep_ == 'nsubj'),
        None,
    )
    if subject:
        features.update(describe_children('subj', subject))

    return features

# Sanity check on a sample sentence; note the ROOT token shows up as its own
# "main-ROOT-*" entry in the output below.
extract_subj_verb_dependants(nlp('I like cats very much'))

{'main-nsubj-word': 'I',
 'main-nsubj-pos': 'PRON',
 'main-nsubj-lemma': '-PRON-',
 'main-ROOT-word': 'like',
 'main-ROOT-pos': 'VERB',
 'main-ROOT-lemma': 'like',
 'main-dobj-word': 'cats',
 'main-dobj-pos': 'NOUN',
 'main-dobj-lemma': 'cat',
 'main-advmod-word': 'much',
 'main-advmod-pos': 'ADV',
 'main-advmod-lemma': 'much'}

In [26]:
# Experiment: previous feature set extended with the dependants of the main
# verb and of its subject. Same train split and per-dataset report as before.
clf = make_lrc_classifier(extract_subj_verb, extract_subj_verb_ctx, extract_subj_verb_dependants)
clf.fit(X_oleh_train, y_oleh_train)
validation_report(clf)

Oleh:
              precision    recall  f1-score   support

        show       0.58      0.38      0.46       110
        tell       0.74      0.86      0.79       222

    accuracy                           0.70       332
   macro avg       0.66      0.62      0.63       332
weighted avg       0.68      0.70      0.68       332




Scraped:
              precision    recall  f1-score   support

        show       1.00      0.12      0.22        40
        tell       0.43      1.00      0.60        26

    accuracy                           0.47        66
   macro avg       0.71      0.56      0.41        66
weighted avg       0.77      0.47      0.37        66




Katia:
              precision    recall  f1-score   support

        show       0.24      0.27      0.25        15
        tell       0.91      0.89      0.90       118

    accuracy                           0.82       133
   macro avg       0.57      0.58      0.57       133
weighted avg       0.83      0.82      0.82   

In [537]:
def extract_emotions(doc):
    """Return ``{'has-emotion': bool}`` for a parsed sentence.

    The flag is True when at least one token's lower-cased text (``lower_``)
    is contained in the module-level ``emotions`` collection, which is defined
    elsewhere in the notebook.
    """
    return {'has-emotion': any(tok.lower_ in emotions for tok in doc)}

# Sanity check: none of the tokens of this sentence is in `emotions` (see output).
extract_emotions(nlp('I am dead'))

{'has-emotion': False}

In [547]:
# Experiment: same feature set as the previous classifier, but predictions are
# post-processed by EmotionsClf.
# NOTE(review): EmotionsClf is defined in a later cell of this notebook
# (In [544]); on a fresh kernel that cell has to be run before this one.
clf = make_lrc_classifier(extract_subj_verb, extract_subj_verb_ctx, extract_subj_verb_dependants)
clf.fit(X_oleh_train, y_oleh_train)
validation_report(EmotionsClf(clf))

Oleh:
              precision    recall  f1-score   support

        show       0.56      0.31      0.40        97
        tell       0.73      0.88      0.80       206

    accuracy                           0.70       303
   macro avg       0.64      0.60      0.60       303
weighted avg       0.67      0.70      0.67       303




Scraped:
              precision    recall  f1-score   support

        show       1.00      0.12      0.22        40
        tell       0.43      1.00      0.60        26

    accuracy                           0.47        66
   macro avg       0.71      0.56      0.41        66
weighted avg       0.77      0.47      0.37        66




Katia:
              precision    recall  f1-score   support

        show       0.33      0.27      0.30        15
        tell       0.91      0.93      0.92       118

    accuracy                           0.86       133
   macro avg       0.62      0.60      0.61       133
weighted avg       0.84      0.86      0.85   

### Використовую в якості фіч вектор речення і вектори головних підмета і присудка

In [29]:
import numpy as np

def vector_to_feats(prefix, vector):
    """Turn a sequence of values into a feature dict.

    Each element becomes one entry keyed ``prefix + str(position)``, e.g.
    ``vector_to_feats('v', [a, b])`` gives ``{'v0': a, 'v1': b}``.
    """
    return {prefix + str(position): value for position, value in enumerate(vector)}

def avg_vector(vectors, dim=300):
    """Element-wise mean of a list of equally shaped vectors.

    Parameters
    ----------
    vectors : sequence of array-likes
        Vectors to average; must support ``len()`` and indexing.
    dim : int, optional
        Length of the all-zero vector returned when ``vectors`` is empty.
        Defaults to 300, the size that used to be hard-coded here.

    Returns
    -------
    numpy.ndarray
        The mean vector, or ``np.zeros(dim)`` for empty input.

    The accumulator takes its shape from the first vector instead of assuming
    300 dimensions, so vectors of any width can be averaged (previously
    anything other than 300 raised a broadcasting error).
    """
    if len(vectors) == 0:
        return np.zeros(dim)

    # float64 accumulator, same as the original np.zeros(300) for 300-d input
    total = np.zeros(np.shape(vectors[0]))
    for v in vectors:
        total += v
    return total / len(vectors)

def is_important(x):
    """Tell whether token ``x`` should contribute to the sentence vector.

    A token is kept only if it is not a stop word, is not tagged ``PROPN``
    and lies outside any named entity (``ent_iob_ == 'O'``).
    """
    if x.is_stop or x.pos_ == 'PROPN':
        return False
    return x.ent_iob_ == 'O'

def extract_vector(doc):
    """Sentence-level embedding features.

    Averages the ``.vector`` of every token accepted by ``is_important`` and
    exposes the result as ``sent_vect0``, ``sent_vect1``, ... features.
    """
    kept_vectors = [tok.vector for tok in doc if is_important(tok)]
    sentence_vector = avg_vector(kept_vectors)
    return vector_to_feats('sent_vect', sentence_vector)

In [30]:
# Experiment: averaged sentence vector as the only feature group.
# Same train split and per-dataset report as the other experiments.
clf = make_lrc_classifier(extract_vector)
clf.fit(X_oleh_train, y_oleh_train)
validation_report(clf)

Oleh:
              precision    recall  f1-score   support

        show       0.76      0.51      0.61       110
        tell       0.79      0.92      0.85       222

    accuracy                           0.78       332
   macro avg       0.77      0.71      0.73       332
weighted avg       0.78      0.78      0.77       332




Scraped:
              precision    recall  f1-score   support

        show       0.86      0.15      0.26        40
        tell       0.42      0.96      0.59        26

    accuracy                           0.47        66
   macro avg       0.64      0.56      0.42        66
weighted avg       0.69      0.47      0.39        66




Katia:
              precision    recall  f1-score   support

        show       0.28      0.33      0.30        15
        tell       0.91      0.89      0.90       118

    accuracy                           0.83       133
   macro avg       0.60      0.61      0.60       133
weighted avg       0.84      0.83      0.83   

In [31]:
def extract_subj_verb_vector(doc):
    """Embedding features for the main verb and its subject.

    Emits ``main_vect<i>`` entries from the main token's vector and, when an
    ``nsubj`` token attached to the ROOT exists, ``main_subj_vect<i>`` entries
    from that token's vector. Returns an empty dict when the main token (from
    ``find_main_token``) is not tagged ``VERB``.
    """
    root = find_main_token(doc)
    if root.pos_ != 'VERB':
        return {}

    features = dict(vector_to_feats('main_vect', root.vector))

    # First nsubj hanging directly off the ROOT, if the sentence has one.
    subject = next(
        (cand for cand in doc if cand.head.dep_ == 'ROOT' and cand.dep_ == 'nsubj'),
        None,
    )
    if subject:
        features.update(vector_to_feats('main_subj_vect', subject.vector))

    return features

In [32]:
def extract_subj_verb_ctx_vector(doc):
    """Averaged embedding of the words around the main verb and its subject.

    Uses ``ctx(token, 3, check_important=True)`` to obtain the left and right
    context of the main verb and of the ROOT's ``nsubj`` token (when present),
    averages the vectors of all non-empty context slots and returns them as
    ``subj-verb-ctx<i>`` features. If the main token is not a ``VERB`` no
    vectors are collected, so the features come from ``avg_vector([])``.
    """
    def neighbour_vectors(anchor):
        # Vectors of the filled context slots on both sides of `anchor`.
        before, after = ctx(anchor, 3, check_important=True)
        return [slot.vector for slot in before + after if slot]

    root = find_main_token(doc)
    collected = []

    if root.pos_ == 'VERB':
        collected += neighbour_vectors(root)

        subject = next(
            (cand for cand in doc if cand.head.dep_ == 'ROOT' and cand.dep_ == 'nsubj'),
            None,
        )
        if subject:
            collected += neighbour_vectors(subject)

    return vector_to_feats('subj-verb-ctx', avg_vector(collected))

In [33]:
# Experiment: sentence vector plus the vectors of the main verb and its subject.
clf = make_lrc_classifier(extract_vector, extract_subj_verb_vector)
clf.fit(X_oleh_train, y_oleh_train)
validation_report(clf)

Oleh:
              precision    recall  f1-score   support

        show       0.66      0.55      0.60       110
        tell       0.80      0.86      0.83       222

    accuracy                           0.76       332
   macro avg       0.73      0.71      0.72       332
weighted avg       0.75      0.76      0.75       332




Scraped:
              precision    recall  f1-score   support

        show       0.75      0.15      0.25        40
        tell       0.41      0.92      0.57        26

    accuracy                           0.45        66
   macro avg       0.58      0.54      0.41        66
weighted avg       0.62      0.45      0.38        66




Katia:
              precision    recall  f1-score   support

        show       0.11      0.27      0.16        15
        tell       0.89      0.73      0.80       118

    accuracy                           0.68       133
   macro avg       0.50      0.50      0.48       133
weighted avg       0.80      0.68      0.73   

### Таке рішення трішки перевершує бейзлайн по якості.
### Скомбінувавши вектори з попередніми фічами, якість ще трохи поліпшується.

In [544]:
class EmotionsClf():
    """Wrapper that overrides a classifier's label for emotion-bearing sentences.

    ``predict`` delegates to the wrapped classifier and then replaces the
    label with ``'tell'`` for every document containing at least one token
    whose lower-cased text is in the module-level ``emotions`` collection.
    Only ``predict`` is provided.
    """

    def __init__(self, clf):
        # Any object exposing predict(X) -> sequence of labels.
        self.clf = clf

    def predict(self, X):
        """Return a list of labels for the documents in ``X``."""
        base_labels = self.clf.predict(X)
        return [
            'tell' if any(tok.lower_ in emotions for tok in doc) else label
            for doc, label in zip(X, base_labels)
        ]
            

In [39]:
# Experiment: sentence vector combined with the main-verb/subject features.
clf = make_lrc_classifier(extract_vector, extract_subj_verb)
clf.fit(X_oleh_train, y_oleh_train)
validation_report(clf)

Oleh:
              precision    recall  f1-score   support

        show       0.76      0.55      0.63       110
        tell       0.80      0.91      0.85       222

    accuracy                           0.79       332
   macro avg       0.78      0.73      0.74       332
weighted avg       0.79      0.79      0.78       332




Scraped:
              precision    recall  f1-score   support

        show       0.83      0.12      0.22        40
        tell       0.42      0.96      0.58        26

    accuracy                           0.45        66
   macro avg       0.62      0.54      0.40        66
weighted avg       0.67      0.45      0.36        66




Katia:
              precision    recall  f1-score   support

        show       0.21      0.40      0.28        15
        tell       0.91      0.81      0.86       118

    accuracy                           0.77       133
   macro avg       0.56      0.61      0.57       133
weighted avg       0.84      0.77      0.80   

In [60]:
# Spot check: class probabilities for a single hand-written sentence.
clf.predict_proba([nlp('i am sad')])

array([[0.73234774, 0.26765226]])

In [211]:
from stanfordcorenlp import StanfordCoreNLP

# Handle to a local Stanford CoreNLP distribution; used below via corenlp.parse().
# NOTE(review): absolute, machine-specific path — move to a configurable
# location before sharing the notebook.
corenlp = StanfordCoreNLP('/Users/oleh.palianytsia/Downloads/stanford-corenlp-full-2018-02-27')

In [219]:
import re
# Check that phrase labels can be counted by matching "(LABEL" in the parse
# string returned by corenlp.parse().
re.findall('\\(ADVP', corenlp.parse('I like cats very much'))

['(ADVP']

In [121]:
# Load the raw story sentences, one per line, stripped of surrounding whitespace.
# NOTE(review): absolute, machine-specific path.
with open('/Users/oleh.palianytsia/devel/show-dont-tell/raw_data/stories-sents.txt', 'r') as f:
    stories_lines = [s.strip() for s in f.readlines()]

In [552]:
# Work on one 100,000-line slice of the stories at a time.
part = stories_lines[400000:500000]

In [553]:
# Parse the slice in batch with spaCy. This rebinds `part` from raw strings to
# the parsed results, so the cell is not idempotent: re-run the slicing cell first.
part = list(nlp.pipe(part))

In [554]:
# Label every parsed sentence with whichever classifier `clf` currently holds.
labels = clf.predict(part)

In [555]:
# Export candidates for manual annotation: sentences predicted as 'show' whose
# text is longer than 140 characters, one sentence per line.
# NOTE(review): the file has a .csv extension but is written as raw text with no
# quoting or header; the absolute path is machine-specific.
with open('/Users/oleh.palianytsia/devel/show-dont-tell/to-annotate-2.csv', 'w') as f:
    for doc, label in zip(part, labels):
        if label == 'show' and len(doc.text) > 140:
            f.write(doc.text)
            f.write('\n')

In [471]:
# Scratch check: token index 2 of this sentence is the opening quote character,
# and its lemma is the quote itself (see output).
nlp('I was "amazed"')[2].lemma_

'"'

In [None]:
# Cache of CoreNLP parse strings keyed by sentence text; extract_phrases_num
# reads from it and fills it.
# NOTE(review): this cell has no execution count, so on a fresh kernel it must
# be run before extract_phrases_num is called.
parse_cache = {}

In [228]:
import re

def extract_phrases_num(doc):
    """Per-token density of PP, ADJP and ADVP phrases in a sentence.

    The sentence text is parsed with the module-level ``corenlp`` client; the
    resulting parse string is memoised in the module-level ``parse_cache``
    dict, keyed by ``doc.text``. Each feature is the number of ``(LABEL``
    occurrences in the parse string divided by ``len(doc)``.

    Parameters
    ----------
    doc : parsed sentence exposing ``.text`` and ``len()``.

    Returns
    -------
    dict with keys ``'pp-num'``, ``'adjp-num'`` and ``'advp-num'``.

    An empty document yields all-zero features without calling the parser;
    previously it raised ``ZeroDivisionError``.
    """
    n_tokens = len(doc)
    if n_tokens == 0:
        # Nothing to parse and nothing to divide by.
        return {'pp-num': 0.0, 'adjp-num': 0.0, 'advp-num': 0.0}

    # parse_cache is only mutated, never rebound, so no `global` is needed.
    text = doc.text
    if text in parse_cache:
        parse = parse_cache[text]
    else:
        parse = corenlp.parse(text)
        parse_cache[text] = parse

    feats = {}

    feats['pp-num'] = len(re.findall(r'\(PP', parse)) / n_tokens
    feats['adjp-num'] = len(re.findall(r'\(ADJP', parse)) / n_tokens
    feats['advp-num'] = len(re.findall(r'\(ADVP', parse)) / n_tokens

    return feats

# Sanity check: one ADVP over five tokens gives 0.2 (see output).
extract_phrases_num(nlp('I like cats very much'))

{'pp-num': 0.0, 'adjp-num': 0.0, 'advp-num': 0.2}

In [232]:
# Experiment: phrase-density features only, using the classifier built by
# make_rfc_classifier (defined elsewhere in the notebook).
clf = make_rfc_classifier(extract_phrases_num)
clf.fit(X_oleh_train, y_oleh_train)
validation_report(clf)

Oleh:
              precision    recall  f1-score   support

        show       0.21      0.16      0.19        61
        tell       0.78      0.83      0.80       215

    accuracy                           0.68       276
   macro avg       0.50      0.50      0.49       276
weighted avg       0.65      0.68      0.67       276




Scraped:
              precision    recall  f1-score   support

        show       0.73      0.20      0.31        40
        tell       0.42      0.88      0.57        26

    accuracy                           0.47        66
   macro avg       0.57      0.54      0.44        66
weighted avg       0.61      0.47      0.41        66




Katia:
              precision    recall  f1-score   support

        show       0.06      0.13      0.09        15
        tell       0.87      0.75      0.80       118

    accuracy                           0.68       133
   macro avg       0.47      0.44      0.44       133
weighted avg       0.78      0.68      0.72   

# Збираю н-грами

In [99]:
from phrasefinder import phrasefinder as pf

def fetch_ngram(text):
    """Look up match counts for ``text`` in PhraseFinder's American English corpus.

    Returns a list with the ``match_count`` of every phrase in the response,
    followed by a trailing ``0`` (so the list is never empty). Returns ``None``
    when the service reports an error or when any exception is raised; in both
    cases a message is printed instead of propagating the failure.
    """
    try:
        escaped = pf.escape_query_term(text)
        response = pf.search(pf.Corpus.AMERICAN_ENGLISH, escaped)
        if not response.error:
            counts = [phrase.match_count for phrase in response.phrases]
            counts.append(0)
            return counts

        print('WARN: request failed: {}'.format(response.error['message']))
        return None
    except Exception as error:
        # Deliberately best-effort: one failed lookup must not abort a long crawl.
        print('Fatal error: {}'.format(error))
        return None

def process_ngram(ngram, res_dict):
    """Make sure the counts for one n-gram are present in ``res_dict``.

    The n-gram (a sequence of strings) is lower-cased and space-joined to form
    the key. If the key is missing, ``fetch_ngram`` is called and its result is
    stored unless it is ``None`` (a failed lookup). ``res_dict`` is mutated in
    place and also returned.
    """
    key = ' '.join(part.lower() for part in ngram)

    if key not in res_dict:
        counts = fetch_ngram(key)
        if counts is not None:
            res_dict[key] = counts

    return res_dict

def collect_ngrams(sents, n, res_dict):
    """Fetch counts for every ``n``-gram of every sentence in ``sents``.

    ``sents`` is an iterable of token sequences. Sentences for which
    ``gen_ngrams`` returns nothing (fewer than ``n`` tokens) are skipped.
    Progress markers are printed at the start and end. ``res_dict`` is updated
    in place through ``process_ngram`` and returned.
    """
    print('starting...')

    for tokens in sents:
        # gen_ngrams yields None for too-short sentences; treat that as "no n-grams".
        for gram in gen_ngrams(tokens, n) or ():
            process_ngram(gram, res_dict)

    print('done!')

    return res_dict

In [100]:
def gen_ngrams(toks, n):
    """Return all contiguous windows of length ``n`` over ``toks``.

    Windows are produced by slicing, so they have the same type as ``toks``
    slices. Returns ``None`` (not an empty list) when ``toks`` has fewer than
    ``n`` elements.
    """
    total = len(toks)
    if total < n:
        return None

    windows = []
    for start in range(total - n + 1):
        windows.append(toks[start:start + n])
    return windows

def get_freqs(toks):
    """Return the stored counts for the n-gram made of ``toks``.

    The key is the lower-cased, space-joined tokens, looked up in the
    module-level ``ngrams`` mapping; a missing key raises from that lookup.
    """
    key = ' '.join(tok.lower() for tok in toks)
    return ngrams[key]

def get_or_fetch_freqs(toks):
    """Return the counts for the n-gram ``toks``, fetching them if not cached.

    ``process_ngram`` adds the entry to the module-level ``ngrams`` mapping
    when it is missing and the lookup succeeds. If the fetch fails nothing is
    stored, so the final lookup raises for the missing key.
    """
    cache = process_ngram(toks, ngrams)  # returns the same mapping it was given
    return cache[' '.join([tok.lower() for tok in toks])]

In [326]:
import math
import json
from joblib import Parallel, delayed

# Token-text lists (via tokenize) for every sentence of the three datasets,
# stacked into a single Series.
all_sents = pd.concat([oleh_dataset['sentence'].apply(tokenize), 
                       scraped_dataset['sentence'].apply(tokenize),
                       katia_dataset['sentence'].apply(tokenize)])

# Previously collected n-gram counts; get_freqs / get_or_fetch_freqs read this
# module-level name. NOTE(review): 'ngrams.json' must already exist.
with open('ngrams.json', 'r') as f:
    ngrams = json.load(f)

def parallel_collect(sents, ngrams_map):
    """Collect 1- to 4-gram counts for ``sents`` in parallel and persist them.

    ``sents`` is split into 200 batches; for every batch and every n-gram order
    from 1 to 4 a ``collect_ngrams`` job is scheduled through joblib. Each job
    receives ``ngrams_map`` so already-known n-grams are not fetched again, and
    returns its updated mapping. All returned mappings are merged into
    ``ngrams_map`` (in place) and the result is written to ``ngrams.json``.

    Parameters
    ----------
    sents : sliceable sequence of token lists.
    ngrams_map : dict mapping n-gram strings to count lists; updated in place.

    The function now uses its own arguments; it used to ignore ``sents`` and
    ``ngrams_map`` when building the jobs and read the module-level
    ``all_sents`` / ``ngrams`` instead.
    """
    n_batches = 200
    batch_size = math.ceil(len(sents) / n_batches)

    jobs = (
        delayed(collect_ngrams)(sents[i * batch_size:(i + 1) * batch_size], n, ngrams_map)
        for i in range(n_batches)
        for n in range(1, 5)
    )

    job_results = Parallel(n_jobs=32, verbose=10)(jobs)

    # Every job returns its own mapping; fold them back into the caller's.
    for d in job_results:
        ngrams_map.update(d)

    with open('ngrams.json', 'w') as f:
        json.dump(ngrams_map, f)

# Long-running: issues network lookups for every missing n-gram and rewrites
# ngrams.json when done.
parallel_collect(all_sents, ngrams)

[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done   2 tasks      | elapsed:    4.0s
[Parallel(n_jobs=48)]: Done  17 tasks      | elapsed:   26.3s
[Parallel(n_jobs=48)]: Done  32 tasks      | elapsed:  1.3min
[Parallel(n_jobs=48)]: Done  49 tasks      | elapsed:  1.6min
[Parallel(n_jobs=48)]: Done  66 tasks      | elapsed:  2.0min
[Parallel(n_jobs=48)]: Done  85 tasks      | elapsed:  2.4min
[Parallel(n_jobs=48)]: Done 104 tasks      | elapsed:  2.7min
[Parallel(n_jobs=48)]: Done 125 tasks      | elapsed:  3.1min
[Parallel(n_jobs=48)]: Done 146 tasks      | elapsed:  3.5min
[Parallel(n_jobs=48)]: Done 169 tasks      | elapsed:  4.0min
[Parallel(n_jobs=48)]: Done 192 tasks      | elapsed:  4.5min
[Parallel(n_jobs=48)]: Done 217 tasks      | elapsed:  4.9min
[Parallel(n_jobs=48)]: Done 242 tasks      | elapsed:  5.4min
[Parallel(n_jobs=48)]: Done 269 tasks      | elapsed:  5.9min
[Parallel(n_jobs=48)]: Done 296 tasks      | elapsed:  

In [102]:
import numpy as np

def extract_ngram_freqs(doc):
    """Average n-gram frequency features for a parsed sentence.

    For each order ``n`` from 1 to 4 for which the sentence has at least ``n``
    tokens, emits ``avg-<n>-gram-freq``: the mean, over all n-grams of the
    sentence, of the summed counts returned by ``get_freqs``.

    Orders the sentence is too short for are omitted. This now includes
    the unigram feature for an empty document, which previously raised a
    ``TypeError`` because ``gen_ngrams`` returns ``None`` in that case.
    """
    toks = tokenize(doc)
    feats = {}

    for n in range(1, 5):
        grams = gen_ngrams(toks, n)
        if not grams:
            # Fewer than n tokens: no feature for this order.
            continue
        feats[f'avg-{n}-gram-freq'] = np.mean([sum(get_freqs(gram)) for gram in grams])

    return feats

In [103]:
def extract_subj_verb_ngram_freqs(doc):
    """Frequency of the "<subject> <main verb>" bigram.

    When the main token (from ``find_main_token``) is a ``VERB`` and the ROOT
    has an ``nsubj`` token, returns ``{'subj-verb-freq': total}`` where
    ``total`` is the sum of the counts obtained via ``get_or_fetch_freqs`` for
    the two-token sequence ``[subject text, verb text]``. Otherwise returns an
    empty dict. May trigger a network lookup if the bigram is not cached.
    """
    root = find_main_token(doc)
    if root.pos_ != 'VERB':
        return {}

    subject = next(
        (cand for cand in doc if cand.head.dep_ == 'ROOT' and cand.dep_ == 'nsubj'),
        None,
    )
    if not subject:
        return {}

    return {'subj-verb-freq': sum(get_or_fetch_freqs([subject.text, root.text]))}

## На самих н-грамах якість поганенька, а LRC взагалі видавала нулі, що в принципі очікувано (нормалізація не помагала)

In [104]:
# Experiment: n-gram frequency features only (sentence-level averages plus the
# subject-verb bigram frequency).
clf = make_rfc_classifier(extract_ngram_freqs, extract_subj_verb_ngram_freqs)
clf.fit(X_oleh_train, y_oleh_train)
validation_report(clf)

Oleh:
              precision    recall  f1-score   support

        show       0.50      0.16      0.24        75
        tell       0.74      0.94      0.83       189

    accuracy                           0.72       264
   macro avg       0.62      0.55      0.53       264
weighted avg       0.67      0.72      0.66       264




Scraped:
              precision    recall  f1-score   support

        show       1.00      0.07      0.14        40
        tell       0.41      1.00      0.58        26

    accuracy                           0.44        66
   macro avg       0.71      0.54      0.36        66
weighted avg       0.77      0.44      0.31        66




Katia:
              precision    recall  f1-score   support

        show       0.12      0.13      0.13        15
        tell       0.89      0.88      0.89       118

    accuracy                           0.80       133
   macro avg       0.51      0.51      0.51       133
weighted avg       0.80      0.80      0.80   

### Після комбінації з POS і DEP частотами все одно зле

In [83]:
# Experiment: POS and dependency-label frequency features combined with the
# main-verb/subject features and their context features.
clf = make_rfc_classifier(extract_pos_freqs, extract_dep_freqs, extract_subj_verb, extract_subj_verb_ctx)
clf.fit(X_oleh_train, y_oleh_train)
validation_report(clf)

Oleh:
              precision    recall  f1-score   support

        show       0.83      0.17      0.29       110
        tell       0.71      0.98      0.82       222

    accuracy                           0.71       332
   macro avg       0.77      0.58      0.55       332
weighted avg       0.75      0.71      0.64       332




Scraped:
              precision    recall  f1-score   support

        show       1.00      0.05      0.10        40
        tell       0.41      1.00      0.58        26

    accuracy                           0.42        66
   macro avg       0.70      0.53      0.34        66
weighted avg       0.77      0.42      0.29        66




Katia:
              precision    recall  f1-score   support

        show       0.00      0.00      0.00        15
        tell       0.89      0.98      0.93       118

    accuracy                           0.87       133
   macro avg       0.44      0.49      0.47       133
weighted avg       0.79      0.87      0.83   

# Висновки і спостереження:

* фічі, з якими я ще експериментував, але які не дали приросту в якості:
  * усереднений вектор прикметників;
  * конкатенація векторів головного токену і його дітей в дереві залежностей;

* схоже на те, що в кожного своє розуміння show і tell речень :)
  * я розмічав дані за принципом: якщо є хоча б якесь мінімальне перефразування (she felt fear while walking the corridors -> as she walked through the dark corridors, her heartbeat increased with every step), то це речення я вважав `show`;
  * у Каті та на сайтах рекомендацій, схоже, більш строгі вимоги до show;

In [63]:
class Word():
    """Plain record for one line of the dictionary file read by ``parse_mrc``.

    Every constructor argument is stored unchanged as an attribute of the same
    name. ``__repr__`` prints one ``name: value`` pair per line, with ``wid``
    shown under the label ``id``.

    NOTE(review): the field names appear to follow the MRC Psycholinguistic
    Database layout (the file parsed in this notebook is 'mrc2.dct') — confirm
    against its documentation before relying on the meaning of any field.
    """

    # Attributes formatted with %d in __repr__ (listed after `wid`).
    _INT_FIELDS = (
        'nlet', 'nphon', 'nsyl',
        'kf_freq', 'kf_ncats', 'kf_nsamp',
        'tl_freq', 'brown_freq',
        'fam', 'conc', 'imag', 'meanc', 'meanp', 'aoa',
    )
    # Attributes formatted with %s in __repr__.
    _STR_FIELDS = (
        'tq2', 'wtype', 'pdwtype', 'alphasyl', 'status', 'var', 'cap', 'irreg',
        'word', 'phon', 'dphon', 'stress',
    )

    def __init__(self,
                 wid,
                 nlet,
                 nphon,
                 nsyl,
                 kf_freq,
                 kf_ncats,
                 kf_nsamp,
                 tl_freq,
                 brown_freq,
                 fam,
                 conc,
                 imag,
                 meanc,
                 meanp,
                 aoa,
                 tq2,
                 wtype,
                 pdwtype,
                 alphasyl,
                 status,
                 var,
                 cap,
                 irreg,
                 word,
                 phon,
                 dphon,
                 stress):
        # Snapshot of the arguments, taken before any other local is created.
        given = locals()
        for name in ('wid',) + self._INT_FIELDS + self._STR_FIELDS:
            setattr(self, name, given[name])

    def __repr__(self):
        rows = ['id: %d' % (self.wid,)]
        rows.extend('%s: %d' % (name, getattr(self, name)) for name in self._INT_FIELDS)
        rows.extend('%s: %s' % (name, getattr(self, name)) for name in self._STR_FIELDS)
        # Trailing newline after ")>" is part of the established representation.
        return '<Word(' + '\n'.join(rows) + ')>\n'
    
def parse_mrc(file):
    """Parse a fixed-width dictionary file into ``{word: Word}``.

    Each line is stripped, then read as: integer columns in characters 0-42,
    eight single-character flag columns in characters 43-50, and from
    character 51 on four ``|``-separated fields (word, phon, dphon, stress).
    ``wid`` is the zero-based line number. When the same word occurs on
    several lines, the last one wins. The running line count is printed after
    every 1000 lines.

    Parameters
    ----------
    file : path of the file to read.

    Returns
    -------
    dict mapping the ``word`` field to its ``Word`` record.
    """
    # (Word keyword, start, stop) of the integer columns, in column order.
    int_columns = (
        ('nlet', 0, 2), ('nphon', 2, 4), ('nsyl', 4, 5),
        ('kf_freq', 5, 10), ('kf_ncats', 10, 12), ('kf_nsamp', 12, 15),
        ('tl_freq', 15, 21), ('brown_freq', 21, 25),
        ('fam', 25, 28), ('conc', 28, 31), ('imag', 31, 34),
        ('meanc', 34, 37), ('meanp', 37, 40), ('aoa', 40, 43),
    )
    # Single-character columns occupying positions 43..50, in order.
    flag_columns = ('tq2', 'wtype', 'pdwtype', 'alphasyl', 'status', 'var', 'cap', 'irreg')
    first_flag_col = 43

    words = {}

    with open(file, 'r') as f:
        for wid, raw in enumerate(f):
            line = raw.strip()

            # Tail of the record; unpacking fails loudly on malformed lines.
            word, phon, dphon, stress = line[51:].split('|')

            fields = {name: int(line[lo:hi]) for name, lo, hi in int_columns}
            for offset, name in enumerate(flag_columns):
                fields[name] = line[first_flag_col + offset]

            entry = Word(wid=wid, word=word, phon=phon, dphon=dphon, stress=stress, **fields)

            seen = wid + 1
            if seen % 1000 == 0:
                print(seen)

            words[word] = entry

    return words

In [65]:
# Load the dictionary into the module-level `mrc` mapping used by the feature
# extractor earlier in the notebook (mrc.get(<upper-cased token text>)).
# NOTE(review): this cell appears after the code that reads `mrc`; on a fresh
# kernel it has to be run first.
mrc = parse_mrc('mrc2.dct')

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
108000
109000
110000
111000
112000
113000
114000
115000
116000
117000
118000
119000
120000
121000
122000
123000
124000
125000
126000
127000
128000
129000
130000
131000
132000
133000
134000
135000
136000
137000
138000
139000
140000
141000
142000
143000
144000
145000
146000
147000
148000
149000
150000


In [79]:
# Inspect one entry; keys are the upper-case word forms stored in the file.
mrc['APPARENTLY']

<Word(id: 6692
nlet: 10
nphon: 0
nsyl: 4
kf_freq: 125
kf_ncats: 15
kf_nsamp: 100
tl_freq: 355
brown_freq: 25
fam: 0
conc: 0
imag: 0
meanc: 0
meanp: 0
aoa: 0
tq2:  
wtype: A
pdwtype:  
alphasyl:  
status: S
var:  
cap:  
irreg:  
word: APPARENTLY
phon: 
dphon: @'p&r@ntlI
stress: )>