In [1]:
import pandas as pd

# Load the hand-labelled sentences and drop the rows whose label is 'unknown'.
# .copy() detaches the filtered frame from the one read from disk, so the later
# in-place assignment to its 'sentence' column writes to a real DataFrame
# instead of a slice (which triggers pandas' SettingWithCopyWarning).
oleh_dataset = pd.read_csv('dataset.csv')
oleh_dataset = oleh_dataset[oleh_dataset['label'] != 'unknown'].copy()
oleh_dataset['label'].value_counts()

tell    639
show    243
Name: label, dtype: int64

In [2]:
# Build the "scraped" validation set: one sentence per line, with the label
# taken from which file the line came from.
with open('show-validation.txt', 'r') as f:
    show_sents = f.readlines()
    
with open('tell-validation.txt', 'r') as f:
    tell_sents = f.readlines()
    
# All 'show' lines first, then all 'tell' lines; the label list is built to match.
scraped_dataset = pd.DataFrame({'sentence': show_sents + tell_sents,
                                'label': ['show'] * len(show_sents) + ['tell'] * len(tell_sents)})

# readlines() keeps the trailing newline on each line, so strip it here.
scraped_dataset['sentence'] = scraped_dataset['sentence'].str.strip()

scraped_dataset['label'].value_counts()

show    40
tell    26
Name: label, dtype: int64

In [3]:
# Build Katia's labelled set the same way: one sentence per line, label taken
# from the source file. Note that show_sents / tell_sents are rebound here.
with open('katia-show.txt', 'r') as f:
    show_sents = f.readlines()
    
with open('katia-tell.txt', 'r') as f:
    tell_sents = f.readlines()

# All 'show' lines first, then all 'tell' lines; the label list is built to match.
katia_dataset = pd.DataFrame({'sentence': show_sents + tell_sents,
                              'label': ['show'] * len(show_sents) + ['tell'] * len(tell_sents)})

# readlines() keeps the trailing newline on each line, so strip it here.
katia_dataset['sentence'] = katia_dataset['sentence'].str.strip()

katia_dataset['label'].value_counts()

tell    118
show     15
Name: label, dtype: int64

# Катя надіслала розмічені речення, тому перерахую бейзлайн

In [4]:
import spacy
# Load the spaCy pipeline "en_core_web_md"; it is used to parse every sentence below.
nlp = spacy.load("en_core_web_md")

In [5]:
# Replace each raw sentence with the object returned by nlp(...), so the feature
# extractors below can read token attributes (text, lemma_, pos_, dep_, vector).
# NOTE: this overwrites the 'sentence' columns in place, so the cell is not
# idempotent - re-running it would feed already-parsed objects back into nlp.
oleh_dataset['sentence'] = oleh_dataset['sentence'].apply(nlp)
scraped_dataset['sentence'] = scraped_dataset['sentence'].apply(nlp)
katia_dataset['sentence'] = katia_dataset['sentence'].apply(nlp)

In [6]:
from sklearn.model_selection import train_test_split
# Only the Oleh dataset is split into train/test; the scraped and Katia sets are
# used purely for evaluation. test_size is not passed, so scikit-learn's default
# split ratio applies; random_state is fixed so the split is reproducible.
X_oleh_train, X_oleh_test, y_oleh_train, y_oleh_test = train_test_split(oleh_dataset['sentence'], oleh_dataset['label'], random_state=42)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

def tokenize(model):
    """Return the ``.text`` of every item in ``model``, in iteration order.

    In this notebook ``model`` is a parsed sentence, so the result is the list
    of surface token strings.
    """
    texts = []
    for token in model:
        texts.append(token.text)
    return texts

def lemmatize(model):
    """Return the ``.lemma_`` of every item in ``model``, in iteration order.

    In this notebook ``model`` is a parsed sentence, so the result is the list
    of token lemmas.
    """
    lemmas = []
    for token in model:
        lemmas.append(token.lemma_)
    return lemmas

def make_baseline_clf():
    """Build the baseline pipeline: token counts fed into Multinomial Naive Bayes.

    The vectorizer keeps the original casing and has its regex token pattern
    disabled; callers supply a tokenizer afterwards through
    ``set_params(vect__tokenizer=...)`` (directly or via a parameter grid).
    """
    vectorizer = CountVectorizer(lowercase=False, token_pattern=None)
    classifier = MultinomialNB()
    return Pipeline([('vect', vectorizer), ('nb', classifier)])

def validation_report(clf):
    """Print a classification report for ``clf`` on every evaluation set.

    Four sections are printed in order: the held-out part of the Oleh data,
    the scraped set, Katia's set, and all three concatenated. The data is read
    from the notebook globals ``X_oleh_test`` / ``y_oleh_test``,
    ``scraped_dataset`` and ``katia_dataset``.

    ``clf`` only needs a ``predict`` method accepting a sequence of sentences.
    """
    def show_report(y_true, sentences):
        print(classification_report(y_true, clf.predict(sentences)))

    def spacer():
        # Three blank lines separate consecutive sections.
        for _ in range(3):
            print('')

    print('Oleh:')
    show_report(y_oleh_test, X_oleh_test)
    spacer()

    print('Scraped:')
    show_report(scraped_dataset['label'], scraped_dataset['sentence'])
    spacer()

    print('Katia:')
    show_report(katia_dataset['label'], katia_dataset['sentence'])
    spacer()

    print('All:')
    show_report(
        pd.concat([y_oleh_test, scraped_dataset['label'], katia_dataset['label']]),
        pd.concat([X_oleh_test, scraped_dataset['sentence'], katia_dataset['sentence']]),
    )

In [8]:
# Baseline run: token-count Naive Bayes using the raw token texts as features,
# trained on the Oleh training split and reported on every evaluation set.
clf = make_baseline_clf()
clf.set_params(vect__tokenizer=tokenize)

clf.fit(X_oleh_train, y_oleh_train)
validation_report(clf)

Oleh:
              precision    recall  f1-score   support

        show       0.66      0.34      0.45        56
        tell       0.81      0.94      0.87       165

    accuracy                           0.79       221
   macro avg       0.73      0.64      0.66       221
weighted avg       0.77      0.79      0.76       221




Scraped:
              precision    recall  f1-score   support

        show       0.83      0.12      0.22        40
        tell       0.42      0.96      0.58        26

    accuracy                           0.45        66
   macro avg       0.62      0.54      0.40        66
weighted avg       0.67      0.45      0.36        66




Katia:
              precision    recall  f1-score   support

        show       0.12      0.13      0.13        15
        tell       0.89      0.88      0.89       118

    accuracy                           0.80       133
   macro avg       0.51      0.51      0.51       133
weighted avg       0.80      0.80      0.80   

In [9]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the baseline pipeline: n-gram range, raw tokens vs.
# lemmas, Naive Bayes smoothing strength, and whether class priors are learned.
parameter_grid = [{'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)],
                   'vect__tokenizer': [tokenize, lemmatize],
                   'nb__alpha': [1e-10, 0.001, 0.01, 0.05, 0.1, 0.3, 0.5, 1],
                   'nb__fit_prior': [True, False],
                  }]

# Candidates are ranked by macro-averaged F1, so both classes weigh equally in
# the score. The search only sees the Oleh training split.
gs_clf = GridSearchCV(make_baseline_clf(), parameter_grid, scoring='f1_macro')
gs_clf.fit(X_oleh_train, y_oleh_train)
gs_clf.best_params_

{'nb__alpha': 0.05,
 'nb__fit_prior': False,
 'vect__ngram_range': (1, 2),
 'vect__tokenizer': <function __main__.tokenize(model)>}

In [10]:
# Report the grid-searched baseline on the same evaluation sets.
validation_report(gs_clf)

Oleh:
              precision    recall  f1-score   support

        show       0.51      0.62      0.56        56
        tell       0.86      0.79      0.83       165

    accuracy                           0.75       221
   macro avg       0.68      0.71      0.69       221
weighted avg       0.77      0.75      0.76       221




Scraped:
              precision    recall  f1-score   support

        show       0.71      0.25      0.37        40
        tell       0.42      0.85      0.56        26

    accuracy                           0.48        66
   macro avg       0.57      0.55      0.47        66
weighted avg       0.60      0.48      0.45        66




Katia:
              precision    recall  f1-score   support

        show       0.14      0.40      0.20        15
        tell       0.90      0.68      0.77       118

    accuracy                           0.65       133
   macro avg       0.52      0.54      0.49       133
weighted avg       0.81      0.65      0.71   

# Покращена версія

In [11]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

def ds_func(f):
    """Lift a per-item function ``f`` into one that maps over a whole dataset.

    The returned callable takes an iterable ``X`` and returns a list holding
    ``f(x)`` for each item, in order.
    """
    def apply_to_each(X):
        results = []
        for x in X:
            results.append(f(x))
        return results

    return apply_to_each

def combine_extractors(funcs):
    """Merge several feature extractors into a single one.

    Each extractor in ``funcs`` is called with the same input and must return
    a dict of features. The dicts are merged in order, so when two extractors
    emit the same key the later one wins.
    """
    def combined(x):
        merged = {}
        for extractor in funcs:
            merged.update(extractor(x))
        return merged

    return combined

def make_rfc_classifier(*feature_extractors):
    """Build a feature-dict -> DictVectorizer -> Random Forest pipeline.

    ``feature_extractors`` are per-sentence functions returning feature dicts;
    their outputs are merged and computed for every sentence by the first
    pipeline step. The forest's ``random_state`` is fixed to 42.
    """
    steps = [
        ('extractor', FunctionTransformer()),
        ('dict_vect', DictVectorizer()),
        ('rfc', RandomForestClassifier(random_state=42)),
    ]
    classifier = Pipeline(steps)

    # The extraction function is injected after construction via set_params.
    extract_all = ds_func(combine_extractors(feature_extractors))
    classifier.set_params(extractor__func=extract_all)

    return classifier

def make_lrc_classifier(*feature_extractors):
    """Build a feature-dict -> DictVectorizer -> Logistic Regression pipeline.

    ``feature_extractors`` are per-sentence functions returning feature dicts;
    their outputs are merged and computed for every sentence by the first
    pipeline step. The regression is configured with the 'sag' solver,
    multinomial mode, up to 5000 iterations and ``random_state`` 42.
    """
    steps = [
        ('extractor', FunctionTransformer()),
        ('dict_vect', DictVectorizer()),
        ('lrc', LogisticRegression()),
    ]
    classifier = Pipeline(steps)

    # All settings are applied after construction through the pipeline's
    # step-prefixed parameter names.
    extract_all = ds_func(combine_extractors(feature_extractors))
    classifier.set_params(
        lrc__random_state=42,
        lrc__solver='sag',
        lrc__multi_class='multinomial',
        lrc__max_iter=5000,
        extractor__func=extract_all,
    )

    return classifier

## Витягую частотність POS i DEP тегів
### Тут використовую Random Forest, бо логістична регресія видає всюди нулі. Певно, ці значення не fit-яться лінійною моделлю 🤔

In [12]:
from collections import Counter

def extract_pos_freqs(doc):
    """Relative frequency of each part-of-speech tag in ``doc``.

    Returns a dict mapping ``'<POS>_num'`` to the share of tokens carrying
    that ``pos_`` value (count divided by the number of tokens). An empty
    ``doc`` yields an empty dict.
    """
    tag_counts = Counter(tok.pos_ for tok in doc)
    feats = {}
    for tag, count in tag_counts.items():
        feats[tag + '_num'] = count / len(doc)
    return feats

# Quick sanity check of the extractor on a single example sentence.
extract_pos_freqs(nlp('I like cats very much.'))

{'PRON_num': 0.16666666666666666,
 'VERB_num': 0.16666666666666666,
 'NOUN_num': 0.16666666666666666,
 'ADV_num': 0.3333333333333333,
 'PUNCT_num': 0.16666666666666666}

In [13]:
# Experiment: random forest on POS-tag frequencies only.
clf = make_rfc_classifier(extract_pos_freqs)
clf.fit(X_oleh_train, y_oleh_train)
validation_report(clf)

Oleh:
              precision    recall  f1-score   support

        show       0.53      0.34      0.41        56
        tell       0.80      0.90      0.85       165

    accuracy                           0.76       221
   macro avg       0.66      0.62      0.63       221
weighted avg       0.73      0.76      0.74       221




Scraped:
              precision    recall  f1-score   support

        show       1.00      0.15      0.26        40
        tell       0.43      1.00      0.60        26

    accuracy                           0.48        66
   macro avg       0.72      0.57      0.43        66
weighted avg       0.78      0.48      0.40        66




Katia:
              precision    recall  f1-score   support

        show       0.13      0.13      0.13        15
        tell       0.89      0.89      0.89       118

    accuracy                           0.80       133
   macro avg       0.51      0.51      0.51       133
weighted avg       0.80      0.80      0.80   

In [14]:
def extract_dep_freqs(doc):
    """Relative frequency of each dependency label in ``doc``.

    Returns a dict mapping ``'<DEP>_num'`` to the share of tokens carrying
    that ``dep_`` value (count divided by the number of tokens). An empty
    ``doc`` yields an empty dict.
    """
    label_counts = Counter(tok.dep_ for tok in doc)
    feats = {}
    for label, count in label_counts.items():
        feats[label + '_num'] = count / len(doc)
    return feats

In [15]:
# Experiment: random forest on dependency-label frequencies only.
clf = make_rfc_classifier(extract_dep_freqs)
clf.fit(X_oleh_train, y_oleh_train)
validation_report(clf)

Oleh:
              precision    recall  f1-score   support

        show       0.78      0.32      0.46        56
        tell       0.81      0.97      0.88       165

    accuracy                           0.81       221
   macro avg       0.80      0.65      0.67       221
weighted avg       0.80      0.81      0.77       221




Scraped:
              precision    recall  f1-score   support

        show       1.00      0.10      0.18        40
        tell       0.42      1.00      0.59        26

    accuracy                           0.45        66
   macro avg       0.71      0.55      0.39        66
weighted avg       0.77      0.45      0.34        66




Katia:
              precision    recall  f1-score   support

        show       0.08      0.07      0.07        15
        tell       0.88      0.90      0.89       118

    accuracy                           0.80       133
   macro avg       0.48      0.48      0.48       133
weighted avg       0.79      0.80      0.80   

## Від комбінування цих фіч якість особливо не покращується :(

In [30]:
# Experiment: random forest on POS and dependency frequencies combined.
clf = make_rfc_classifier(extract_pos_freqs, extract_dep_freqs)
clf.fit(X_oleh_train, y_oleh_train)
validation_report(clf)

Oleh:
              precision    recall  f1-score   support

        show       0.68      0.34      0.45        56
        tell       0.81      0.95      0.87       165

    accuracy                           0.79       221
   macro avg       0.74      0.64      0.66       221
weighted avg       0.78      0.79      0.77       221




Scraped:
              precision    recall  f1-score   support

        show       1.00      0.07      0.14        40
        tell       0.41      1.00      0.58        26

    accuracy                           0.44        66
   macro avg       0.71      0.54      0.36        66
weighted avg       0.77      0.44      0.31        66




Katia:
              precision    recall  f1-score   support

        show       0.12      0.13      0.13        15
        tell       0.89      0.88      0.89       118

    accuracy                           0.80       133
   macro avg       0.51      0.51      0.51       133
weighted avg       0.80      0.80      0.80   

### Збираю інформацію про головні підмет і присудок.

In [65]:
def extract_subj_verb(doc):
    """Lexical features of the sentence's main verb and its subject.

    When the ROOT token is a VERB, its text, POS tag and lemma are emitted
    under ``main-*`` keys. If there is also an ``nsubj`` token attached to the
    ROOT, its text, POS tag and lemma are emitted under ``subj-*`` keys.
    When the ROOT is not a VERB the result is an empty dict.
    """
    # NOTE: find_main_token is defined in a later cell of this notebook; on a
    # fresh kernel that cell has to run before this function is called.
    main = find_main_token(doc)
    if main.pos_ != 'VERB':
        return {}

    feats = {
        'main-word': main.text,
        'main-pos': main.pos_,
        'main-lemma': main.lemma_,
    }

    # First token labelled nsubj whose head carries the ROOT label.
    subj = next(
        (tok for tok in doc if tok.head.dep_ == 'ROOT' and tok.dep_ == 'nsubj'),
        None,
    )
    if subj:
        feats['subj-word'] = subj.text
        feats['subj-pos'] = subj.pos_
        feats['subj-lemma'] = subj.lemma_

    return feats

### Якість на рівні з попередніми класифікаторами

In [66]:
# Experiment: logistic regression on main-verb / subject lexical features.
clf = make_lrc_classifier(extract_subj_verb)
clf.fit(X_oleh_train, y_oleh_train)
validation_report(clf)

Oleh:
              precision    recall  f1-score   support

        show       0.67      0.36      0.47        56
        tell       0.81      0.94      0.87       165

    accuracy                           0.79       221
   macro avg       0.74      0.65      0.67       221
weighted avg       0.77      0.79      0.77       221




Scraped:
              precision    recall  f1-score   support

        show       0.50      0.03      0.05        40
        tell       0.39      0.96      0.56        26

    accuracy                           0.39        66
   macro avg       0.45      0.49      0.30        66
weighted avg       0.46      0.39      0.25        66




Katia:
              precision    recall  f1-score   support

        show       0.12      0.20      0.15        15
        tell       0.89      0.82      0.85       118

    accuracy                           0.75       133
   macro avg       0.51      0.51      0.50       133
weighted avg       0.80      0.75      0.78   

### Контекст підмету і присудка. Якість в порівнянні з RF трішки покращилася

In [67]:
def extract_subj_verb_ctx(doc):
    """Context-window features around the main verb and its subject.

    When the ROOT token is a VERB, the text, POS tag and lemma of the tokens
    at offsets -2, -1, +1 and +2 from it are emitted under keys such as
    ``main-2-word`` or ``main+1-pos``. The same window is emitted under
    ``subj...`` keys for the first ``nsubj`` token attached to the ROOT, if
    any. Positions that fall outside the sentence get the placeholder
    ``'<<<none>>>'``. When the ROOT is not a VERB the result is an empty dict.
    """
    placeholder = '<<<none>>>'
    offsets = (-2, -1, 1, 2)
    attributes = (('word', 'text'), ('pos', 'pos_'), ('lemma', 'lemma_'))

    def window_feats(prefix, anchor):
        # Collect the three attributes for every in-range neighbour of anchor.
        window = {}
        for offset in offsets:
            in_range = 0 <= anchor.i + offset < len(doc)
            neighbour = anchor.nbor(offset) if in_range else None
            for suffix, attr in attributes:
                # '{:+d}' renders the sign explicitly: -2, -1, +1, +2.
                key = '{}{:+d}-{}'.format(prefix, offset, suffix)
                window[key] = getattr(neighbour, attr) if in_range else placeholder
        return window

    # NOTE: find_main_token is defined in a later cell of this notebook; on a
    # fresh kernel that cell has to run before this function is called.
    main = find_main_token(doc)
    if main.pos_ != 'VERB':
        return {}

    feats = {}
    feats.update(window_feats('main', main))

    # First token labelled nsubj whose head carries the ROOT label.
    subj = next(
        (tok for tok in doc if tok.head.dep_ == 'ROOT' and tok.dep_ == 'nsubj'),
        None,
    )
    if subj:
        feats.update(window_feats('subj', subj))

    return feats

In [68]:
# Experiment: logistic regression on main-verb / subject features plus their context windows.
clf = make_lrc_classifier(extract_subj_verb, extract_subj_verb_ctx)
clf.fit(X_oleh_train, y_oleh_train)
validation_report(clf)

Oleh:
              precision    recall  f1-score   support

        show       0.66      0.45      0.53        56
        tell       0.83      0.92      0.87       165

    accuracy                           0.80       221
   macro avg       0.74      0.68      0.70       221
weighted avg       0.79      0.80      0.79       221




Scraped:
              precision    recall  f1-score   support

        show       1.00      0.12      0.22        40
        tell       0.43      1.00      0.60        26

    accuracy                           0.47        66
   macro avg       0.71      0.56      0.41        66
weighted avg       0.77      0.47      0.37        66




Katia:
              precision    recall  f1-score   support

        show       0.16      0.27      0.20        15
        tell       0.90      0.82      0.86       118

    accuracy                           0.76       133
   macro avg       0.53      0.54      0.53       133
weighted avg       0.81      0.76      0.78   

### Використовую в якості фіч вектор речення і вектори головних підмета і присудка

In [16]:
def vector_to_feats(prefix, vector):
    """Turn a vector into a feature dict with one entry per component.

    Component ``i`` of ``vector`` is stored under the key ``prefix + str(i)``.
    """
    return {prefix + str(index): value for index, value in enumerate(vector)}

def extract_vector(doc):
    """Expose ``doc.vector`` as features keyed ``sent_vect0``, ``sent_vect1``, ..."""
    sentence_vector = doc.vector
    return vector_to_feats('sent_vect', sentence_vector)

In [39]:
# Experiment: logistic regression on the sentence vector only.
clf = make_lrc_classifier(extract_vector)
clf.fit(X_oleh_train, y_oleh_train)
validation_report(clf)

Oleh:
              precision    recall  f1-score   support

        show       0.60      0.59      0.59        56
        tell       0.86      0.87      0.86       165

    accuracy                           0.80       221
   macro avg       0.73      0.73      0.73       221
weighted avg       0.80      0.80      0.80       221




Scraped:
              precision    recall  f1-score   support

        show       0.92      0.28      0.42        40
        tell       0.46      0.96      0.62        26

    accuracy                           0.55        66
   macro avg       0.69      0.62      0.52        66
weighted avg       0.74      0.55      0.50        66




Katia:
              precision    recall  f1-score   support

        show       0.26      0.40      0.32        15
        tell       0.92      0.86      0.89       118

    accuracy                           0.80       133
   macro avg       0.59      0.63      0.60       133
weighted avg       0.84      0.80      0.82   

In [18]:
def find_main_token(doc):
    """Return the first token of ``doc`` whose dependency label is ``'ROOT'``.

    Raises ``IndexError`` when no token carries that label (for example when
    ``doc`` is empty).
    """
    roots = []
    for tok in doc:
        if tok.dep_ == 'ROOT':
            roots.append(tok)
    return roots[0]

def extract_subj_verb_vector(doc):
    """Word-vector features of the main verb and its subject.

    When the ROOT token is a VERB, its vector is emitted under ``main_vect<i>``
    keys. If there is also an ``nsubj`` token attached to the ROOT, its vector
    is emitted under ``main_subj_vect<i>`` keys. When the ROOT is not a VERB
    the result is an empty dict.
    """
    main = find_main_token(doc)
    if main.pos_ != 'VERB':
        return {}

    feats = {}
    feats.update(vector_to_feats('main_vect', main.vector))

    # First token labelled nsubj whose head carries the ROOT label.
    subj = next(
        (tok for tok in doc if tok.head.dep_ == 'ROOT' and tok.dep_ == 'nsubj'),
        None,
    )
    if subj:
        feats.update(vector_to_feats('main_subj_vect', subj.vector))

    return feats

In [78]:
# Experiment: logistic regression on the sentence vector plus main-verb / subject vectors.
clf = make_lrc_classifier(extract_vector, extract_subj_verb_vector)
clf.fit(X_oleh_train, y_oleh_train)
validation_report(clf)

Oleh:
              precision    recall  f1-score   support

        show       0.70      0.70      0.70        56
        tell       0.90      0.90      0.90       165

    accuracy                           0.85       221
   macro avg       0.80      0.80      0.80       221
weighted avg       0.85      0.85      0.85       221




Scraped:
              precision    recall  f1-score   support

        show       0.88      0.17      0.29        40
        tell       0.43      0.96      0.60        26

    accuracy                           0.48        66
   macro avg       0.65      0.57      0.44        66
weighted avg       0.70      0.48      0.41        66




Katia:
              precision    recall  f1-score   support

        show       0.16      0.40      0.23        15
        tell       0.91      0.73      0.81       118

    accuracy                           0.69       133
   macro avg       0.53      0.56      0.52       133
weighted avg       0.82      0.69      0.74   

### Таке рішення трішки перевершує бейзлайн по якості.
### Скомбінувавши вектори з попередніми фічами, якість ще трохи поліпшується.

In [79]:
# Experiment: logistic regression on all vector features plus the lexical and
# context-window features of the main verb and subject.
clf = make_lrc_classifier(extract_vector, extract_subj_verb_vector, extract_subj_verb, extract_subj_verb_ctx)
clf.fit(X_oleh_train, y_oleh_train)
validation_report(clf)

Oleh:
              precision    recall  f1-score   support

        show       0.75      0.75      0.75        56
        tell       0.92      0.92      0.92       165

    accuracy                           0.87       221
   macro avg       0.83      0.83      0.83       221
weighted avg       0.87      0.87      0.87       221




Scraped:
              precision    recall  f1-score   support

        show       0.88      0.17      0.29        40
        tell       0.43      0.96      0.60        26

    accuracy                           0.48        66
   macro avg       0.65      0.57      0.44        66
weighted avg       0.70      0.48      0.41        66




Katia:
              precision    recall  f1-score   support

        show       0.13      0.27      0.17        15
        tell       0.89      0.77      0.83       118

    accuracy                           0.71       133
   macro avg       0.51      0.52      0.50       133
weighted avg       0.81      0.71      0.75   

# Збираю н-грами

In [23]:
from phrasefinder import phrasefinder as pf

def fetch_ngram(text):
    """Look up ``text`` in the PhraseFinder American English corpus.

    Returns the list of match counts of the phrases found, with a trailing 0
    appended so the list is never empty. Returns ``None`` when the service
    reports an error or when any exception is raised; in both cases a message
    is printed instead of propagating the failure.
    """
    try:
        response = pf.search(pf.Corpus.AMERICAN_ENGLISH, pf.escape_query_term(text))
        if response.error:
            print('WARN: request failed: {}'.format(response.error['message']))
            return None

        counts = [phrase.match_count for phrase in response.phrases]
        counts.append(0)
        return counts
    except Exception as error:
        # Deliberate best-effort: a failed lookup must not abort a long batch.
        print('Fatal error: {}'.format(error))
        return None

def process_ngram(ngram, res_dict):
    """Make sure the frequencies of ``ngram`` are cached in ``res_dict``.

    The cache key is the lower-cased tokens joined by single spaces. If the
    key is missing it is fetched with ``fetch_ngram`` and stored, unless the
    fetch returned ``None``. ``res_dict`` is mutated in place and returned.
    """
    key = ' '.join(tok.lower() for tok in ngram)

    if key not in res_dict:
        freq = fetch_ngram(key)
        if freq is not None:
            res_dict[key] = freq

    return res_dict

def collect_ngrams(sents, n, res_dict):
    """Cache the frequencies of every ``n``-gram of every sentence in ``sents``.

    Each sentence is a sequence of tokens. Sentences for which ``gen_ngrams``
    yields nothing are skipped. ``res_dict`` is filled in place and returned;
    progress markers are printed at the start and at the end.
    """
    print('starting...')

    for sent in sents:
        # A falsy result from gen_ngrams means there is nothing to process.
        for ngram in gen_ngrams(sent, n) or ():
            process_ngram(ngram, res_dict)

    print('done!')

    return res_dict

In [24]:
def gen_ngrams(toks, n):
    """Return all contiguous ``n``-grams of ``toks`` as a list of slices.

    ``toks`` is any sliceable sequence with a length. When ``toks`` has fewer
    than ``n`` items, or ``n`` is smaller than 1, an empty list is returned
    (rather than an implicit ``None``) so the result can always be iterated.
    """
    if n < 1 or len(toks) < n:
        return []
    return [toks[start:start + n] for start in range(len(toks) - n + 1)]

def get_freqs(toks):
    """Return the cached frequency list of the n-gram ``toks``.

    The lookup key is the lower-cased tokens joined by single spaces; it is
    read from the notebook-level ``ngrams`` dict, so a missing key raises
    ``KeyError``.
    """
    key = ' '.join(tok.lower() for tok in toks)
    return ngrams[key]

def get_or_fetch_freqs(toks):
    """Return the frequency list of the n-gram ``toks``, fetching it on a cache miss.

    The notebook-level ``ngrams`` dict serves as the cache. If the fetch fails
    nothing is stored, and the final lookup then raises ``KeyError``.
    """
    process_ngram(toks, ngrams)  # fills the cache when the key is missing
    freqs = get_freqs(toks)
    return freqs

In [25]:
import math
import json
from joblib import Parallel, delayed

# Token lists for every sentence of all three datasets, concatenated into one
# Series (the original indices are kept, so index values may repeat).
all_sents = pd.concat([oleh_dataset['sentence'].apply(tokenize), 
                       scraped_dataset['sentence'].apply(tokenize),
                       katia_dataset['sentence'].apply(tokenize)])

# Previously collected n-gram frequencies; this dict is the cache read by
# get_freqs / get_or_fetch_freqs.
with open('ngrams.json', 'r') as f:
    ngrams = json.load(f)

def parallel_collect(sents, ngrams_map, n_batches=200, n_jobs=32, out_path='ngrams2.json'):
    """Fetch the frequencies of all 1- to 4-grams of ``sents`` in parallel.

    ``sents`` is split into ``n_batches`` consecutive slices; for every slice
    and every n-gram order from 1 to 4 a ``collect_ngrams`` job is scheduled
    on ``n_jobs`` joblib workers. The dicts returned by the jobs are merged
    into ``ngrams_map`` (mutated in place), which is then dumped as JSON to
    ``out_path``.

    Parameters
    ----------
    sents : sliceable sequence of token sequences.
    ngrams_map : dict used as the starting cache and as the merge target.
    n_batches, n_jobs, out_path : tuning knobs; the defaults reproduce the
        values that used to be hard-coded.
    """
    # The function previously read the globals `all_sents` and `ngrams` and
    # silently ignored its own arguments; it now uses what the caller passes.
    batch_size = math.ceil(len(sents) / n_batches)
    jobs = (delayed(collect_ngrams)(sents[i * batch_size:(i + 1) * batch_size], n, ngrams_map)
            for i in range(n_batches) for n in range(1, 5))

    job_results = Parallel(n_jobs=n_jobs, verbose=10)(jobs)

    for partial in job_results:
        ngrams_map.update(partial)

    with open(out_path, 'w') as f:
        json.dump(ngrams_map, f)

# Collect the n-gram frequencies of every sentence, starting from the loaded cache.
parallel_collect(all_sents, ngrams)

[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done   8 tasks      | elapsed:   10.9s
[Parallel(n_jobs=32)]: Done  21 tasks      | elapsed:   22.8s
[Parallel(n_jobs=32)]: Done  34 tasks      | elapsed:   34.4s
[Parallel(n_jobs=32)]: Done  49 tasks      | elapsed:   48.5s
[Parallel(n_jobs=32)]: Done  64 tasks      | elapsed:  1.0min
[Parallel(n_jobs=32)]: Done  81 tasks      | elapsed:  1.3min
[Parallel(n_jobs=32)]: Done  98 tasks      | elapsed:  1.5min
[Parallel(n_jobs=32)]: Done 117 tasks      | elapsed:  1.8min
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:  2.1min
[Parallel(n_jobs=32)]: Done 157 tasks      | elapsed:  2.4min
[Parallel(n_jobs=32)]: Done 178 tasks      | elapsed:  2.8min
[Parallel(n_jobs=32)]: Done 201 tasks      | elapsed:  3.1min
[Parallel(n_jobs=32)]: Done 224 tasks      | elapsed:  3.4min
[Parallel(n_jobs=32)]: Done 249 tasks      | elapsed:  3.8min
[Parallel(n_jobs=32)]: Done 274 tasks      | elapsed:  

In [26]:
def extract_ngram_freqs(doc):
    """Average corpus frequency of the 1- to 4-grams of ``doc``.

    For every order ``n`` from 1 to 4 that the sentence is long enough for,
    the feature ``'avg-<n>-gram-freq'`` holds the mean, over all n-grams of
    the sentence, of the summed cached frequency counts of that n-gram.
    Orders with no n-grams are omitted, so an empty ``doc`` yields an empty
    dict instead of raising.

    Frequencies are read from the cache only (``get_freqs``); an n-gram that
    was never collected raises ``KeyError``.
    """
    # Local import: no cell visible in this notebook imports numpy, so relying
    # on a global `np` fails with NameError on a fresh kernel.
    import numpy as np

    toks = tokenize(doc)
    feats = {}

    for n in range(1, 5):
        grams = gen_ngrams(toks, n)
        if not grams:
            # Sentence shorter than n tokens: no feature for this order.
            continue
        totals = [sum(get_freqs(gram)) for gram in grams]
        feats['avg-{}-gram-freq'.format(n)] = np.mean(totals)

    return feats

In [27]:
def extract_subj_verb_ngram_freqs(doc):
    """Corpus frequency of the "subject verb" bigram of the sentence.

    When the ROOT token is a VERB and an ``nsubj`` token is attached to the
    ROOT, the feature ``'subj-verb-freq'`` holds the summed frequency counts
    of the bigram (subject text, verb text), fetched on a cache miss.
    In every other case the result is an empty dict.
    """
    main = find_main_token(doc)
    if main.pos_ != 'VERB':
        return {}

    # First token labelled nsubj whose head carries the ROOT label.
    subj = next(
        (tok for tok in doc if tok.head.dep_ == 'ROOT' and tok.dep_ == 'nsubj'),
        None,
    )
    if not subj:
        return {}

    return {'subj-verb-freq': sum(get_or_fetch_freqs([subj.text, main.text]))}

## На самих н-грамах якість поганенька, а LRC взагалі видавала нулі, що в принципі очікувано (нормалізація не помагала)

In [37]:
# Experiment: random forest on n-gram frequency features only.
clf = make_rfc_classifier(extract_ngram_freqs, extract_subj_verb_ngram_freqs)
clf.fit(X_oleh_train, y_oleh_train)
validation_report(clf)

Oleh:
              precision    recall  f1-score   support

        show       0.57      0.29      0.38        56
        tell       0.79      0.93      0.85       165

    accuracy                           0.76       221
   macro avg       0.68      0.61      0.62       221
weighted avg       0.74      0.76      0.73       221




Scraped:
              precision    recall  f1-score   support

        show       1.00      0.05      0.10        40
        tell       0.41      1.00      0.58        26

    accuracy                           0.42        66
   macro avg       0.70      0.53      0.34        66
weighted avg       0.77      0.42      0.29        66




Katia:
              precision    recall  f1-score   support

        show       0.12      0.13      0.12        15
        tell       0.89      0.87      0.88       118

    accuracy                           0.79       133
   macro avg       0.50      0.50      0.50       133
weighted avg       0.80      0.79      0.80   

### Після комбінації з POS i DEP частотами все одно зле

In [80]:
# Experiment: random forest on POS, dependency and n-gram frequency features combined.
clf = make_rfc_classifier(extract_pos_freqs, extract_dep_freqs, extract_ngram_freqs, extract_subj_verb_ngram_freqs)
clf.fit(X_oleh_train, y_oleh_train)
validation_report(clf)

Oleh:
              precision    recall  f1-score   support

        show       0.62      0.32      0.42        56
        tell       0.80      0.93      0.86       165

    accuracy                           0.78       221
   macro avg       0.71      0.63      0.64       221
weighted avg       0.76      0.78      0.75       221




Scraped:
              precision    recall  f1-score   support

        show       1.00      0.03      0.05        40
        tell       0.40      1.00      0.57        26

    accuracy                           0.41        66
   macro avg       0.70      0.51      0.31        66
weighted avg       0.76      0.41      0.25        66




Katia:
              precision    recall  f1-score   support

        show       0.12      0.13      0.13        15
        tell       0.89      0.88      0.89       118

    accuracy                           0.80       133
   macro avg       0.51      0.51      0.51       133
weighted avg       0.80      0.80      0.80   

# Висновки і спостереження:

* фічі, з якими я ще експериментував, але які не дали приросту в якості:
  * усереднений вектор прикметників;
  * конкатенація векторів головного токену і його дітей в дереві залежностей;

* схоже на те, що в кожного своє розуміння show i tell речень :)
  * я розмічав дані за таким принципом: якщо є хоча б якесь мінімальне перефразування (she felt fear while walking the corridors -> as she walked through the dark corridors, her heartbeat increased with every step), то це речення я вважав `show`;
  * у Каті та на сайтах рекомендацій, схоже, більш строгі вимоги до show;