In [1]:
import numpy as np
import tensorflow as tf
import random as python_random

# One seed shared by every random number generator this notebook touches,
# so each generator starts from a well-defined initial state.
SEED = 42

# NumPy's global generator.
np.random.seed(SEED)

# Core Python's `random` module.
python_random.seed(SEED)

# TensorFlow backend; for further details, see:
# https://www.tensorflow.org/api_docs/python/tf/random/set_seed
tf.random.set_seed(SEED)

In [2]:
# NOTE(review): Python reads PYTHONHASHSEED when the interpreter starts, so setting it
# from inside an already-running kernel presumably does not change this kernel's own
# hashing - confirm whether it has to be exported before Jupyter is launched.
%env PYTHONHASHSEED=0

env: PYTHONHASHSEED=0


In [7]:
import gensim, re
import numpy as np
import pandas as pd

In [61]:
def _load_show_tell(show_path, tell_path):
    """Build a labelled sentence frame from two one-sentence-per-line text files.

    Every line of `show_path` gets the label 'show' and every line of
    `tell_path` gets the label 'tell' (show rows first). Leading/trailing
    whitespace, including the newline kept by `readlines`, is stripped from
    each sentence.

    Returns a DataFrame with the columns 'sentence' and 'label'.
    """
    with open(show_path, 'r') as f:
        show_sents = f.readlines()
    with open(tell_path, 'r') as f:
        tell_sents = f.readlines()
    frame = pd.DataFrame({'sentence': show_sents + tell_sents,
                          'label': ['show'] * len(show_sents) + ['tell'] * len(tell_sents)})
    frame['sentence'] = frame['sentence'].str.strip()
    return frame


# Main dataset; rows whose label is 'unknown' are dropped.
oleh_dataset = pd.read_csv('dataset.csv')
oleh_dataset = oleh_dataset[oleh_dataset['label'] != 'unknown']

# Two additional corpora, each stored as a show file plus a tell file.
scraped_dataset = _load_show_tell('show-validation.txt', 'tell-validation.txt')
katia_dataset = _load_show_tell('katia-show.txt', 'katia-tell.txt')

In [81]:
def simple_tokenize(sentence):
    """Split a sentence into lowercase, whitespace-separated tokens.

    Every run of characters that is neither whitespace nor a word character,
    as well as every underscore, is deleted before splitting, so punctuation
    inside a word is removed without introducing a break.
    """
    without_punct = re.sub(r'([^\s\w]|_)+', '', sentence)
    lowered = without_punct.lower()
    return lowered.split()

In [82]:
# Apply the same rule-based tokenizer to the sentences of each dataset.
oleh_tokenized = list(map(simple_tokenize, oleh_dataset['sentence']))
scraped_tokenized = list(map(simple_tokenize, scraped_dataset['sentence']))
katia_tokenized = list(map(simple_tokenize, katia_dataset['sentence']))

# All three corpora in one list, in the order oleh, scraped, katia.
all_tokenized = [*oleh_tokenized, *scraped_tokenized, *katia_tokenized]

In [10]:
import os

# Location of the pretrained word-vector file. The original hard-coded path is kept as
# the default so existing runs behave the same; set FASTTEXT_VECTORS_PATH to point the
# notebook at a copy on another machine instead of editing this cell.
file = os.environ.get('FASTTEXT_VECTORS_PATH',
                      '/Users/oleh.palianytsia/Downloads/wiki-news-300d-1M.vec')

import io

def load_vectors(fname):
    """Load word vectors from a `.vec`-style text file into a dict.

    The first line must consist of two integers (it is parsed and then
    discarded). Every following line is split on single spaces: the first
    field is the token, the remaining fields are its vector components.

    Parameters
    ----------
    fname : str
        Path of the vector file; read as UTF-8 with undecodable bytes ignored.

    Returns
    -------
    dict
        Maps each token to a 1-D float32 numpy array. If a token occurs on
        several lines, the last occurrence wins.

    Raises
    ------
    ValueError
        If the header line is not exactly two integers, or a component
        cannot be converted to a float.
    """
    data = {}
    # The context manager closes the handle even when parsing fails part-way;
    # the previous version never closed the file.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Header line: consumed and validated, its two values are not needed afterwards.
        _n, _d = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = np.asarray(tokens[1:], dtype='float32')
    return data

# Read the whole vector file into memory as a dict: token -> float32 vector.
vocab_and_vectors = load_vectors(file)

In [83]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Passed as `num_words` to the Keras Tokenizer below.
features = 5000
tokenizer = Tokenizer(num_words = features)
# Fit on the concatenation of all three tokenized corpora (oleh + scraped + katia).
# NOTE(review): this includes sentences that are later used for evaluation - confirm that
# building the vocabulary on them is intended.
tokenizer.fit_on_texts(all_tokenized)

In [84]:
# get all words that the tokenizer knows
word_index = tokenizer.word_index

# Turn the main dataset's token lists into integer id sequences, then pad them into one
# 2-D array (no `maxlen` is passed here, so the width is decided by pad_sequences).
X = tokenizer.texts_to_sequences(oleh_tokenized)
X = pad_sequences(X)

In [85]:
# Width of the padded matrix; model_predict reuses it as `maxlen` when padding new sentences.
sequence_size = X.shape[1]
sequence_size

174

In [98]:
# One-hot encode the labels of the main dataset, one column per label value.
# NOTE(review): this is a plain assignment, so on a fresh run the cell displays nothing.
y = pd.get_dummies(oleh_dataset['label'])

Unnamed: 0,show,tell
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1
...,...,...
1322,1,0
1323,1,0
1324,0,1
1325,0,1


In [87]:
# Split the padded sequences and one-hot labels into train/test parts with a fixed
# random_state (no test_size is passed).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [88]:
# Row i holds the pretrained 300-dimensional vector of the word with tokenizer index i;
# rows of words that are missing from `vocab_and_vectors` stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = vocab_and_vectors.get(word)
    if embedding_vector is None:
        # No pretrained vector for this word: keep the zero row.
        continue
    embedding_matrix[i] = embedding_vector

In [89]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding
from keras.layers import Dropout
# Build the network in one go:
#   1. embedding lookup initialised from `embedding_matrix` and frozen (trainable=False),
#   2. one LSTM layer that returns only its final output,
#   3. dropout, then a softmax with one unit per column of `y`.
model = Sequential([
    Embedding(len(word_index) + 1, 300,
              input_length=X.shape[1],
              weights=[embedding_matrix],
              trainable=False),
    LSTM(300, return_sequences=False),
    Dropout(0.5),
    Dense(y.shape[1], activation="softmax"),
])
# Print the layer / parameter table, then compile.
model.summary()
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=['acc'])

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 174, 300)          1842300   
_________________________________________________________________
lstm_5 (LSTM)                (None, 300)               721200    
_________________________________________________________________
dropout_5 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 602       
Total params: 2,564,102
Trainable params: 721,802
Non-trainable params: 1,842,300
_________________________________________________________________


In [182]:
# NOTE(review): `model2` is not defined in any visible cell, so this line only works with
# leftover kernel state - define it above or remove this cell.
model2.evaluate(X_test, y_test)



[1.3296596544334687, 0.7831325531005859]

In [91]:
# NOTE(review): no visible cell calls `model.fit(...)`; on a fresh kernel this would
# presumably score an untrained network - TODO restore the training cell.
model.evaluate(X_test, y_test)



[0.5158001091106829, 0.7891566157341003]

In [145]:
def model_predict(model, sentences, proba=True):
    """Score raw sentences with the Keras model.

    Sentences are tokenized with `simple_tokenize`, converted to id sequences
    by the notebook-level `tokenizer` and padded to `sequence_size`.

    With `proba=True` the raw output of `model.predict` is returned.
    Otherwise each output row is unpacked as (show, tell) and mapped to the
    label 'tell' when the second value is strictly larger, else 'show'.
    """
    token_lists = [simple_tokenize(text) for text in sentences]
    padded = pad_sequences(tokenizer.texts_to_sequences(token_lists), maxlen=sequence_size)
    scores = model.predict(padded)
    if proba:
        return scores
    return ['tell' if p_tell > p_show else 'show' for p_show, p_tell in scores]

In [163]:
# Spot check: model output for one hand-written sentence.
# model_predict unpacks each output row as (show, tell).
model_predict(model, ['Tears are running down Seva\' cheeks.'])

array([[0.8806521 , 0.11934789]], dtype=float32)

In [104]:
import spacy
# Load the spaCy "en_core_web_md" pipeline; `nlp` is used below to parse sentences.
nlp = spacy.load("en_core_web_md")

In [105]:
def tokenize(model):
    """Return the `.text` of every item in `model`, in iteration order.

    NOTE(review): despite its name, the argument is iterated like a parsed
    document (each item must expose `.text`) - confirm against callers.
    """
    texts = []
    for item in model:
        texts.append(item.text)
    return texts

In [112]:
# Run every sentence of each dataset through the spaCy pipeline (one parsed result per row).
oleh_docs = oleh_dataset['sentence'].apply(nlp)
scraped_docs = scraped_dataset['sentence'].apply(nlp)
katia_docs = katia_dataset['sentence'].apply(nlp)

In [113]:
from sklearn.model_selection import train_test_split
# Split the parsed docs and their string labels with a fixed random_state.
# NOTE(review): same random_state as the split of X / y for the Keras model - presumably
# so that both splits select the same rows; confirm.
X_oleh_train, X_oleh_test, y_oleh_train, y_oleh_test = train_test_split(oleh_docs, oleh_dataset['label'], random_state=42)

In [106]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

def ds_func(f):
    """Lift a per-item function to one that maps over a whole collection.

    The returned callable takes an iterable `X` and returns the list
    `[f(x) for x in X]`.
    """
    def apply_to_each(X):
        return [f(x) for x in X]

    return apply_to_each

def combine_extractors(funcs):
    """Merge several dict-returning feature extractors into one.

    The returned function calls every extractor in `funcs` on its argument,
    in order, and merges the resulting dicts; when two extractors produce the
    same key, the value from the later extractor wins.
    """
    def combined(x):
        return {key: value
                for extractor in funcs
                for key, value in extractor(x).items()}

    return combined

def make_rfc_classifier(*feature_extractors):
    """Build a pipeline: feature extraction -> DictVectorizer -> random forest.

    The extraction step applies the combination of all `feature_extractors`
    to every item of its input. The forest uses random_state=42.
    """
    extract_all = ds_func(combine_extractors(feature_extractors))
    steps = [
        ('extractor', FunctionTransformer(func=extract_all)),
        ('dict_vect', DictVectorizer()),
        ('rfc', RandomForestClassifier(random_state=42)),
    ]
    return Pipeline(steps)

def make_lrc_classifier(*feature_extractors):
    """Build a pipeline: feature extraction -> DictVectorizer -> logistic regression.

    The extraction step applies the combination of all `feature_extractors`
    to every item of its input. The regression is configured with the 'sag'
    solver, multinomial mode, up to 5000 iterations and random_state=42.
    """
    extract_all = ds_func(combine_extractors(feature_extractors))
    logistic = LogisticRegression(random_state=42,
                                  solver='sag',
                                  multi_class='multinomial',
                                  max_iter=5000)
    return Pipeline([('extractor', FunctionTransformer(func=extract_all)),
                     ('dict_vect', DictVectorizer()),
                     ('lrc', logistic)])

In [107]:
def find_main_token(doc):
    """Return the first token of `doc` whose dependency label is 'ROOT'.

    Raises IndexError when no token carries that label (e.g. an empty doc).
    """
    for candidate in doc:
        if candidate.dep_ == 'ROOT':
            return candidate
    raise IndexError('list index out of range')

def extract_subj_verb(doc):
    """Extract word / POS / lemma features of the main verb and its subject.

    Returns an empty dict unless the ROOT token is tagged 'VERB'. Otherwise
    the dict holds 'main-word', 'main-pos', 'main-lemma' and, when a token
    with dependency 'nsubj' whose head is the ROOT exists, the matching
    'subj-*' keys for the first such token.
    """
    main = find_main_token(doc)
    if main.pos_ != 'VERB':
        return {}

    feats = {
        'main-word': main.text,
        'main-pos': main.pos_,
        'main-lemma': main.lemma_,
    }

    # First nominal subject attached directly to the root, if any.
    subj = next((tok for tok in doc
                 if tok.head.dep_ == 'ROOT' and tok.dep_ == 'nsubj'), None)
    if subj:
        feats.update({
            'subj-word': subj.text,
            'subj-pos': subj.pos_,
            'subj-lemma': subj.lemma_,
        })

    return feats

In [160]:
def ctx(x, size, check_important=False):
    """Return the `size` nearest neighbours of token `x` on each side.

    Parameters
    ----------
    x : token
        Must expose `.doc` (the sliceable sequence it belongs to) and `.i`
        (its position in that sequence).
    size : int
        Number of neighbours to return per side.
    check_important : bool
        When True, only tokens accepted by `is_important` count as neighbours.

    Returns
    -------
    (left, right) : tuple of two lists, each of length `size`
        Both lists are ordered nearest-first and padded at the end with None
        when fewer than `size` neighbours exist on that side.
    """
    def _keep(tok):
        # Short-circuits so `is_important` is only looked up when requested.
        return not check_important or is_important(tok)

    lefts = [tok for tok in x.doc[:x.i] if _keep(tok)]
    rights = [tok for tok in x.doc[x.i + 1:] if _keep(tok)]

    # An explicit start index is used instead of `[-size:]`, which for size == 0
    # is `[-0:]` and would return every left neighbour instead of none.
    nearest_left = lefts[max(len(lefts) - size, 0):][::-1]
    nearest_right = rights[:size]

    # A non-positive multiplier yields an empty list, so no guard is needed.
    nearest_left += [None] * (size - len(nearest_left))
    nearest_right += [None] * (size - len(nearest_right))

    return nearest_left, nearest_right

# Example: context window of size 4 around the token at index 2 of the parsed sentence.
ctx(nlp('My mom likes cats very much')[2], 4)

([mom, My, None, None], [cats, very, much, None])

In [158]:
def _ctx_window_feats(prefix, token, size=3):
    """Word / POS / lemma features of the `size` neighbours on each side of `token`.

    Keys have the form '<prefix><offset>-<kind>', e.g. 'main-3-word' or
    'subj+1-lemma'. A missing neighbour is encoded as '<<<none>>>'.
    """
    left_ctx, right_ctx = ctx(token, size)

    # Offsets from farthest-left to farthest-right: -size .. -1, +1 .. +size.
    slots = [('-%d' % (k + 1), left_ctx[k]) for k in reversed(range(size))]
    slots += [('+%d' % (k + 1), right_ctx[k]) for k in range(size)]

    feats = {}
    # Kind-major order: all words, then all POS tags, then all lemmas.
    for kind, attr in (('word', 'lower_'), ('pos', 'pos_'), ('lemma', 'lemma_')):
        for offset, neighbour in slots:
            value = getattr(neighbour, attr) if neighbour else '<<<none>>>'
            feats[prefix + offset + '-' + kind] = value
    return feats


def extract_subj_verb_ctx(doc):
    """Extract context-window features around the main verb and its subject.

    Returns an empty dict unless the ROOT token is tagged 'VERB'. Otherwise
    the dict holds, for the three tokens on each side of the main verb, the
    lowercased word, POS tag and lemma ('main-3-word' ... 'main+3-lemma');
    when a token with dependency 'nsubj' whose head is the ROOT exists, the
    same features around the first such token are added under 'subj...' keys.
    """
    feats = {}
    main = find_main_token(doc)
    if main.pos_ != 'VERB':
        return feats

    feats.update(_ctx_window_feats('main', main))

    # First nominal subject attached directly to the root, if any.
    subj = next((tok for tok in doc
                 if tok.head.dep_ == 'ROOT' and tok.dep_ == 'nsubj'), None)
    if subj:
        feats.update(_ctx_window_feats('subj', subj))

    return feats

In [110]:
import numpy as np

def vector_to_feats(prefix, vector):
    """Turn a vector into a feature dict keyed by `prefix` + component index.

    Component i of `vector` is stored under the key `prefix + str(i)`.
    """
    return {prefix + str(position): component
            for position, component in enumerate(vector)}

def avg_vector(vectors):
    """Return the element-wise mean of `vectors` as a 300-element array.

    The sum is accumulated into a zero array of length 300; when `vectors`
    is empty, that zero array is returned unchanged.
    """
    if not len(vectors) > 0:
        return np.zeros(300)

    total = np.zeros(300)
    for current in vectors:
        total += current
    return total / len(vectors)

def is_important(x):
    """Tell whether token `x` should count as content.

    A token is rejected when it is a stop word or tagged 'PROPN'; otherwise
    it is accepted exactly when its `ent_iob_` equals 'O'.
    """
    if x.is_stop or x.pos_ == 'PROPN':
        return False
    return x.ent_iob_ == 'O'

def extract_vector(doc):
    """Sentence-level features: the averaged vector of the important tokens.

    Tokens rejected by `is_important` are skipped; the average is exposed
    under the keys 'sent_vect0', 'sent_vect1', ...
    """
    important_vectors = [tok.vector for tok in doc if is_important(tok)]
    sentence_vector = avg_vector(important_vectors)
    return vector_to_feats('sent_vect', sentence_vector)

In [111]:
def extract_subj_verb_vector(doc):
    """Vector features of the main verb and of its subject.

    Returns an empty dict unless the ROOT token is tagged 'VERB'. Otherwise
    the verb's vector is stored under 'main_vect<i>' keys and, when a token
    with dependency 'nsubj' whose head is the ROOT exists, the vector of the
    first such token is stored under 'main_subj_vect<i>' keys.
    """
    main = find_main_token(doc)
    if main.pos_ != 'VERB':
        return {}

    feats = dict(vector_to_feats('main_vect', main.vector))

    # First nominal subject attached directly to the root, if any.
    subj = next((tok for tok in doc
                 if tok.head.dep_ == 'ROOT' and tok.dep_ == 'nsubj'), None)
    if subj:
        feats.update(vector_to_feats('main_subj_vect', subj.vector))

    return feats

In [162]:
# Imported here because no other visible cell brings `classification_report` into scope;
# without it this cell fails with NameError on a fresh kernel.
from sklearn.metrics import classification_report

# Logistic-regression pipeline over the averaged sentence vector plus the
# main-verb / subject word features, trained and scored on the split of the main dataset.
clf = make_lrc_classifier(extract_vector, extract_subj_verb)
clf.fit(X_oleh_train, y_oleh_train)
print(classification_report(y_oleh_test, clf.predict(X_oleh_test)))

              precision    recall  f1-score   support

        show       0.76      0.55      0.63       110
        tell       0.80      0.91      0.85       222

    accuracy                           0.79       332
   macro avg       0.78      0.73      0.74       332
weighted avg       0.79      0.79      0.78       332



In [121]:
def lrc_predict(clf, sentences, proba=True):
    """Score raw sentences with a fitted scikit-learn style classifier.

    Each sentence is first parsed with the notebook-level `nlp` pipeline.
    Returns `clf.predict_proba(docs)` when `proba` is true, otherwise
    `clf.predict(docs)`.
    """
    docs = list(map(nlp, sentences))
    predict = clf.predict_proba if proba else clf.predict
    return predict(docs)

In [133]:
# Spot check: predict_proba output of the pipeline for one short sentence.
lrc_predict(clf, ['I am sad'])

array([[0.59644908, 0.40355092]])

In [132]:
# Spot check: Keras model output for the sentence 'I am sad'.
model_predict(model, ['I am sad'])

array([[0.04585582, 0.95414424]], dtype=float32)

In [135]:
def ensemble_predict(model, clf, sentences):
    """Label sentences by adding the outputs of the Keras model and the classifier.

    The two probability arrays are summed element-wise; each summed row is
    unpacked as (show, tell) and labelled 'tell' when the second value is
    strictly larger, otherwise 'show'.
    """
    summed = model_predict(model, sentences) + lrc_predict(clf, sentences)

    labels = []
    for show_score, tell_score in summed:
        labels.append('tell' if tell_score > show_score else 'show')
    return labels

In [138]:
# Ensemble scored on the held-out split; the parsed docs are turned back into raw text
# because ensemble_predict takes sentences.
print(classification_report(y_oleh_test, ensemble_predict(model, clf, [x.text for x in X_oleh_test])))

              precision    recall  f1-score   support

        show       0.77      0.56      0.65       110
        tell       0.81      0.91      0.86       222

    accuracy                           0.80       332
   macro avg       0.79      0.74      0.75       332
weighted avg       0.79      0.80      0.79       332



In [150]:
# Keras model alone, scored on the held-out split plus the katia and scraped corpora.
eval_labels = pd.concat([y_oleh_test,
                         katia_dataset['label'],
                         scraped_dataset['label']])
eval_sentences = pd.concat([X_oleh_test.apply(lambda doc: doc.text),
                            katia_dataset['sentence'],
                            scraped_dataset['sentence']])
print(classification_report(eval_labels,
                            model_predict(model, eval_sentences, proba=False)))

              precision    recall  f1-score   support

        show       0.67      0.52      0.58       165
        tell       0.80      0.89      0.84       366

    accuracy                           0.77       531
   macro avg       0.74      0.70      0.71       531
weighted avg       0.76      0.77      0.76       531



In [155]:
# Logistic-regression pipeline alone, scored on the held-out split plus the katia and
# scraped corpora.
eval_labels = pd.concat([y_oleh_test,
                         katia_dataset['label'],
                         scraped_dataset['label']])
eval_sentences = pd.concat([X_oleh_test.apply(lambda doc: doc.text),
                            katia_dataset['sentence'],
                            scraped_dataset['sentence']])
print(classification_report(eval_labels,
                            lrc_predict(clf, eval_sentences, proba=False)))

              precision    recall  f1-score   support

        show       0.63      0.43      0.51       165
        tell       0.78      0.89      0.83       366

    accuracy                           0.74       531
   macro avg       0.70      0.66      0.67       531
weighted avg       0.73      0.74      0.73       531



In [156]:
# Ensemble of both models, scored on the held-out split plus the katia and scraped corpora.
eval_labels = pd.concat([y_oleh_test,
                         katia_dataset['label'],
                         scraped_dataset['label']])
eval_sentences = pd.concat([X_oleh_test.apply(lambda doc: doc.text),
                            katia_dataset['sentence'],
                            scraped_dataset['sentence']])
print(classification_report(eval_labels,
                            ensemble_predict(model, clf, eval_sentences)))

              precision    recall  f1-score   support

        show       0.68      0.47      0.56       165
        tell       0.79      0.90      0.84       366

    accuracy                           0.77       531
   macro avg       0.74      0.69      0.70       531
weighted avg       0.76      0.77      0.75       531



In [256]:
# Spot check that asks for a hard label instead of probabilities (proba=False).
model_predict(
    model, 
    ['Tears are salty.'], proba=False)

['show']