In [1]:
#  pip install sklearn-crfsuite
#  pip install pymorphy2
#  pip install pandas


In [2]:
import pandas as pd
import pymorphy2

import sklearn
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [3]:
from yargy.tokenizer import MorphTokenizer
from yargy import rule, Parser, or_, and_, not_
from yargy.predicates import eq, type as type_, in_, normalized, gram, is_capitalized, dictionary
from yargy.pipelines import morph_pipeline
from yargy.interpretation import fact

import json 


# Hello once again! In this notebook we will extract data with Conditional Random Fields method 

Conditional random fields (CRFs) are a class of statistical modeling methods often applied in pattern recognition and machine learning and used for structured prediction. A CRF can take context into account, to do so, the predictions are modelled as a graphical model, which represents the presence of dependencies between the predictions. What kind of graph is used depends on the application. In natural language processing, "linear chain" CRFs are popular, for which each prediction is dependent only on its immediate neighbours.

In [4]:
#  We already know that dataset is valid and fine 

df = pd.read_csv(r'test_data.csv')


In [5]:
#  Few words about notebook: 


# Roadmap or what's going to happen

#### The task we are solving is token classification. We pick sklearn_crfsuite.CRF estimator as a tool. 


Since it is classification algorithm there are few steps to take.
We must choose what would be our target variable and features.
As well as decide what train, validation and test data should look like.


While working with our dataset we figured out that we have 4 classes of data:

1) Greeting class, where managers say Hi.

2) Introduction class, where managers introduce themselves.

3) Organisation class, where managers' employer is mentioned

4) Farewell class, where managers say Good Bye

5) All other words we are not interested in 

I will manually apply BIO tagging to label classes as B-GRE, I-GRE, B-NAM, I-NAM, B-ORG, I-ORG, B-FAR, I-FAR and O respectively

As feature samples model needs to know part of speech for every word, so I use pymorphy2 for that case. 

Now, about splitting the data. By obvious reasons we cannot add manager lines to our train set. And at the same time clients' lines dont have Organisation and Introduction classes. Also classes are highly imbalanced.


We split data as all clients' lines go to train set, and all managers' go to test set.

To give model ability to learn we will provide test set with augmented data. We will write some new lines with condition that they wouldn't duplicate test data and give some information about our classes at the same time. Which is hard especially for our Greeting and Farewell classes because of fewer possible words in these classes.

Because we can achieve good results by overfitting with augmented data, after we validate model with our manager test data, we will run additional test with some random examples too see how our model generalizes, hence how it is prone to overfitting. 

### Let's start with tagging our dataset 

In [6]:
morph = pymorphy2.MorphAnalyzer()


In [7]:
#  Retrieving indexes of manager's and client's phrases so we could differ one from another. 

client_index = df[df.role=='client'].index.to_list()
manager_index = df[df.role=='manager'].index.to_list()


In [8]:
# Tokenizing with pos, and setting label O to all words 

tokens = [[(word, morph.parse(word)[0].tag.POS, "O") for i, word in enumerate(line.split())] for line in df.text.to_list()]


In [9]:
#  Now pick your favourite text editor and replace O with actual tag. 
#
#  Since we are not allowed to demostrate original test_data set.
#  Here are first few lines 

sentences = [[('Алло', 'INTJ', 'O')],
 [('Алло', 'INTJ', 'O'), ('здравствуйте', 'INTJ', 'B-GRE')],
 [('Добрый', 'ADJF', 'B-GRE'), ('день', 'NOUN', 'I-GRE')],
 [('Меня', 'NPRO', 'B-NAM'),
  ('зовут', 'VERB', 'I-NAM'),
  ('ангелина', 'NOUN', 'I-NAM'),
  ('компания', 'NOUN', 'B-ORG'),
  ('диджитал', 'ADJF', 'I-ORG'),
  ('бизнес', 'NOUN', 'I-ORG'),
  ('звоним', 'VERB', 'O'),
  ('вам', 'NPRO', 'O'),
  ('по', 'PREP', 'O'),
  ('поводу', 'NOUN', 'O'),
  ('продления', 'NOUN', 'O'),
  ('лицензии', 'NOUN', 'O')]]


In [10]:
#  We strip lables and save them as separate file. 
#  Idea is that when notebook runs with data_test.csv 
#  it zips back words with there pos and tags from the file. 
#
#  Hopefully original data wouldn't change and everything will work fine. 


In [11]:
#  First we want to tokenize words into nested list 

words = [[word for word in line.split()] for line in df.text.to_list()]


In [12]:
#  Secondly we load our hand labeled markup, already saved as pos_and_tags.json

with open('pos_and_tags.json', 'r') as f:
    markup = json.load(f)


In [13]:
#  Let's at least check that our nested lists have same length 

assert len(words) == len(markup)


In [14]:
#  And zip words with markup. 
#  We should have text that looks exactly like the one few blocks higher.  

sentences = [[(w, m[0], m[1]) for w, m in zip(word, mark)] for word, mark in zip(words, markup)]
sentences[:4]


[[('Алло', 'INTJ', 'O')],
 [('Алло', 'INTJ', 'O'), ('здравствуйте', 'INTJ', 'B-GRE')],
 [('Добрый', 'ADJF', 'B-GRE'), ('день', 'NOUN', 'I-GRE')],
 [('Меня', 'NPRO', 'B-NAM'),
  ('зовут', 'VERB', 'I-NAM'),
  ('ангелина', 'NOUN', 'I-NAM'),
  ('компания', 'NOUN', 'B-ORG'),
  ('диджитал', 'ADJF', 'I-ORG'),
  ('бизнес', 'NOUN', 'I-ORG'),
  ('звоним', 'VERB', 'O'),
  ('вам', 'NPRO', 'O'),
  ('по', 'PREP', 'O'),
  ('поводу', 'NOUN', 'O'),
  ('продления', 'NOUN', 'O'),
  ('лицензии', 'NOUN', 'O'),
  ('а', 'CONJ', 'O'),
  ('мы', 'NPRO', 'O'),
  ('с', 'PREP', 'O'),
  ('серым', 'ADJF', 'O'),
  ('у', 'PREP', 'O'),
  ('вас', 'NPRO', 'O'),
  ('скоро', 'ADVB', 'O'),
  ('срок', 'NOUN', 'O'),
  ('заканчивается', 'VERB', 'O')]]

### Generate lines for each class

In [16]:
# Greeting class 

inject2 = [[('добрый', 'ADJF', 'B-GRE'), ('день', 'NOUN', 'I-GRE')], [('добрый', 'ADJF', 'B-GRE'), ('вечер', 'NOUN', 'I-GRE')],
           [('доброе', 'ADJF', 'B-GRE'), ('утро', 'NOUN', 'I-GRE')], [('здравствуйте', 'INTJ', 'B-GRE')],           
          ] * 10


In [17]:
# Introduction class 

inject3 = [[('Меня', 'NPRO', 'B-NAM'),
  ('зовут', 'VERB', 'I-NAM'),
  ('aндрей', 'NOUN', 'I-NAM')], 
  [('Меня', 'NPRO', 'B-NAM'),
  ('сергей', 'NOUN', 'I-NAM'),
   ('зовут', 'VERB', 'I-NAM')],        
    [('Меня', 'NPRO', 'B-NAM'),
  ('зовут', 'VERB', 'I-NAM'),
  ('светлана', 'NOUN', 'I-NAM')], 
  [('Меня', 'NPRO', 'B-NAM'),
  ('елена', 'NOUN', 'I-NAM'),
   ('зовут', 'VERB', 'I-NAM')],              
  [('это', 'PRCL', 'B-NAM'),
  ('игорь', 'NOUN', 'I-NAM')],          
 [('да', 'PRCL', 'B-NAM'),
  ('это', 'PRCL', 'I-NAM'),
  ('александр', 'NOUN', 'I-NAM')],
 [('это', 'PRCL', 'B-NAM'),
  ('максим', 'NOUN', 'I-NAM')]   
] * 10


In [18]:
# Organisation class

inject1 = [[('компания', 'NOUN', 'B-ORG'), ('цифровой', 'ADJF', 'I-ORG'), ('бизнес', "NOUN", 'I-ORG')], [('компания', 'NOUN', 'B-ORG'), ('новые', 'ADJF', 'I-ORG'), ('возможности', "NOUN", 'I-ORG')],
[('компания', 'NOUN', 'B-ORG'), ('новые', 'ADJF', 'I-ORG'), ('горизонты', "NOUN", 'I-ORG')], [('компания', 'NOUN', 'B-ORG'), ('деловые', 'ADJF', 'I-ORG'), ('линии', "NOUN", 'I-ORG')],          
[('компания', 'NOUN', 'B-ORG'), ('инфобизнес', "NOUN", 'I-ORG')], [('компания', 'NOUN', 'B-ORG')],         
[('компания', 'NOUN', 'B-ORG'), ('цифровой', 'ADJF', 'I-ORG'), ('бизнес', "NOUN", 'I-ORG')], [('компания', 'NOUN', 'B-ORG'), ('новые', 'ADJF', 'I-ORG'), ('возможноcти', "NOUN", 'I-ORG')],
[('компания', 'NOUN', 'B-ORG'), ('пиксель', 'ADJF', 'I-ORG'), ('бизнес', "NOUN", 'I-ORG')], [('компания', 'NOUN', 'B-ORG'), ('бизнес', 'NOUN', 'I-ORG')]
          ] * 10


In [19]:
# Farewell class

inject = [[('до', 'PREP', 'B-FAR'),
  ('свидания', 'NOUN', 'I-FAR')],
  [('хорошего', 'ADJF', 'B-FAR'),
  ('дня', 'NOUN', 'I-FAR')],
[('хорошего', 'ADJF', 'B-FAR'),
  ('вечера', 'NOUN', 'I-FAR')],
[('доброй', 'ADJF', 'B-FAR'),
  ('ночи', 'NOUN', 'I-FAR')],
[('всего', 'ADJF', 'B-FAR'),
  ('хорошего', 'ADJF', 'I-FAR')],
[('всего', 'ADJF', 'B-FAR'),
  ('доброго', 'ADJF', 'I-FAR')],
[('до', 'PREP', 'B-FAR'),
  ('новых', 'ADJF', 'I-FAR'),
 ('встреч','NOUN', 'I-FAR')]
] * 10


We increase number of samples to make classes more balanced. 


Now we select features and procces data.

Following block of code is imported from sk-learn docs. 


In [20]:
def word2features(sent, i):
    word = sent[i][0].lower()
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]


def sent2tokens(sent):
    return [token for token, postag, label in sent]


In [21]:
train = [sentences[index] for index in client_index]
test = [sentences[index] for index in manager_index]
train = train + inject + inject1 + inject2 + inject3


In [22]:
X_train = [sent2features(s) for s in train]
y_train = [sent2labels(s) for s in train]

X_test = [sent2features(s) for s in test]
y_test = [sent2labels(s) for s in test]


In [23]:
#  Parameters are chosen empirically.
#  These ones show best results for me. 

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.00009,
    c2=0.00005,
    max_iterations=200,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)




CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=9e-05, c2=5e-05, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=200,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [24]:
#  Remove O label 

labels = list(crf.classes_)
labels.remove('O')
labels


['B-GRE', 'I-GRE', 'B-FAR', 'I-FAR', 'B-ORG', 'I-ORG', 'B-NAM', 'I-NAM']

In [25]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)


0.9475030286714229

### And the results are: 

In [26]:
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))


              precision    recall  f1-score   support

       B-FAR      0.889     1.000     0.941         8
       I-FAR      0.889     1.000     0.941         8
       B-GRE      1.000     1.000     1.000         4
       I-GRE      1.000     1.000     1.000         1
       B-NAM      0.800     0.800     0.800         5
       I-NAM      0.900     1.000     0.947         9
       B-ORG      1.000     1.000     1.000         4
       I-ORG      1.000     1.000     1.000         7

   micro avg      0.918     0.978     0.947        46
   macro avg      0.935     0.975     0.954        46
weighted avg      0.920     0.978     0.948        46



Recall error is in the line  "Да это Анастасия". 

Predicted labels are 'B-NAM', 'I-NAM', 'I-NAM' 

Ground truth is       'O', 'B-NAM', 'I-NAM'

But anyway algorithm recognised sentence as introduction thus result suits us. 

Precision errors come from pos not supposed to be in the class in real life conversations, so we filter them later without loss of useful information.

### Few words about metrics
I think it's important to mention.
While we want both precision and recall be as high as possible.

In precision-recall tradeoff for this task I choose recall.
It comes out of business logic, I can imagine here a manager control program which collects data and sends it to other managers whose job is to say how recorded behavior corresponds with company standards and more data won't hurt. 
While looking through, manager can mark unconfirmed data as false positive and we will know where model underperforms and improve it. 


Now let's try test model on some fresh data: 


In [27]:
test = [sent2features(s) for s in [
        [('здравствуйте', 'INTJ', 'B-GRE'), ('заказ', 'NOUN', 'O'), ('получили', "VERB", 'O')], 
        [('доброе','ADJF','B-GRE'), ('утро','NOUN','I-GRE'), ('меня','ADJ','O'), ('просили','VERB','O'), ('перезвонить','VERB','O')], 
        [('меня', 'ADJF', 'B-NAM'), ('зовут', 'VERB', 'I-NAM'), ('валентина', "NOUN", 'I-NAM'), ('андреевна', 'NOUN', 'I-NAM')],
        [('хорошо', 'ADJF', 'O'), ('поработали', 'VERB', 'O'), ('всего', "ADJF", 'B-FAR'), ('хорошего','NOUN','I-FAR')],
        [('компания', 'NOUN', 'B-ORG'), ('сбербанк', 'NOUN', 'I-ORG')],
        [('компания', 'NOUN', 'B-ORG'), ('самсунг', 'NOUN', 'I-ORG'), ('выиграла', 'VERB', 'O'), ('тендер', 'NOUN', 'O')],
        [('совещание', 'NOUN', 'O'), ('перенесено', 'NOUN', 'O'), ('на', 'PREP', 'O'), 
        ('понедельник', 'NOUN', 'O'), ('до', 'PREP', 'B-FAR'), ('свидания', 'NOUN', 'I-FAR')]]]

crf.predict(test)


[['B-GRE', 'O', 'O'],
 ['B-GRE', 'I-GRE', 'O', 'O', 'O'],
 ['B-NAM', 'I-NAM', 'I-NAM', 'I-NAM'],
 ['O', 'O', 'B-FAR', 'I-FAR'],
 ['B-ORG', 'I-ORG'],
 ['B-ORG', 'I-ORG', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'B-FAR', 'I-FAR']]

Algorithm recognised all tags

### Now we shall parse the results and sort collected data by tag. 

In [28]:
greet = rule(eq('GRE'))
greet = Parser(greet)

fare = rule(eq('FAR'))
fare = Parser(fare)

org = rule(eq('ORG'))
org = Parser(org)

intro = rule(eq('NAM'))
intro = Parser(intro)


In [29]:
name = []
list_ = []
greet_ = []
company = []
farewell = []
introduction = []
company_label = []

name_dlg = {}
fare_dict = {}
name_dict = {}
greet_dict = {} 
intro_dict = {}
company_dict = {}
company_label_dict = {}

df['name'] = False
df['greet'] = False
df['has_name'] = False
df['farewell'] = False
df['is_polite'] = False 
df['has_greet'] = False
df['introduction'] = False
df['has_farewell'] = False
df['company_label'] = False
df['has_introduction'] = False
df['has_company_label'] = False


#  We repeat same steps as in the Rule-Based notebook except we add few filters to the result.

for sentence, index, label, dial in zip(y_pred, df[df.role=='manager'].index.to_list(), df[df.role=='manager'].text.to_list(), df[df.role=='manager'].dlg_id.to_list()):
    for word, l in zip(sentence, label.split()):
        greet_ = list(greet.findall(word))
        intro_ = list(intro.findall(word))
        company_ = list(org.findall(word))
        farewell_ = list(fare.findall(word))
        if greet_:
            list_.append(greet_)
            df['has_greet'].iloc[index] = True
            if word[0] == "B": 
                greet_dict.update({index: l})
                company.append(l)
            else:
                greet_dict[index] += ' '+l
                company.extend([l])         
        elif intro_:
            list_.append(intro_) 
            introduction.append(l)
            if 'Name' in morph.parse(l)[0].tag:
                df['has_name'].iloc[index] = True
                name_dict.update({index: l})
                name.append(l)
                name_dlg.update({dial: l})
            if word[0] == "B": 
                intro_dict.update({index: l})
            else:
                intro_dict[index] += ' '+l                  
        elif company_:
            list_.append(company_)
            if word[0] == "B": 
                company_label_dict.update({index: l})
                company.append(l)
            else:
                company_label_dict[index] += ' '+l
                company.extend([l])
            df['has_company_label'].iloc[index] = True
        elif farewell_:
            list_.append(farewell_)           
            if 'anim' not in morph.parse(label)[0].tag:
                df['has_farewell'].iloc[index] = True
                if word[0] == "B": 
                    fare_dict.update({index: l})
                else:
                    fare_dict[index] += ' '+l
        else:
            list_.append(False)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [30]:
#  Let's put data from dicts to dataframe. 

column_name = [('greet', 'has_greet'), ('introduction', 'has_introduction'), 
               ('name', 'has_name'), ('company_label', 'has_company_label'), 
               ('farewell', 'has_farewell')]

dict_list = [greet_dict, intro_dict, name_dict, company_label_dict, fare_dict]

def dict2df(dictionary, name):   
    for key, value in dictionary.items():
        df[name[0]].iloc[key] = value
        df[name[1]].iloc[key] = True

        
[dict2df(dictionary, name) for (dictionary, name) in zip(dict_list, column_name)]    


[None, None, None, None, None]

In [31]:
#  And check which managers are polite. 

polite_dict = {}
polite_list = []

for i in df.dlg_id.unique():   
    if (True in df[(df.dlg_id==i) & df.has_greet].has_greet.to_list()) & (True in df[(df.dlg_id==i) & df.has_farewell].has_farewell.to_list()):
        df.is_polite[(df.dlg_id==i) & df.has_greet] = True
        df.is_polite[(df.dlg_id==i) & df.has_farewell] = True
        polite_dict.update({i: True})
        polite_list.append(f'Call #{i} manager {name_dlg.get(i).capitalize()} is polite')
    else: 
        polite_dict.update({i: False})
        if name_dlg.get(i) == None:
            polite_list.append(f'Call #{i} manager {name_dlg.get(i)} is NOT polite')
        else: 
            polite_list.append(f'Call #{i} manager {name_dlg.get(i).capitalize()} is NOT polite')
           

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [32]:
#  We can use df[df.is_polite] command to see greeting and farewell lines of polite managers.

df[df.is_polite].head()


Unnamed: 0,dlg_id,line_n,role,text,name,greet,has_name,farewell,is_polite,has_greet,introduction,has_farewell,company_label,has_introduction,has_company_label
1,0,1,manager,Алло здравствуйте,False,здравствуйте,False,False,True,True,False,False,False,False,False
108,0,108,manager,Всего хорошего до свидания,False,False,False,до свидания,True,False,False,True,False,False,False
110,1,1,manager,Алло здравствуйте,False,здравствуйте,False,False,True,True,False,False,False,False,False
162,1,53,manager,Угу да вижу я эту почту хорошо тогда исправлю ...,False,False,False,всего хорошего,True,False,False,True,False,False,False
163,1,54,manager,До свидания,False,False,False,До свидания,True,False,False,True,False,False,False


# The answers are: 

a) Извлекать реплики с приветствием – где менеджер поздоровался.

In [33]:
greet_dict


{1: 'здравствуйте',
 110: 'здравствуйте',
 166: 'здравствуйте',
 250: 'добрый день'}

In [34]:
#  1: 'здравствуйте',
#  110: 'здравствуйте',
#  166: 'здравствуйте',
#  250: 'добрый день'


b) Извлекать реплики, где менеджер представил себя. 

In [35]:
intro_dict


{3: 'Меня зовут ангелина',
 111: 'Меня зовут ангелина',
 167: 'Меня зовут ангелина',
 251: 'меня максим зовут',
 338: 'Да это анастасия'}

In [36]:
#  3: 'Меня зовут ангелина',
#  111: 'Меня зовут ангелина',
#  167: 'Меня зовут ангелина',
#  251: 'меня максим зовут',
#  338: 'Да это анастасия'


c) Извлекать имя менеджера. 

In [37]:
name_dict


{3: 'ангелина',
 111: 'ангелина',
 167: 'ангелина',
 251: 'максим',
 338: 'анастасия'}

In [38]:
#  3: 'ангелина',
#  111: 'ангелина',
#  167: 'ангелина',
#  251: 'максим',
#  338: 'анастасия'


d) Извлекать название компании.

In [39]:
company_label_dict 


{3: 'компания диджитал бизнес',
 111: 'компания диджитал бизнес',
 167: 'компания диджитал бизнес',
 251: 'компания китобизнес'}

In [40]:
#  3: 'компания диджитал бизнес',
#  111: 'компания диджитал бизнес',
#  167: 'компания диджитал бизнес',
#  251: 'компания китобизнес'


e) Извлекать реплики, где менеджер попрощался.

In [41]:
fare_dict


{108: 'до свидания',
 162: 'всего хорошего',
 163: 'До свидания',
 300: 'всего доброго',
 335: 'до свидания',
 479: 'хорошего вечера'}

In [42]:
#  108: 'до свидания',
#  162: 'всего хорошего',
#  163: 'До свидания',
#  300: 'всего доброго',
#  335: 'до свидания',
#  479: 'хорошего вечера'


f) Проверять требование к менеджеру: «В каждом диалоге обязательно необходимо поздороваться и попрощаться с клиентом»

In [43]:
polite_list


['Call #0 manager Ангелина is polite',
 'Call #1 manager Ангелина is polite',
 'Call #2 manager Ангелина is NOT polite',
 'Call #3 manager Максим is polite',
 'Call #4 manager None is NOT polite',
 'Call #5 manager Анастасия is NOT polite']

In [44]:
#  'Call #0 manager Ангелина is polite',
#  'Call #1 manager Ангелина is polite',
#  'Call #2 manager Ангелина is NOT polite',
#  'Call #3 manager Максим is polite',
#  'Call #4 manager None is NOT polite',
#  'Call #5 manager Анастасия is NOT polite'


# Seems like we found all data we need. Hooray! 

### For convenience we store all data in pandas data frame 

In [45]:
df.head()


Unnamed: 0,dlg_id,line_n,role,text,name,greet,has_name,farewell,is_polite,has_greet,introduction,has_farewell,company_label,has_introduction,has_company_label
0,0,0,client,Алло,False,False,False,False,False,False,False,False,False,False,False
1,0,1,manager,Алло здравствуйте,False,здравствуйте,False,False,True,True,False,False,False,False,False
2,0,2,client,Добрый день,False,False,False,False,False,False,False,False,False,False,False
3,0,3,manager,Меня зовут ангелина компания диджитал бизнес з...,ангелина,False,True,False,False,False,Меня зовут ангелина,False,компания диджитал бизнес,True,True
4,0,4,client,Ага,False,False,False,False,False,False,False,False,False,False,False


# Pros, Cons and Afterthoughts

Implementation of CRF method is somewhat very close to Rule-Based method we used before.

The difference is in Rule-Based method we explicitly set rules in which order which word to collect.

While in CRF we show different examples to the model and it finds out rules itself based on probability.

Maybe we could use something that would look into words' actual context?

Yes we can, try my transformer model at huggingface.co 


# https://huggingface.co/OlegOrwell/LaBSE_ner_manager
### Jump in! 