In [3]:
import pandas as pd
import numpy as np
import matplotlib as mp
import seaborn as sns
%matplotlib inline

# Let wide frames render up to 100 columns before pandas truncates them.
pd.options.display.max_columns = 100
# Default size for every matplotlib figure drawn in this notebook.
mp.rcParams['figure.figsize'] = (20, 15)

Описание POS-тегов (частей речи)

Number | Tag | Description
:--- | :--- | :---
1.	| CC | Coordinating conjunction
2.	| CD | Cardinal number
3.	| DT | Determiner
4.	| EX | Existential there
5.	|FW	|Foreign word
6.	|IN	|Preposition or subordinating conjunction
7.	|JJ	|Adjective
8.	|JJR |	Adjective, comparative
9.	|JJS|	Adjective, superlative
10.	|LS	|List item marker
11.	|MD	|Modal
12.	|NN	|Noun, singular or mass
13.	|NNS|	Noun, plural
14.	|NNP|	Proper noun, singular
15.	|NNPS|	Proper noun, plural
16.	|PDT|	Predeterminer
17.	|POS|	Possessive ending
18.	|PRP|	Personal pronoun
19.	|PRP\$|	Possessive pronoun
20.	|RB	|Adverb
21.	|RBR|	Adverb, comparative
22.	|RBS|	Adverb, superlative
23.	|RP	|Particle
24.	|SYM|	Symbol
25.	|TO	|to
26.	|UH	|Interjection
27.	|VB	|Verb, base form
28.	|VBD|	Verb, past tense
29.	|VBG|	Verb, gerund or present participle
30.	|VBN|	Verb, past participle
31.	|VBP|	Verb, non-3rd person singular present
32.	|VBZ|	Verb, 3rd person singular present
33.	|WDT|	Wh-determiner
34.	|WP	|Wh-pronoun
35.	|WP\$|	Possessive wh-pronoun
36.	|WRB|	Wh-adverb

Описание параметров

Колонка | Значение
:---: | :---
POS | форма слова, участвующая в тесте
word | слово
expected | лемма слова
real | результат лемматизации
word_operator | тестируемое средство
result | совпадает ли результат с ожидаемым

In [4]:
# Load the test results from the CSV file in the working directory.
# NOTE(review): the file has an unnamed leading column (it shows up as
# 'Unnamed: 0' below), presumably a saved index - confirm before relying on it.
x = pd.read_csv(filepath_or_buffer='test_all.csv')

In [5]:
# Peek at the first five rows to check the loaded columns.
x.iloc[:5]

Unnamed: 0,POS,word,expected,real,word_operator,result
0,NN,1-dodecanol,1-dodecanol,1-dodecanol,NLTK_lemmatizer,1
1,NN,1-hitter,1-hitter,1-hitter,NLTK_lemmatizer,1
2,JJ,1000th,1000th,1000th,NLTK_lemmatizer,1
3,JJ,100th,100th,100th,NLTK_lemmatizer,1
4,JJ,10th,10th,10th,NLTK_lemmatizer,1


In [6]:
# List the distinct tools present in the `word_operator` column.
pd.unique(x['word_operator'])

array(['NLTK_lemmatizer', 'PHPMorphy_inflector', 'SimpleNLG_inflector',
       'combined_tool', 'dictionary_lemmatizer',
       'en_inflectors_inflector', 'ruby_lemmatizer', 'skyeng_lemmatizer',
       'word_operators_inflector'], dtype=object)

In [7]:
# Non-null row counts per POS, divided by the number of tools under test.
# The divisor is derived from the data instead of being hard-coded as 9, so the
# cell stays correct if a tool is added to or removed from the CSV.
# NOTE(review): this equals "words per POS" only if every word was run through
# every tool; fractional values appear where a column has missing entries,
# because count() skips them.
x.groupby('POS').count() / x['word_operator'].nunique()

Unnamed: 0_level_0,word,expected,real,word_operator,result
POS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
JJ,55363.0,55363.0,55299.666667,55364.0,55364.0
JJR,2461.0,2461.0,2455.555556,2461.0,2461.0
JJS,2115.0,2115.0,2112.555556,2115.0,2115.0
NN,78873.0,78873.0,78529.555556,78875.0,78875.0
NNS,71774.0,71772.0,71686.888889,71774.0,71774.0
VBD,8349.0,8349.0,8331.777778,8349.0,8349.0
VBG,8344.0,8344.0,8274.777778,8344.0,8344.0
VBN,8348.0,8348.0,8331.333333,8348.0,8348.0
VBP,8573.0,8573.0,8564.888889,8573.0,8573.0
VBZ,8356.0,8356.0,8329.444444,8356.0,8356.0


Посчитаем средние показатели для каждого средства

In [8]:
# Mean of the `result` flag for each tool, i.e. its share of matching answers.
x['result'].groupby(x['word_operator']).mean()

word_operator
NLTK_lemmatizer             0.834411
PHPMorphy_inflector         0.887812
SimpleNLG_inflector         0.603190
combined_tool               0.969263
dictionary_lemmatizer       0.995122
en_inflectors_inflector     0.843831
ruby_lemmatizer             0.383087
skyeng_lemmatizer           0.840505
word_operators_inflector    0.928967
Name: result, dtype: float64

Сделаем удобную таблицу результатов для анализа пар слов, с которыми не справляются средства

In [9]:
# Cross-tabulate (POS, expected, word) rows against (result, tool) columns and
# keep only the result == 1 part: per-tool counts of matching answers.
exact_word = pd.crosstab(
    index=[x['POS'], x['expected'], x['word']],
    columns=[x['result'], x['word_operator']],
)[1]
exact_word

Unnamed: 0_level_0,Unnamed: 1_level_0,word_operator,NLTK_lemmatizer,PHPMorphy_inflector,SimpleNLG_inflector,combined_tool,dictionary_lemmatizer,en_inflectors_inflector,ruby_lemmatizer,skyeng_lemmatizer,word_operators_inflector
POS,expected,word,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
JJ,1000th,1000th,1,1,1,1,1,1,0,1,1
JJ,100th,100th,1,1,1,1,1,1,0,1,1
JJ,10th,10th,1,1,1,1,1,1,0,1,1
JJ,11th,11th,1,1,1,1,1,1,0,1,1
JJ,12th,12th,1,1,1,1,1,1,0,1,1
JJ,13th,13th,1,1,1,1,1,1,0,1,1
JJ,14th,14th,1,1,1,1,1,1,0,1,1
JJ,15th,15th,1,1,1,1,1,1,0,1,1
JJ,16th,16th,1,1,1,1,1,1,0,1,1
JJ,17th,17th,1,1,1,1,1,1,0,1,1


Для взятия нужных строк/столбцов в такой структуре необходимо использовать мультииндекс

In [10]:
# IndexSlice builds per-level slices for the three-level row MultiIndex.
idx = pd.IndexSlice
# Rows: every (expected, word) pair whose POS is VBD or VBN.
# Columns: given as a list, because pandas documents a tuple as ONE multi-level
# key and a list as several keys; the columns here are a flat index, so the
# list states the intent (two separate tools) unambiguously.
exact_word.loc[idx[['VBD', 'VBN'], :, :], ['NLTK_lemmatizer', 'PHPMorphy_inflector']]

Unnamed: 0_level_0,Unnamed: 1_level_0,word_operator,NLTK_lemmatizer,PHPMorphy_inflector
POS,expected,word,Unnamed: 3_level_1,Unnamed: 4_level_1
VBD,abandon,abandoned,1,1
VBD,abase,abased,1,1
VBD,abash,abashed,1,1
VBD,abate,abated,1,1
VBD,abbreviate,abbreviated,1,1
VBD,abdicate,abdicated,1,1
VBD,abduce,abduced,1,1
VBD,abduct,abducted,1,1
VBD,aberrate,aberrated,1,1
VBD,abet,abetted,1,1


Распрямим таблицу для выборки по столбцам

In [11]:
# Turn the POS / expected / word index levels back into ordinary columns
# (drop=False keeps them) so rows can be filtered by column values.
plane_exact_word = exact_word.reset_index(drop=False)
plane_exact_word

word_operator,POS,expected,word,NLTK_lemmatizer,PHPMorphy_inflector,SimpleNLG_inflector,combined_tool,dictionary_lemmatizer,en_inflectors_inflector,ruby_lemmatizer,skyeng_lemmatizer,word_operators_inflector
0,JJ,1000th,1000th,1,1,1,1,1,1,0,1,1
1,JJ,100th,100th,1,1,1,1,1,1,0,1,1
2,JJ,10th,10th,1,1,1,1,1,1,0,1,1
3,JJ,11th,11th,1,1,1,1,1,1,0,1,1
4,JJ,12th,12th,1,1,1,1,1,1,0,1,1
5,JJ,13th,13th,1,1,1,1,1,1,0,1,1
6,JJ,14th,14th,1,1,1,1,1,1,0,1,1
7,JJ,15th,15th,1,1,1,1,1,1,0,1,1
8,JJ,16th,16th,1,1,1,1,1,1,0,1,1
9,JJ,17th,17th,1,1,1,1,1,1,0,1,1


In [12]:
# Number of distinct 1-character word endings.
plane_exact_word['word'].str.slice(start=-1).nunique()

42

In [13]:
# Number of distinct 2-character word endings.
plane_exact_word['word'].str.slice(start=-2).nunique()

569

In [14]:
# Number of distinct 3-character word endings.
plane_exact_word['word'].str.slice(start=-3).nunique()

4230

In [15]:
# Number of distinct 4-character word endings.
plane_exact_word['word'].str.slice(start=-4).nunique()

18756

In [16]:
# Number of distinct 5-character word endings.
plane_exact_word['word'].str.slice(start=-5).nunique()

49947

Для взятия суффикса у нужного столбца используем срез строки: `.str[-3:]` возвращает последние три символа каждого слова

In [17]:
# Last three characters of every word (shorter words are returned whole).
plane_exact_word['word'].str.slice(start=-3)

0         0th
1         0th
2         0th
3         1th
4         2th
5         3th
6         4th
7         5th
8         6th
9         7th
10        8th
11        9th
12        1st
13        0th
14        1th
15        2th
16        3rd
17        4th
18        5th
19        6th
20        7th
21        8th
22        9th
23         2d
24        2nd
25        3-D
26        0th
27        3rd
28        4-H
29        0th
         ... 
252524    aks
252525    ers
252526    nks
252527    aps
252528    rns
252529    aws
252530    wns
252531    wps
252532    ans
252533    rns
252534    lls
252535    ows
252536    lps
252537    ses
252538    lds
252539    ips
252540    els
252541    kes
252542    wls
252543    cks
252544    uks
252545    aps
252546    ros
252547    ags
252548    ncs
252549    ngs
252550    ips
252551    ers
252552    nes
252553    oms
Name: word, Length: 252554, dtype: object

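Добавим столбец с суффиксом слова, посчитаем для каждой пары (POS, суффикс) число верных ответов каждого средства и сохраним получившиеся правила в csv-файл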

In [18]:
# Store each word's last three characters in a new `suff` column; it is the
# grouping key for the per-suffix statistics.
plane_exact_word['suff'] = plane_exact_word['word'].str.slice(start=-3)

In [19]:
# Sum the per-word counts of six tools within every (POS, suffix) group.
# combined_tool, dictionary_lemmatizer and ruby_lemmatizer are not selected.
suff_sum = (
    plane_exact_word[[
        'POS',
        'suff',
        'NLTK_lemmatizer',
        'PHPMorphy_inflector',
        'SimpleNLG_inflector',
        'en_inflectors_inflector',
        'skyeng_lemmatizer',
        'word_operators_inflector',
    ]]
    .groupby(['POS', 'suff'])
    .sum()
)

In [20]:
# Display the per-(POS, suffix) sums computed in the previous cell.
suff_sum

Unnamed: 0_level_0,word_operator,NLTK_lemmatizer,PHPMorphy_inflector,SimpleNLG_inflector,en_inflectors_inflector,skyeng_lemmatizer,word_operators_inflector
POS,suff,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
JJ,-Ju,1,0,1,1,1,1
JJ,-OK,1,1,1,1,1,1
JJ,-am,1,1,1,1,1,1
JJ,-be,2,2,2,2,2,2
JJ,-by,2,2,2,2,2,2
JJ,-da,1,0,1,1,1,1
JJ,-di,1,0,1,1,1,1
JJ,-do,2,2,2,2,2,2
JJ,-ed,1,1,1,1,1,1
JJ,-fi,1,1,1,1,1,1


In [21]:
# For every (POS, suffix) row, the label of the column holding the largest sum
# (idxmax reports the first such column when several tie).
max_lemmatizer = suff_sum.idxmax(axis='columns')

In [22]:
def _rank_columns_desc(scores):
    """Return, for every row of ``scores``, its column labels ordered best-first.

    The result has the same index and column labels as ``scores``.  The column
    labels are kept only for positional compatibility: the k-th column holds the
    label of the k-th highest value in that row, not data about the tool named
    in the header.  Ties keep the original left-to-right column order.

    Implemented with a vectorised stable argsort rather than a row-wise
    ``DataFrame.apply`` returning lists: that form yields a DataFrame only on
    pandas < 0.23 (later versions return a Series of lists) and is slow on
    large frames.
    """
    # Negating turns the ascending stable sort into a descending one while
    # equal values stay in their original column order.
    order = np.argsort(-scores.values, axis=1, kind='mergesort')
    # Fancy-index the label array with the per-row orderings.
    ranked_labels = scores.columns.values[order]
    return pd.DataFrame(ranked_labels, index=scores.index, columns=scores.columns)


top_list = _rank_columns_desc(suff_sum)

In [23]:
# Display the ranking: each row lists tool names from highest to lowest sum.
# The column headers are inherited from suff_sum and act only as positions.
top_list

Unnamed: 0_level_0,word_operator,NLTK_lemmatizer,PHPMorphy_inflector,SimpleNLG_inflector,en_inflectors_inflector,skyeng_lemmatizer,word_operators_inflector
POS,suff,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
JJ,-Ju,NLTK_lemmatizer,SimpleNLG_inflector,en_inflectors_inflector,skyeng_lemmatizer,word_operators_inflector,PHPMorphy_inflector
JJ,-OK,NLTK_lemmatizer,PHPMorphy_inflector,SimpleNLG_inflector,en_inflectors_inflector,skyeng_lemmatizer,word_operators_inflector
JJ,-am,NLTK_lemmatizer,PHPMorphy_inflector,SimpleNLG_inflector,en_inflectors_inflector,skyeng_lemmatizer,word_operators_inflector
JJ,-be,NLTK_lemmatizer,PHPMorphy_inflector,SimpleNLG_inflector,en_inflectors_inflector,skyeng_lemmatizer,word_operators_inflector
JJ,-by,NLTK_lemmatizer,PHPMorphy_inflector,SimpleNLG_inflector,en_inflectors_inflector,skyeng_lemmatizer,word_operators_inflector
JJ,-da,NLTK_lemmatizer,SimpleNLG_inflector,en_inflectors_inflector,skyeng_lemmatizer,word_operators_inflector,PHPMorphy_inflector
JJ,-di,NLTK_lemmatizer,SimpleNLG_inflector,en_inflectors_inflector,skyeng_lemmatizer,word_operators_inflector,PHPMorphy_inflector
JJ,-do,NLTK_lemmatizer,PHPMorphy_inflector,SimpleNLG_inflector,en_inflectors_inflector,skyeng_lemmatizer,word_operators_inflector
JJ,-ed,NLTK_lemmatizer,PHPMorphy_inflector,SimpleNLG_inflector,en_inflectors_inflector,skyeng_lemmatizer,word_operators_inflector
JJ,-fi,NLTK_lemmatizer,PHPMorphy_inflector,SimpleNLG_inflector,en_inflectors_inflector,skyeng_lemmatizer,word_operators_inflector


In [24]:
# Write the per-(POS, suffix) tool ranking to a CSV file in the working
# directory (the file name has no extension).
# NOTE(review): presumably this file is what combined_tool reads - confirm.
top_list.to_csv('combined_tool_POS_suffix_rules')

In [26]:
# Drop the text columns that are not needed for the accuracy table.
y = x.drop(['word', 'expected', 'real'], axis='columns')
# Mean `result` for every (tool, POS) pair: tools as rows, POS tags as columns.
res = y.pivot_table(
    values='result',
    index='word_operator',
    columns='POS',
    aggfunc='mean',
)
# Keep the four tools being compared, then flip so POS tags become the rows.
res.loc[[
    'PHPMorphy_inflector',
    'word_operators_inflector',
    'combined_tool',
    'dictionary_lemmatizer',
]].T

word_operator,PHPMorphy_inflector,word_operators_inflector,combined_tool,dictionary_lemmatizer
POS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
JJ,0.793151,0.998718,0.998663,0.999982
JJR,0.709061,0.779358,0.605851,0.905323
JJS,0.803783,0.791962,0.804255,0.892671
NN,0.887252,0.999962,0.999924,1.0
NNS,0.924569,0.931075,0.961226,0.991306
VBD,0.961792,0.545215,0.653851,0.99461
VBG,0.966683,0.549976,0.966683,0.994487
VBN,0.962746,0.544562,0.967657,0.994729
VBP,0.987286,0.9993,0.99965,0.999767
VBZ,0.948899,0.931067,0.991024,0.998803
