In [1]:
import re
from itertools import chain
from tqdm import tqdm
import numpy as np

DICT = []

STATE_WORD = 0
STATE_TYPE = 1
STATE_MEANING_START = 2

class DictProcessor():
    def __init__(self):
        self.word = ""
        self.meaning = ""
        
        self.state = STATE_WORD
        
        self.dict = []

    def update(self, line):
        if line == "":
            self.go(STATE_WORD)
            return    
        elif re.match(r"^\s*\d+\.(.*)", line):
            m = re.match(r"^\s*\d+\.(.*)", line)
            line = m.group(1)
            self.go(STATE_TYPE)
        elif re.match(r"^\s*\d+\)", line):   
            m = re.match(r"^\s*\d+\)(.*)", line)
            line = m.group(1)
            self.go(STATE_MEANING_START)
        
        self._process(line)
        
    def _process(self, line):
        if self.state == STATE_WORD:
            self.word = line
            self.go(STATE_TYPE)
        elif self.state == STATE_TYPE:
            pass
            self.go(STATE_MEANING_START)
        elif self.state == STATE_MEANING_START:
            self.meaning += " " + line.strip()
    
    def _publish(self):
        self.meaning = self.meaning.strip()
        if self.word != "" and self.meaning != "" and len(self.word) >= 3:
            self.dict.append((self.word, self.meaning))
            
            #print("!!!| [%s] - [%s]" % (self.word, self.meaning))        
        self.meaning = ""                        
    
    def go(self, new_state):
        if self.state == STATE_MEANING_START:
            self._publish()
        
        self.state = new_state        

processor = DictProcessor()        
        
with open("efremova.txt") as fin:
    row = -1
    
    state = STATE_WORD
    
    word = ""
    for line in fin:
        row += 1
        
        if row < 3:
            continue
            
        if row > 100 and False:
            break
        
        line = line.strip()                
        
        #print("%d: %s" % (processor.state, line))
        
        processor.update(line)                        

In [2]:
print(len(processor.dict))

200438


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
words_tr = TfidfVectorizer(analyzer='char', ngram_range=(2,4))
f_words = words_tr.fit_transform((w_m[0] for w_m in processor.dict))

In [5]:
meaning_tr = TfidfVectorizer()
f_meaning = meaning_tr.fit_transform((w_m[1] for w_m in processor.dict))

In [6]:
print(f_meaning[2, :])

  (0, 106004)	0.132571711899
  (0, 24984)	0.125008011772
  (0, 62522)	0.144527232452
  (0, 35912)	0.0991084534657
  (0, 113747)	0.1523103571
  (0, 58000)	0.210757796084
  (0, 17469)	0.174762436976
  (0, 58987)	0.196088421603
  (0, 9507)	0.159402648552
  (0, 127830)	0.14003835089
  (0, 50968)	0.13680143011
  (0, 107135)	0.123325436002
  (0, 31478)	0.18967221548
  (0, 33430)	0.15654078812
  (0, 115357)	0.158933159225
  (0, 8814)	0.158745790771
  (0, 102857)	0.127035558528
  (0, 116050)	0.195158471063
  (0, 35046)	0.323022333065
  (0, 25535)	0.0820956811163
  (0, 42898)	0.0967201781281
  (0, 52965)	0.18884296145
  (0, 33525)	0.154812590315
  (0, 113013)	0.161405639527
  (0, 86939)	0.180879056573
  (0, 6389)	0.219600805085
  (0, 127950)	0.240150367708
  (0, 27818)	0.245994313375
  (0, 50966)	0.201360106812
  (0, 33526)	0.15488881518
  (0, 113010)	0.167074915993
  (0, 60511)	0.240150367708


In [21]:
BAD_W = 1

from scipy.sparse import vstack, hstack
from random import randint

X = []
Y = []

for i in tqdm(range(f_words.shape[0] // 1)):
    f_w = f_words[i:i+1, :]
    f_m = f_meaning[i:i+1, :]
        
    Y.append(1)
    X.append(hstack([f_w, f_m]))
    for j in range(BAD_W):        
        rand_i = randint(0, f_words.shape[0] - 1)
        if rand_i == i:
            continue
        Y.append(0)    
        X.append(hstack([f_w, f_meaning[rand_i:rand_i+1, :]]))
        
    #break    
X = vstack(X)    
Y = np.array(Y)

100%|██████████| 200438/200438 [02:49<00:00, 1184.94it/s]


In [22]:
print(X.shape)
print(Y.shape)
print(f_words.shape)
print(f_meaning.shape)

(400875, 198144)
(400875,)
(200438, 65623)
(200438, 132521)


In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

In [24]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

In [25]:
t = "svm"
#t = "forest"

if t == "svm":
    svm = LinearSVC()
    clf = CalibratedClassifierCV(svm) 
if t == "forest":
    clf = RandomForestClassifier(200)
    
clf.fit(X_train, y_train)

y_proba = clf.predict_proba(X_test)

In [26]:
from sklearn.metrics import roc_auc_score
print(y_proba)
print(y_test)
print(roc_auc_score(y_test, y_proba[:, 1]))

[[ 0.48125201  0.51874799]
 [ 0.34268572  0.65731428]
 [ 0.50400647  0.49599353]
 ..., 
 [ 0.3466567   0.6533433 ]
 [ 0.83588198  0.16411802]
 [ 0.46138521  0.53861479]]
[1 1 1 ..., 1 0 1]
0.758785085511


In [28]:
#0.9 0.2 0.4 0.5
#1 1 0 0

diff = np.abs(y_proba[:, 1] - y_test)
diff_order = np.argsort(diff)[::-1] #diff[diff_order] - decreasing

N = 10

#print(diff[diff_order])
print(diff[diff_order[:N]])
print(y_test[diff_order[:N]])

docs_nums = diff_order[:N] // (1 + BAD_W)
print(docs_nums)

for i in docs_nums:
    print(processor.dict[i])

[ 0.83051754  0.81638499  0.80956265  0.80938154  0.80700392  0.8060262
  0.79909287  0.79485564  0.79400914  0.79321944]
[1 0 1 0 1 1 1 0 1 1]
[11687 12738  4131 15571  8492  1039  5531 15758 15671 11643]
('валиться', 'перен. разг. Доставаться, выпадать кому-л. в большом количестве (о горе, заботах, обидах и т.п.).')
('велюр', 'Драп или фетр с коротким, очень густым и мягким ворсом.')
('аэростатный', 'Свойственный аэростату, характерный для него.')
('вмораживать', 'Прочно закреплять что-л. в массе снега, льда, давая замерзнуть.')
('близнец', 'см. близнецы.')
('ажитация', 'Состояние по знач. глаг.: ажитироваться.')
('бегло', 'Быстро, свободно, без затруднений.')
('внутриконтинентальный', 'Находящийся внутри континента.')
('внеурочный', 'Происходящий, производящийся помимо или сверх положенного, установленного времени; сверхурочный.')
('валенки', 'Зимние теплые сапоги, свалянные из шерсти.')
