In [1]:
import numpy as np
import pandas as pd

import pickle

In [2]:
hfreq_threshold, lfreq_threshold = 7.8, 1.4  # was: 300, 10

In [3]:
data = pd.read_csv('rusfreq/freqrnc2011.csv', sep = '\t')

In [4]:
hfreq = data[data['Freq(ipm)'] > hfreq_threshold]
mfreq = data[(data['Freq(ipm)'] >= lfreq_threshold) & (data['Freq(ipm)'] <= hfreq_threshold)]
lfreq = data[data['Freq(ipm)'] < lfreq_threshold]

In [5]:
print(hfreq.shape[0], mfreq.shape[0], lfreq.shape[0])

10296 20497 21345


In [6]:
hfreq_words = set(hfreq['Lemma'])
mfreq_words = set(mfreq['Lemma'])
lfreq_words = set(lfreq['Lemma'])

In [7]:
print(len(hfreq_words), len(mfreq_words), len(lfreq_words))

10186 20456 21305


In [8]:
print(len(hfreq_words.intersection(mfreq_words)), len(hfreq_words.intersection(lfreq_words)), len(mfreq_words.intersection(lfreq_words)))

93 54 73


In [9]:
uhfreq = hfreq_words - mfreq_words - lfreq_words
umfreq = mfreq_words - lfreq_words - hfreq_words
ulfreq = lfreq_words - hfreq_words - mfreq_words

In [10]:
with open('rus/freq/high_freq.pkl', 'wb') as f:
    pickle.dump(file=f, obj=uhfreq)
    
with open('rus/freq/medium_freq.pkl', 'wb') as f:
    pickle.dump(file=f, obj=umfreq)
    
with open('rus/freq/low_freq.pkl', 'wb') as f:
    pickle.dump(file=f, obj=ulfreq)

In [11]:
with open('rus/freq/high_freq.pkl', 'rb') as f:
    uhfreq = pickle.load(file=f)
    
with open('rus/freq/medium_freq.pkl', 'rb') as f:
    umfreq = pickle.load(file=f)
    
with open('rus/freq/low_freq.pkl', 'rb') as f:
    ulfreq = pickle.load(file=f)

In [12]:
import gensim

from gensim.models import word2vec
from gensim.models import KeyedVectors

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


In [13]:
ozh = pd.read_csv('OZHEGOV.TXT', sep='|', encoding='cp1251')

In [14]:
vocab = set(ozh['VOCAB'])
print(len(vocab))

38955


In [15]:
w2v = gensim.models.KeyedVectors.load_word2vec_format('w2v_models/all_norm-sz500-w10-cb0-it3-min5.w2v', binary=True, unicode_errors='ignore')

In [16]:
train_words, test_words = [], []

for word in vocab:
    try:
        emb = w2v.get_vector(word)
        train_words.append(word)
    except KeyError:
        test_words.append(word)
        
print(len(train_words), len(test_words))

37746 1209


In [17]:
%%time

train_wde = []   # word, definition, embedding
test_wd = []    # word, definition

for word in train_words:
    defin = ozh[ozh['VOCAB'] == word].reset_index().iloc[0]['DEF']
    emb = w2v.get_vector(word)
    train_wde.append((word, defin, emb))
    
for word in test_words:
    defin = ozh[ozh['VOCAB'] == word].reset_index().iloc[0]['DEF']
    test_wd.append((word, defin))

CPU times: user 1min 52s, sys: 31.8 ms, total: 1min 52s
Wall time: 1min 52s


In [18]:
ozhegov_emb = pd.DataFrame(train_wde, columns=['word', 'definition', 'embedding'])
ozhegov_no_emb = pd.DataFrame(test_wd, columns=['word', 'definition'])

assert set(ozhegov_emb['word']).intersection(set(ozhegov_no_emb['word'])) == set()
print(ozhegov_emb.shape[0], ozhegov_no_emb.shape[0])

37746 1209


In [19]:
ozhegov_emb['definition'].apply(lambda x: '<=' in str(x)).value_counts()

False    33757
True      3989
Name: definition, dtype: int64

In [20]:
tmp = ozhegov_emb[ozhegov_emb['definition'].apply(lambda x: '<=' in str(x))]

In [21]:
ozhegov_emb[ozhegov_emb['word'] == 'лог']['definition'].iloc[0]

'широкий и длинный овраг'

In [22]:
def find_embedding(defin):
    if ('<=' in str(defin)) or ('==' in str(defin)):
        word = str(defin)
        if ((word[0] == '<') or (word[0] == '=')) and (word[1] == '='):
            word = word[3:]
            res = ''
            for letter in word:
                if letter.isalpha():
                    res += letter
                else:
                    break
            # find word
            if (ozhegov_emb[ozhegov_emb['word'] == res].shape[0] == 1):
                return ozhegov_emb[ozhegov_emb['word'] == res]['definition'].iloc[0]
            elif (ozhegov_no_emb[ozhegov_no_emb['word'] == res].shape[0] == 1):
                return ozhegov_no_emb[ozhegov_no_emb['word'] == res]['definition'].iloc[0]
            else:
                return str(defin)
    return str(defin)

In [23]:
ozhegov_emb_new = ozhegov_emb['definition'].apply(lambda x: find_embedding(x))
ozhegov_no_emb_new = ozhegov_no_emb['definition'].apply(lambda x: find_embedding(x))

In [24]:
(ozhegov_emb_new == ozhegov_emb['definition']).value_counts()

True     31909
False     5837
Name: definition, dtype: int64

In [25]:
ozhegov_emb['definition'] = ozhegov_emb_new
ozhegov_no_emb['definition'] = ozhegov_no_emb_new

In [26]:
ozhegov_emb = ozhegov_emb[ozhegov_emb['definition'].apply(lambda defin: len(defin) > 3)]
ozhegov_no_emb = ozhegov_no_emb[ozhegov_no_emb['definition'].apply(lambda defin: len(defin) > 3)]

In [27]:
bert_mfreq = ozhegov_emb[ozhegov_emb.apply(lambda x: True if (x['word'] in umfreq) else False, axis = 1)]
bert_lfreq = ozhegov_emb[ozhegov_emb.apply(lambda x: True if (x['word'] in ulfreq) else False, axis = 1)]

assert set(bert_mfreq['word']).intersection(set(bert_lfreq['word'])) == set()
print(bert_mfreq.shape, bert_lfreq.shape)

(11577, 3) (8448, 3)


In [28]:
ozhegov_emb['embedding'] = ozhegov_emb['embedding'].apply(lambda x: x.tolist())
bert_mfreq['embedding'] = bert_mfreq['embedding'].apply(lambda x: x.tolist())
bert_lfreq['embedding'] = bert_lfreq['embedding'].apply(lambda x: x.tolist())

ozhegov_emb.to_csv('rus/ozhegov/ozhegov_emb.csv', index=None)
ozhegov_no_emb.to_csv('rus/ozhegov/ozhegov_no_emb.csv', index=None)

bert_mfreq.to_csv('rus/bert/bert_mfreq.csv', index=None)
bert_lfreq.to_csv('rus/bert/bert_lfreq.csv', index=None)

bert_train = bert_mfreq[:(bert_mfreq.shape[0] * 4 // 5)]
bert_valid = bert_mfreq[(bert_mfreq.shape[0] * 4 // 5):]

bert_train.to_csv('rus/bert/bert_train.csv')
bert_valid.to_csv('rus/bert/bert_valid.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
