In [1]:
import numpy as np
import pandas as pd

import pickle

In [2]:
hfreq_threshold, lfreq_threshold = 300, 10

In [3]:
data = pd.read_csv('rusfreq/freqrnc2011.csv', sep = '\t')

In [4]:
hfreq = data[data['Freq(ipm)'] > hfreq_threshold]
mfreq = data[(data['Freq(ipm)'] >= lfreq_threshold) & (data['Freq(ipm)'] <= hfreq_threshold)]
lfreq = data[data['Freq(ipm)'] < lfreq_threshold]

In [5]:
print(hfreq.shape[0], mfreq.shape[0], lfreq.shape[0])

340 8421 43377


In [6]:
hfreq_words = set(hfreq['Lemma'])
mfreq_words = set(mfreq['Lemma'])
lfreq_words = set(lfreq['Lemma'])

In [7]:
print(len(hfreq_words), len(mfreq_words), len(lfreq_words))

331 8361 43208


In [8]:
print(len(hfreq_words.intersection(mfreq_words)), len(hfreq_words.intersection(lfreq_words)), len(mfreq_words.intersection(lfreq_words)))

30 27 115


In [9]:
uhfreq = hfreq_words - mfreq_words - lfreq_words
umfreq = mfreq_words - lfreq_words - hfreq_words
ulfreq = lfreq_words - hfreq_words - mfreq_words

In [10]:
with open('rus/freq/high_freq.pkl', 'wb') as f:
    pickle.dump(file=f, obj=uhfreq)
    
with open('rus/freq/medium_freq.pkl', 'wb') as f:
    pickle.dump(file=f, obj=umfreq)
    
with open('rus/freq/low_freq.pkl', 'wb') as f:
    pickle.dump(file=f, obj=ulfreq)

In [11]:
with open('rus/freq/high_freq.pkl', 'rb') as f:
    uhfreq = pickle.load(file=f)
    
with open('rus/freq/medium_freq.pkl', 'rb') as f:
    umfreq = pickle.load(file=f)
    
with open('rus/freq/low_freq.pkl', 'rb') as f:
    ulfreq = pickle.load(file=f)

In [12]:
import gensim

from gensim.models import word2vec
from gensim.models import KeyedVectors

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


In [13]:
ozh = pd.read_csv('OZHEGOV.TXT', sep='|', encoding='cp1251')

In [14]:
vocab = set(ozh['VOCAB'])
print(len(vocab))

38955


In [15]:
w2v = gensim.models.KeyedVectors.load_word2vec_format('w2v_models/all_norm-sz500-w10-cb0-it3-min5.w2v', binary=True, unicode_errors='ignore')

In [16]:
train_words, test_words = [], []

for word in vocab:
    try:
        emb = w2v.get_vector(word)
        train_words.append(word)
    except KeyError:
        test_words.append(word)
        
print(len(train_words), len(test_words))

37746 1209


In [17]:
%%time

train_wde = []   # word, definition, embedding
test_wd = []    # word, definition

for word in train_words:
    defin = ozh[ozh['VOCAB'] == word].reset_index().iloc[0]['DEF']
    emb = w2v.get_vector(word)
    train_wde.append((word, defin, emb))
    
for word in test_words:
    defin = ozh[ozh['VOCAB'] == word].reset_index().iloc[0]['DEF']
    test_wd.append((word, defin))

CPU times: user 1min 47s, sys: 32.1 ms, total: 1min 47s
Wall time: 1min 47s


In [18]:
ozhegov_emb = pd.DataFrame(train_wde, columns=['word', 'definition', 'embedding'])
ozhegov_no_emb = pd.DataFrame(test_wd, columns=['word', 'definition'])

assert set(ozhegov_emb['word']).intersection(set(ozhegov_no_emb['word'])) == set()
print(ozhegov_emb.shape[0], ozhegov_no_emb.shape[0])

37746 1209


In [19]:
bert_train = ozhegov_emb[ozhegov_emb.apply(lambda x: True if (x['word'] in umfreq) else False, axis = 1)]
bert_valid = ozhegov_emb[ozhegov_emb.apply(lambda x: True if (x['word'] in ulfreq) else False, axis = 1)]

assert set(bert_train['word']).intersection(set(bert_valid['word'])) == set()
print(bert_train.shape, bert_valid.shape)

(6251, 3) (21403, 3)


In [20]:
ozhegov_emb['embedding'] = ozhegov_emb['embedding'].apply(lambda x: x.tolist())
bert_train['embedding'] = bert_train['embedding'].apply(lambda x: x.tolist())
bert_valid['embedding'] = bert_valid['embedding'].apply(lambda x: x.tolist())

ozhegov_emb.to_csv('rus/ozhegov/ozhegov_emb.csv', index=None)
ozhegov_no_emb.to_csv('rus/ozhegov/ozhegov_no_emb.csv', index=None)

bert_train.to_csv('rus/bert/bert_train.csv', index=None)
bert_valid.to_csv('rus/bert/bert_valid.csv', index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
