In [3]:
import re
import pandas as pd
from nltk.corpus import stopwords
import fasttext

# Data

In [4]:
# Read the zipped CSV of comments and keep only rows without missing values.
data = pd.read_csv('data/input/perfumery.zip').dropna()
data.head()

Unnamed: 0,comment_text
0,"Сразу скажу, что аромат на любителя. Но меня о..."
1,"когда мне дали послушать этот аромат, мне так ..."
2,Замечательная пара к женской новинке этой марк...
3,Отличный фруктовый аромат!🍊
4,"Очень классный набор, отлично подойдёт на пода..."


### Data cleanup

In [5]:
# Token pattern: a run of Cyrillic letters (including Ё/ё) or ASCII Latin letters.
# The Latin part is written as `A-Za-z` on purpose: the single range `A-z` also
# covers the ASCII punctuation between "Z" and "a" ([ \ ] ^ _ `), which would
# glue tokens such as "foo_bar" together.
regex = re.compile("[А-ЯЁа-яёA-Za-z]+")
# Russian stop-word list from the NLTK stopwords corpus.
mystopwords = stopwords.words('russian')

def words_only(text, regex=regex):
    """Lower-case ``text`` and return every substring matched by ``regex``.

    Args:
        text: Input string. Values that cannot be lower-cased or searched
            are tolerated (see Returns).
        regex: Compiled pattern whose ``findall`` is applied to the
            lower-cased text; defaults to the module-level ``regex``.

    Returns:
        list: Matches in order of appearance, or an empty list when
        ``text.lower()`` / ``regex.findall`` raises ``AttributeError`` or
        ``TypeError``.
    """
    try:
        return regex.findall(text.lower())
    except (AttributeError, TypeError):
        # Best-effort for non-text input: yield no tokens instead of failing.
        # Only these two errors are caught so that KeyboardInterrupt and
        # genuine bugs are no longer silently swallowed.
        return []

def remove_stopwords(lemmas, stopwords = mystopwords):
    """Filter ``lemmas``, dropping stop words and tokens of length 1 or less.

    Args:
        lemmas: Iterable of tokens.
        stopwords: Container of tokens to exclude; defaults to ``mystopwords``.

    Returns:
        list: The remaining tokens, in their original order.
    """
    kept = []
    for token in lemmas:
        # Same test order as a combined condition: membership first, then length.
        if token in stopwords or len(token) <= 1:
            continue
        kept.append(token)
    return kept

def clean_text(text):
    """Tokenize ``text`` with ``words_only``, filter the tokens with
    ``remove_stopwords`` and return them joined by single spaces."""
    return ' '.join(remove_stopwords(words_only(text)))

### Saving a set for model training

In [68]:
# Write the cleaned form of every comment longer than 300 characters
# (measured on the raw text) to "p.txt", one comment per line.
with open("p.txt", "w", encoding='UTF-8') as corpus_file:
    corpus_file.writelines(
        clean_text(comment) + '\n'
        for comment in data.comment_text.values
        if len(comment) > 300
    )

# Model 

In [14]:
# Load the original fastText binary model.
model = fasttext.load_model("model/adaptation/perfumery.bin")

# All words stored in the model.
words = model.get_words()

# Export the vectors in the textual .vec layout.
# The file is opened as UTF-8 explicitly: with the platform-default encoding a
# non-ASCII word can fail to encode, and skipping such words would leave fewer
# rows in the file than the header line announces.
with open('model.vec', 'w', encoding='utf-8') as file_out:

    # The first line must contain the total number of words and the vector dimension.
    file_out.write(str(len(words)) + " " + str(model.get_dimension()) + "\n")

    # One line per word: the word followed by its space-separated vector components.
    for w in words:
        components = "".join(" " + str(vi) for vi in model.get_word_vector(w))
        file_out.write(w + components + '\n')

In [7]:
import fasttext

In [5]:
# Load a fastText binary model from disk and rebind `model` to it.
# Presumably this is the checkpoint after one adaptation epoch, judging by the file name — confirm.
model = fasttext.load_model("model/adaptation/perfumery_epoch_1.bin")

In [8]:
# Print a status line, then update the vocabulary of `model` from a text file.
# NOTE(review): `build_vocab(..., update=True)` looks like the gensim FastText API;
# the object returned by `fasttext.load_model` may not provide it — confirm which
# library `model` is expected to come from at this point.
# NOTE(review): the corpus-writing cell of this notebook writes "p.txt", while this
# call reads 'perfumery.txt' — confirm the intended file.
print("building vocabulary...")
model.build_vocab('perfumery.txt', update=True)



building vocabulary...


model

In [10]:
# Nearest neighbours of the word "парфюм" (the query literal was mis-encoded and is restored here).
# NOTE(review): `.wv.most_similar` looks like the gensim API; the object returned by
# `fasttext.load_model` may not expose `.wv` — confirm which library `model` comes from.
model.wv.most_similar('парфюм')

[('парфюмы', 0.7377110123634338),
 ('аромат', 0.7371137142181396),
 ('парфюмерный', 0.7119106650352478),
 ('Парфюм', 0.7044569849967957),
 ('парфюм.', 0.6883740425109863),
 ('парфюма', 0.6814783811569214),
 ('духи', 0.6683558821678162),
 ('дезодорант', 0.6603888273239136),
 ('парфюмов', 0.6553675532341003),
 ('парфум', 0.6519331336021423)]

In [11]:
# Nearest neighbours of the query word (the literal was mis-encoded and is restored here).
# NOTE(review): the query decodes to "фуруктовый", which differs from the usual spelling
# "фруктовый" — possibly a deliberate misspelling to probe out-of-vocabulary handling; confirm.
# NOTE(review): `.wv.most_similar` looks like the gensim API; the object returned by
# `fasttext.load_model` may not expose `.wv` — confirm which library `model` comes from.
model.wv.most_similar('фуруктовый')

[('Гуманитарно-экономический', 0.561374306678772),
 ('магистратурыучебно-методический', 0.5601953268051147),
 ('Врачебно-физкультурный', 0.5565395355224609),
 ('аграрно-технологический', 0.5547544360160828),
 ('ревматологический', 0.5517967343330383),
 ('Гуманитарно-педагогический', 0.5509157180786133),
 ('кардиоревматологический', 0.5496220588684082),
 ('пектиняблочный', 0.5401729941368103),
 ('аграрно-экономический', 0.5355527997016907),
 ('историко-культурологический', 0.5343062877655029)]