In [1]:
from pymystem3 import Mystem
import gensim
from judicial_splitter import split_paragraph, get_sentences
import string
import re
import json
import os
import pickle
from gensim import matutils
import numpy as np 
from tqdm import tqdm_notebook as tqdm
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from math import log
from collections import defaultdict, Counter
from nltk import word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec, KeyedVectors
import warnings
warnings.filterwarnings('ignore')
mystem = Mystem()

In [2]:
# NLTK's Russian stop-word list, converted to a set; it is passed as the
# ``stopwords`` argument when the indexes are built further down.
russian_stopwords = set(stopwords.words('russian'))

База Авито

In [3]:
# Root directory of the ad files; all_data_maker and splitting walk it
# recursively with os.walk.
main_dir = '/data'

Файл — ссылка на объявление

In [84]:
def all_data_maker(main_dir, del_stop=True, stopwords={}):
    """
    Walk ``main_dir`` and build two lookup tables from the files found there.

    Every file whose name does not contain '.DS_Store' is read as UTF-8.
    Its last line is stored as the link, the line before it as the title,
    and all preceding lines (joined with spaces, newlines removed) as the text.

    Returns a pair ``(all_data, word_count)``:
        all_data   - {file name: {'link', 'title', 'text', 'len'}}, where
                     'len' is the number of tokens ``preprocessing`` returns
                     for the text
        word_count - {token: {file name: occurrences of the token in that file}}
    """
    documents = defaultdict(dict)
    occurrences = defaultdict(dict)  # token -> {file name: count}

    for folder, _subdirs, file_names in os.walk(main_dir):
        for file_name in file_names:
            if '.DS_Store' in file_name:
                continue

            with open(os.path.join(folder, file_name), 'r', encoding='utf-8') as handle:
                lines = handle.readlines()

            # Everything except the last two lines is the ad body.
            body = ' '.join(lines[:-2]).replace('\n', '')
            tokens = preprocessing(body, stopwords=stopwords, del_stopwords=del_stop, del_digit=True)

            documents[file_name] = {
                "link": lines[-1],
                'title': lines[-2],
                'text': body,
                'len': len(tokens),
            }
            for token, count in Counter(tokens).items():
                occurrences[token][file_name] = count

    return documents, occurrences
                    

Вспомогательные функции

In [5]:
def preprocessing(input_text, stopwords={}, del_stopwords=True, del_digit=True):
    """
    Tokenize ``input_text`` and return the list of its lemmas.

    Each token from ``word_tokenize`` is lower-cased and stripped of leading
    and trailing punctuation (ASCII punctuation plus '»«–…'); tokens that
    become empty are dropped. The first element of ``mystem.lemmatize`` is
    taken as the lemma of a token.

    When ``del_stopwords`` is true, lemmas found in ``stopwords`` are skipped;
    when ``del_digit`` is true, lemmas consisting only of digits are skipped.
    """
    strip_chars = string.punctuation + '»«–…'
    cleaned_tokens = [raw.lower().strip(strip_chars) for raw in word_tokenize(input_text)]

    kept = []
    for token in cleaned_tokens:
        if not token:
            continue
        lemma = mystem.lemmatize(token)[0]
        if del_stopwords and lemma in stopwords:
            continue
        if del_digit and lemma.isdigit():
            continue
        kept.append(lemma)
    return kept

In [6]:
def splitting(main_dir, stopwords={}, del_stop=True):
    """
    Lazily yield ``(lemmas, file name)`` pairs for every text part in ``main_dir``.

    Each file whose name does not contain '.DS_Store' is read in full, cut
    into parts with ``split_paragraph(get_sentences(text), 4)``, and every
    part is run through ``preprocessing`` (digits are always removed).
    """
    for folder, _subdirs, file_names in os.walk(main_dir):
        for file_name in file_names:
            if '.DS_Store' in file_name:
                continue
            with open(os.path.join(folder, file_name), 'r', encoding='utf-8') as handle:
                raw_text = handle.read()
                for chunk in split_paragraph(get_sentences(raw_text), 4):
                    lemmas = preprocessing(chunk, stopwords, del_stopwords=del_stop, del_digit=True)
                    yield (lemmas, file_name)


Собираем данные в один массив

In [94]:
def save_base(main_dir, model_w2v, model_d2v, stopwords={}, del_stop=True):
    """
    Index the whole collection for vector search.

    Returns a list with one dict per text part produced by ``splitting``:
    ``{'id': file name, 'w2v_vec': ..., 'd2v_vec': ...}``, where the vectors
    come from ``get_w2v_vectors`` and ``get_d2v_vectors`` respectively.
    """
    return [
        {
            'id': source_name,
            'w2v_vec': get_w2v_vectors(lemmas, model_w2v),
            'd2v_vec': get_d2v_vectors(lemmas, model_d2v),
        }
        for lemmas, source_name in splitting(main_dir, stopwords=stopwords, del_stop=del_stop)
    ]

### Word2vec

In [8]:
def get_w2v_vectors(lemmas, model):
    """
    Build a single unit-length vector for a list of lemmas.

    The vectors of all lemmas known to ``model.wv`` are summed and the sum is
    normalized with ``matutils.unitvec``.

    :param lemmas: iterable of lemma strings
    :param model: object exposing word vectors through ``model.wv[lemma]``
    :return: list of floats; when no lemma has a vector, a list of zeros
             whose length is ``model.wv.vector_size`` (300 if the attribute
             is absent)
    """
    lemma_vectors = []
    for lemma in lemmas:
        try:
            lemma_vectors.append(model.wv[lemma])
        except KeyError:
            # Out-of-vocabulary lemma: it simply does not contribute.
            continue

    if not lemma_vectors:
        # Match the model's dimensionality so the result can be compared with
        # the other vectors produced by the same model.
        return [0] * getattr(model.wv, 'vector_size', 300)

    return list(matutils.unitvec(sum(lemma_vectors)))

In [9]:
def similarity(vec1, vec2):
    """
    Cosine similarity of two vectors.

    Both inputs are normalized here, so callers may pass vectors of any
    length; for inputs that are already unit-length the result equals their
    dot product.

    :param vec1: sequence or array of numbers
    :param vec2: sequence or array of numbers of the same size
    :return: cosine of the angle between the vectors; 0.0 if either vector
             has zero norm
    """
    first = np.asarray(vec1, dtype=float)
    second = np.asarray(vec2, dtype=float)

    norm_product = np.linalg.norm(first) * np.linalg.norm(second)
    if norm_product == 0:
        # A zero vector has no direction; treat it as dissimilar to everything.
        return 0.0
    return np.dot(first, second) / norm_product


def culc_sim_score(all_data, vec, model_type):
    """
    Score every document by its best-matching part.

    :param all_data: iterable of dicts with keys 'id', 'w2v_vec' and 'd2v_vec'
    :param vec: query vector compared with each part via ``similarity``
    :param model_type: 'word2v' to compare against 'w2v_vec',
                       'doc2v' to compare against 'd2v_vec'
    :return: defaultdict(float) mapping document id to the maximum similarity
             over that document's parts
    :raises ValueError: if ``model_type`` is neither 'word2v' nor 'doc2v'
    """
    vector_keys = {'word2v': 'w2v_vec', 'doc2v': 'd2v_vec'}
    if model_type not in vector_keys:
        raise ValueError('unsupported model_type: %r' % (model_type,))
    vector_key = vector_keys[model_type]

    answer = defaultdict(float)  # id : score

    for part in all_data:
        sim = similarity(part[vector_key], vec)
        doc_id = part['id']

        # Membership, not the value 0.0, marks a document as unseen: a real
        # best score of exactly 0.0 must not be replaced by a lower one.
        if doc_id not in answer:
            answer[doc_id] = float('-inf')
        if sim > answer[doc_id]:
            answer[doc_id] = sim

    return answer

In [39]:
def search_w2v(string, model, info_data, vec_data, stopwords={}, amount=10, del_stop=True):
    """
    Generator: rank documents against a text query using word2vec vectors.

    The query is lemmatized with ``preprocessing``, turned into a vector by
    ``get_w2v_vectors`` and compared with ``vec_data`` through
    ``culc_sim_score`` in 'word2v' mode.

    Yields at most ``amount`` tuples ``(doc_id, info_data[doc_id], score)``
    in descending score order. Raises ValueError if ``string`` is not a str.
    """
    if not isinstance(string, str):
        raise ValueError('enter correct data')

    lemmas = preprocessing(string, stopwords=stopwords, del_stopwords=del_stop, del_digit=True)
    query_vec = get_w2v_vectors(lemmas, model)
    scores = culc_sim_score(vec_data, query_vec, 'word2v')

    ranked = sorted(scores.items(), reverse=True, key=lambda pair: pair[1])
    for position, (doc_id, score) in enumerate(ranked):
        if position >= amount:
            return
        yield (doc_id, info_data[doc_id], score)

### Doc2vec

In [None]:
def tagged_data_creator(main_dir, stopwords={}, del_stop=False):
    """
    Build the Doc2Vec training corpus.

    Every text part yielded by ``splitting`` becomes a ``TaggedDocument``
    whose words are the part's lemmas and whose single tag is the part's
    running index (0, 1, 2, ...).
    """
    parts = splitting(main_dir, stopwords=stopwords, del_stop=del_stop)
    return [
        TaggedDocument(words=lemmas, tags=[position])
        for position, (lemmas, _source_name) in enumerate(parts)
    ]

In [None]:
def train_doc2vec(tagged_data, epo=100):
    """
    Create and train a Doc2Vec model on ``tagged_data``.

    The model is built with vector_size=100, min_count=5, dm=1, 4 workers and
    ``epo`` epochs; ``alpha`` and ``min_alpha`` are both set to 0.025.
    Progress messages are printed before and after each stage.

    :param tagged_data: documents passed to ``build_vocab`` and ``train``
    :param epo: number of training epochs
    :return: the trained model
    """
    hyperparams = dict(
        vector_size=100,
        min_count=5,
        alpha=0.025,
        min_alpha=0.025,
        epochs=epo,
        workers=4,
        dm=1,
    )
    d2v_model = Doc2Vec(**hyperparams)

    print('building vocabulary')
    d2v_model.build_vocab(tagged_data)

    print('starting training...')
    # The model's random generator is re-seeded after the vocabulary is built.
    d2v_model.random.seed(42)
    d2v_model.train(
        tagged_data,
        total_examples=d2v_model.corpus_count,
        epochs=d2v_model.epochs,
    )

    print('model is trained')
    return d2v_model

In [11]:
def get_d2v_vectors(text, model):
    """Return the document vector that ``model.infer_vector`` produces for ``text``."""
    inferred = model.infer_vector(text)
    return inferred

In [38]:
def search_d2v(string, model, info_data, vec_data, stopwords={}, del_stop=False, amount=10):
    """
    Generator: rank documents against a text query using doc2vec vectors.

    The query is lemmatized with ``preprocessing``, turned into a vector by
    ``get_d2v_vectors`` and compared with ``vec_data`` through
    ``culc_sim_score`` in 'doc2v' mode.

    Yields at most ``amount`` tuples ``(doc_id, info_data[doc_id], score)``
    in descending score order. Raises ValueError if ``string`` is not a str.
    """
    if not isinstance(string, str):
        raise ValueError('enter correct data')

    lemmas = preprocessing(string, stopwords=stopwords, del_stopwords=del_stop, del_digit=True)
    query_vec = get_d2v_vectors(lemmas, model)
    scores = culc_sim_score(vec_data, query_vec, 'doc2v')

    ranked = sorted(scores.items(), reverse=True, key=lambda pair: pair[1])
    for position, (doc_id, score) in enumerate(ranked):
        if position >= amount:
            return
        yield (doc_id, info_data[doc_id], score)

Собираем все в одном месте:

In [13]:
# Load the previously saved models from disk: the Word2Vec model stored under
# 'araneum_none_fasttextcbow_300_5_2018/' and the Doc2Vec model 'my_d2v_model'.
model_w2v = Word2Vec.load('araneum_none_fasttextcbow_300_5_2018/araneum_none_fasttextcbow_300_5_2018.model')
model_d2v = Doc2Vec.load('my_d2v_model') 

In [93]:
# Build the per-file metadata and the word -> {file: count} index twice:
# once with stop words removed and once with stop words kept.
# NOTE(review): both calls assign ``info_data``, so the surviving value comes
# from the del_stop=False run and its 'len' fields include stop words —
# confirm this is intended for the del_stop=True searches as well.
info_data, word_count_del = all_data_maker(main_dir, del_stop=True, stopwords=russian_stopwords)
info_data, word_count_not_del = all_data_maker(main_dir, del_stop=False, stopwords=russian_stopwords)

In [95]:
# Vectorize every text part of the collection (one w2v and one d2v vector per
# part), again for both stop-word settings.
vec_data_del = save_base(main_dir, model_w2v, model_d2v, del_stop=True, stopwords=russian_stopwords)
vec_data_not_del = save_base(main_dir, model_w2v, model_d2v, del_stop=False, stopwords=russian_stopwords)

In [96]:
# Persist each index structure to its own pickle file in the working directory.
for _pickle_path, _payload in (
    ('info_data.pickle', info_data),
    ('word_count_del.pickle', word_count_del),
    ('word_count_not_del.pickle', word_count_not_del),
    ('vec_data_del.pickle', vec_data_del),
    ('vec_data_not_del.pickle', vec_data_not_del),
):
    with open(_pickle_path, 'wb') as f:
        pickle.dump(_payload, f)

In [21]:
# Average document length: the mean of the 'len' field over all entries of
# info_data (the ``avgdl`` quantity of the BM25 formula).
avgdl = np.mean([i['len'] for i in info_data.values()])

In [14]:
import pickle

In [15]:
# Read back the five pickle files written by the dump cell above
# (presumably so a fresh session can skip rebuilding the indexes).
# NOTE: pickle.load can execute arbitrary code — only load files that were
# produced by this notebook, never files from an untrusted source.
with open('info_data.pickle', 'rb') as f:
    info_data = pickle.load(f)
with open('word_count_del.pickle', 'rb') as f:
    word_count_del = pickle.load(f)
with open('word_count_not_del.pickle', 'rb') as f:
    word_count_not_del = pickle.load(f)
with open('vec_data_del.pickle', 'rb') as f:
    vec_data_del = pickle.load(f)
with open('vec_data_not_del.pickle', 'rb') as f:
    vec_data_not_del = pickle.load(f)

##### Okapi

In [40]:

def score_BM25(qf, dl, avgdl, k1, b, N, n):
    """
    Compute the BM25 contribution of one query term for one document.

    :param qf: number of occurrences of the term in the document
    :param dl: document length (number of tokens)
    :param avgdl: average document length over the collection
    :param k1: term-frequency saturation parameter
    :param b: document-length normalization parameter
    :param N: number of documents in the collection
    :param n: number of documents that contain the term
    :return: score as a float; 0.0 for an empty document (``dl == 0``)
    """
    if not dl:
        # An empty document cannot contain the term; also avoids dividing by zero.
        return 0.0

    # This implementation uses the length-normalized term frequency.
    tf = qf / dl
    # Both the numerator and the denominator of the IDF ratio need their own
    # parentheses: log((N - n + 0.5) / (n + 0.5)).
    idf = log((N - n + 0.5) / (n + 0.5))
    numerator = (k1 + 1) * tf
    denominator = tf + k1 * (1 - b + b * (dl / avgdl))

    return (numerator / denominator) * idf


def compute_sim(words, doc, info_data, word_count, N, avgdl):
    """
    Sum the BM25 scores of all query words for one document.

    Words that are absent from ``word_count`` contribute nothing. The lookup
    never inserts keys, so ``word_count`` is left unchanged even when it is a
    defaultdict.

    :param words: list of query lemmas
    :param doc: document id (key of ``info_data``)
    :param info_data: {doc id: {..., 'len': document length}}
    :param word_count: {word: {doc id: occurrences}}
    :param N: number of documents in the collection
    :param avgdl: average document length
    :return: accumulated score (0 when no query word is indexed)
    """
    k1 = 2.0
    b = 0.75
    total = 0

    for word in words:
        # .get() instead of indexing: indexing a defaultdict would add every
        # unknown query word to the shared index.
        postings = word_count.get(word)
        if not postings:
            continue

        qf = postings.get(doc, 0)
        dl = info_data[doc]['len']
        total += score_BM25(qf, dl, avgdl, k1, b, N, len(postings))

    return total


def get_search_result(text, info_data, word_count, stopwords={}, del_stop=True, amount=10, avgdl=None):
    """
    Generator: rank all documents in the collection against a text query
    using ``compute_sim``.

    :param text: query string
    :param info_data: {doc id: {..., 'len': document length}}
    :param word_count: {word: {doc id: occurrences}}
    :param stopwords: collection of stop words handed to ``preprocessing``
    :param del_stop: whether stop words are removed from the query
    :param amount: maximum number of results to yield
    :param avgdl: average document length; when None it is computed as the
                  mean of the 'len' fields of ``info_data``
    :return: yields ``(doc_id, info_data[doc_id], score)`` tuples in
             descending score order
    :raises ValueError: if ``text`` is not a str
    """
    if not isinstance(text, str):
        raise ValueError('enter correct data')

    if avgdl is None:
        # Derive the value from the data being searched instead of relying on
        # a module-level variable.
        lengths = [meta['len'] for meta in info_data.values()]
        avgdl = sum(lengths) / len(lengths) if lengths else 0.0

    words = preprocessing(text, stopwords=stopwords, del_stopwords=del_stop, del_digit=True)
    N = len(info_data)

    scores = {
        doc: compute_sim(words, doc, info_data, word_count, N, avgdl)
        for doc in info_data
    }

    ranked = sorted(scores.items(), reverse=True, key=lambda pair: pair[1])
    for position, (doc_id, score) in enumerate(ranked):
        if position >= amount:
            break
        yield (doc_id, info_data[doc_id], score)

Тест

In [67]:
def merging_all_3(w2v, d2v, okapi, all_):
    """
    Combine the scores of the three search methods into one score per document.

    Each of ``w2v``, ``d2v`` and ``okapi`` maps a document id to a pair whose
    second element is that method's score; a document missing from a mapping
    counts as score 0 for that method. The combined score is

        (0.8 * okapi + 0.2 * ((0.2 * d2v + 0.8 * w2v) / 2)) / 2

    :param w2v: {doc id: (info, score)} from the word2vec search
    :param d2v: {doc id: (info, score)} from the doc2vec search
    :param okapi: {doc id: (info, score)} from the BM25 search
    :param all_: iterable of document ids to score
    :return: {doc id: combined score}
    """
    okapi_weight, vector_weight = 0.8, 0.2   # BM25 vs. averaged vector score
    d2v_weight, w2v_weight = 0.2, 0.8        # split inside the vector score
    missing = (None, 0)

    merged = {}
    for item in all_:
        it_w = w2v.get(item, missing)[1]
        it_d = d2v.get(item, missing)[1]
        it_o = okapi.get(item, missing)[1]

        # The word2vec score is weighted (multiplied), like its two siblings.
        vector_score = (it_d * d2v_weight + it_w * w2v_weight) / 2
        merged[item] = (it_o * okapi_weight + vector_score * vector_weight) / 2

    return merged

In [52]:
def serach_w2_d2_ok(string, model_w2v, model_d2v, info_data, vec_data, word_count, stopwords={}, del_stop=False, amount=10):
    """
    Generator: search with word2vec, doc2vec and BM25 and merge the results.

    Each of the three searches is run with the same ``stopwords``,
    ``del_stop`` and ``amount``; the union of the returned document ids is
    re-scored with ``merging_all_3``.

    Yields at most ``amount`` tuples ``(doc_id, info_data[doc_id], score)``
    in descending merged-score order.
    """
    shared = dict(stopwords=stopwords, amount=amount, del_stop=del_stop)

    def as_mapping(results):
        # (doc_id, info, score) triples -> {doc_id: (info, score)}
        return {row[0]: (row[1], row[2]) for row in results}

    w2v = as_mapping(search_w2v(string, model_w2v, info_data, vec_data, **shared))
    d2v = as_mapping(search_d2v(string, model_d2v, info_data, vec_data, **shared))
    okapi = as_mapping(get_search_result(string, info_data, word_count, **shared))

    candidates = set(w2v) | set(d2v) | set(okapi)
    merged = merging_all_3(w2v, d2v, okapi, candidates)

    ranked = sorted(merged.items(), reverse=True, key=lambda pair: pair[1])
    for position, (doc_id, score) in enumerate(ranked):
        if position >= amount:
            return
        yield (doc_id, info_data[doc_id], score)


In [68]:
def search(string, search_method, model_w2v, model_d2v, info_data, vec_data_del, vec_data_not_del, word_count_del, word_count_not_del, amount=10, del_stop=True, stopwords={}):
    """
    Dispatch a query to one of the search back ends.

    The ``*_del`` structures are used only when ``del_stop`` is exactly
    ``True``; otherwise the ``*_not_del`` structures are used.

    :param string: query text
    :param search_method: 'inverted_index' (BM25), 'word2vec', 'doc2vec'
                          or 'all' (merged result of the three)
    :param model_w2v: word2vec model
    :param model_d2v: doc2vec model
    :param info_data: {doc id: document metadata}
    :param vec_data_del: vector base built with stop words removed
    :param vec_data_not_del: vector base built with stop words kept
    :param word_count_del: word index built with stop words removed
    :param word_count_not_del: word index built with stop words kept
    :param amount: maximum number of results
    :param del_stop: whether stop words are removed from the query
    :param stopwords: collection of stop words
    :return: iterator over ``(doc_id, info_data[doc_id], score)`` tuples
    :raises TypeError: if ``search_method`` is not one of the supported names
    """
    use_del = del_stop is True
    vec_data = vec_data_del if use_del else vec_data_not_del
    word_count = word_count_del if use_del else word_count_not_del
    shared = dict(stopwords=stopwords, del_stop=del_stop, amount=amount)

    if search_method == 'inverted_index':
        return get_search_result(string, info_data, word_count, **shared)
    if search_method == 'word2vec':
        return search_w2v(string, model_w2v, info_data, vec_data, **shared)
    if search_method == 'doc2vec':
        return search_d2v(string, model_d2v, info_data, vec_data, **shared)
    if search_method == 'all':
        return serach_w2_d2_ok(string, model_w2v, model_d2v, info_data, vec_data, word_count, **shared)

    raise TypeError('unsupported search method')