In [1]:
import os
from collections import defaultdict, Counter
import numpy as np
from preprocessing import *
from defaultlist import defaultlist

In [40]:
import re

In [2]:
main_dir = './Friends/'
files_list = []

In [3]:
for root, dirs, files in os.walk(main_dir):
    for file in files:
        files_list.append(os.path.abspath(os.path.join(root, file)))

In [4]:
prepro_files = defaultlist()
files_length = defaultlist()

for file in files_list:
    with open(file, 'r', encoding='utf-8') as f:
        lemmas_list = preprocessing(f.read())
        prepro_files[files_list.index(file)] = lemmas_list
        files_length[files_list.index(file)] = len(lemmas_list)
        # print(files_list[files_list.index(file)])

In [27]:
import json

In [61]:
with open('files_list.json', 'w', encoding='utf-8') as fw:
    json.dump(files_list, fw)

In [9]:
with open('lemmad.json', 'w', encoding='utf-8') as fw:
    json.dump(prepro_files, fw)

In [5]:
# матрица терм-документ:
# словарь, где ключи -- леммы, а значения -- список частотностей в коллекции документов

def get_term_doc_matrix(prepro_files):
    
    term_doc_matrix = defaultdict(list)
    n = len(prepro_files)
    
    for indx, lemmas in enumerate(prepro_files):
        for lemma in lemmas:
            if lemma in term_doc_matrix:
                term_doc_matrix[lemma][indx] += 1
            else:
                term_doc_matrix[lemma] = [0] * len(prepro_files)
                term_doc_matrix[lemma][indx] += 1

    return term_doc_matrix

In [6]:
# обратный индекс:
# словарь, где ключи -- леммы, а значения -- список документов, где встретилась эта лемма

def inverted_index(prepro_files) -> dict:
    """
    Create inverted index by input doc collection
    :return: inverted index
    """
    
    term_doc_matrix = get_term_doc_matrix(prepro_files)
    inverted_index = defaultdict(list)
    
    for lemma in term_doc_matrix:
        for indx, doc in enumerate(term_doc_matrix[lemma]):
            if doc > 0:
                inverted_index[lemma].append(indx)

    return inverted_index, term_doc_matrix

In [7]:
inverted_index, term_doc_matrix = inverted_index(prepro_files)

In [8]:
from math import log

In [34]:
def score_BM25(qf, dl, avgdl, N, n) -> float:
    """
    Compute similarity score between search query and documents from collection
    :return: score
    """
    k1 = 2.0
    b = 0.75

    score = log((N - n + 0.5) / (n + 0.5)) * (k1 + 1) * qf / (qf + k1 * (1 - b + b * (dl / avgdl)))
    
    return score 

In [11]:
def compute_sim(lemma, inverted_index, term_doc_matrix, files_length) -> float:
    """
    Compute similarity score between search query and documents from collection
    :return: score
    """
    
    relevance_score = {}
    avgdl = sum(files_length) / len(files_length)
    N = len(files_length)

    for doc in range(N):
        if lemma in term_doc_matrix:
            qf = term_doc_matrix[lemma][doc]
            n = len(inverted_index[lemma])
        else:
            qf = 0
            n = 0

        relevance_score[doc] = score_BM25(qf, files_length[doc], avgdl, N, n)

    return relevance_score

In [47]:
with open('inverted_index.json', 'w', encoding='utf-8') as fw:
    json.dump(inverted_index, fw)

In [56]:
with open('term_doc_matrix.json', 'w', encoding='utf-8') as fw:
    json.dump(term_doc_matrix, fw)

In [57]:
with open('files_length.json', 'w', encoding='utf-8') as fw:
    json.dump(files_length, fw)

In [13]:
def search_inv_index(query, inverted_index, term_doc_matrix, files_length, n_results) -> list:
    """
    Compute sim score between search query and all documents in collection
    Collect as pair (doc_id, score)
    :param query: input text
    :return: list of lists with (doc_id, score)
    """
    
    relevance_dict = defaultdict(float)
    lemmas = preprocessing(query)
    
    for lemma in lemmas:
        sims = compute_sim(lemma, inverted_index, term_doc_matrix, files_length)
        for doc in sims:
            relevance_dict[doc] += sims[doc]
            
    result = sorted(relevance_dict, key=relevance_dict.get, reverse=True)[:n_results]

    return [(files_list[doc].split('/Friends/Friends - ')[1].strip('.ru.txt'),
                                         relevance_dict[doc]) for doc in result] 

In [22]:
term_doc_matrix['ходить']

[0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 3,
 0,
 0,
 0,
 2,
 0,
 1,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 2,
 1,
 1,
 0,
 1,
 7,
 0,
 2,
 2,
 2,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 2,
 1,
 0,
 2,
 0,
 2,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 2,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 4,
 1,
 1,
 2,
 0,
 0,
 1,
 3,
 1,
 0,
 0,
 1,
 0,
 1,
 2,
 0,
 2,
 1,
 0,
 3,
 2,
 1,
 2,
 1,
 0,
 0,
 0,
 4,
 2,
 2,
 1,
 3,
 0,
 0,
 4,
 1,
 0,
 0,
 1,
 2,
 3,
 0,
 1,
 3,
 0,
 2,
 2,
 2,
 1,
 1,
 6,
 1,
 1,
 0,
 7,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 2,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 2,
 0,
 2,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0]

In [23]:
search_inv_index('ходить', inverted_index, term_doc_matrix, files_length, 10)

defaultdict(<class 'float'>, {0: 0.0, 1: 0.0, 2: -0.06450907055724057, 3: -0.06083843040843744, 4: 0.0, 5: -0.05469781192451975, 6: 0.0, 7: 0.0, 8: 0.0, 9: -0.08979895728075477, 10: 0.0, 11: 0.0, 12: 0.0, 13: 0.0, 14: 0.0, 15: 0.0, 16: -0.06074501254551583, 17: 0.0, 18: -0.060343495039614295, 19: -0.06270271578105113, 20: 0.0, 21: 0.0, 22: 0.0, 23: -0.10773631893792934, 24: 0.0, 25: 0.0, 26: 0.0, 27: -0.07725970396196148, 28: 0.0, 29: -0.05988675138700505, 30: 0.0, 31: 0.0, 32: -0.08862544452376406, 33: 0.0, 34: 0.0, 35: 0.0, 36: 0.0, 37: 0.0, 38: -0.06046647238967371, 39: 0.0, 40: -0.09024278783825525, 41: -0.05958607757839268, 42: -0.059556176169649366, 43: 0.0, 44: -0.06105752683590743, 45: -0.14593360265834607, 46: 0.0, 47: -0.08787083709443606, 48: -0.09451909682939202, 49: -0.09296429485521526, 50: 0.0, 51: -0.06290211474358813, 52: -0.059407118151878, 53: 0.0, 54: 0.0, 55: -0.05931804094608508, 56: -0.06276904148834643, 57: -0.09355053083404649, 58: -0.06182084470933862, 59: 0.0

[('season 7/Friends - 7x06 - The One With The Nap Partners', 0.0),
 ("season 7/Friends - 7x19 - The One With Ross And Monica's Cousin", 0.0),
 ("season 7/Friends - 7x18 - The One With Joey's Award", 0.0),
 ('season 7/Friends - 7x10 - The One With The Holiday Armadillo', 0.0),
 ('season 7/Friends - 7x13 - The One Where Rosita Dies', 0.0),
 ('season 7/Friends - 7x09 - The One With All The Candy', 0.0),
 ("season 7/Friends - 7x15 - The One With Joey's New Brain", 0.0),
 ("season 7/Friends - 7x02 - The One With Rachel's Book", 0.0),
 ("season 7/Friends - 7x24-25 - The One With Chandler And Monica's Wedding (2)",
  0.0),
 ("season 7/Friends - 7x12 - The One Where They're Up All Nigh", 0.0)]

In [24]:
res = search_inv_index('ходить', inverted_index, term_doc_matrix, files_length, 10)
for elem in res:
    print('{}: {}'.format(elem[0], elem[1]))

defaultdict(<class 'float'>, {0: 0.0, 1: 0.0, 2: -0.06450907055724057, 3: -0.06083843040843744, 4: 0.0, 5: -0.05469781192451975, 6: 0.0, 7: 0.0, 8: 0.0, 9: -0.08979895728075477, 10: 0.0, 11: 0.0, 12: 0.0, 13: 0.0, 14: 0.0, 15: 0.0, 16: -0.06074501254551583, 17: 0.0, 18: -0.060343495039614295, 19: -0.06270271578105113, 20: 0.0, 21: 0.0, 22: 0.0, 23: -0.10773631893792934, 24: 0.0, 25: 0.0, 26: 0.0, 27: -0.07725970396196148, 28: 0.0, 29: -0.05988675138700505, 30: 0.0, 31: 0.0, 32: -0.08862544452376406, 33: 0.0, 34: 0.0, 35: 0.0, 36: 0.0, 37: 0.0, 38: -0.06046647238967371, 39: 0.0, 40: -0.09024278783825525, 41: -0.05958607757839268, 42: -0.059556176169649366, 43: 0.0, 44: -0.06105752683590743, 45: -0.14593360265834607, 46: 0.0, 47: -0.08787083709443606, 48: -0.09451909682939202, 49: -0.09296429485521526, 50: 0.0, 51: -0.06290211474358813, 52: -0.059407118151878, 53: 0.0, 54: 0.0, 55: -0.05931804094608508, 56: -0.06276904148834643, 57: -0.09355053083404649, 58: -0.06182084470933862, 59: 0.0

# Word2Vec

In [14]:
from gensim.models import Word2Vec

In [15]:
import pickle

In [16]:
model = Word2Vec.load('./flask/model/araneum_none_fasttextskipgram_300_5_2018.model')

In [18]:
def get_w2v_vectors(model, lemmas):
    """Получает вектор документа"""
    
    vec_list = []
    
    for lemma in lemmas:
        if lemma in model.wv:
            vec_list.append(model.wv[lemma])
        
    doc_vec = sum(vec_list) / len(vec_list)
    
    return doc_vec

In [19]:
def save_w2v_base(files_list, prepro_files, model):
    """Индексирует всю базу для поиска через word2vec"""
    
    doc_index = []    
    
    for lemmas in prepro_files:

        vec = get_w2v_vectors(model, lemmas)
            
        file_index = {'index': files_list[prepro_files.index(lemmas)], 'vec': vec}
        doc_index.append(file_index)
    
    with open('w2v_indexed_base' + '.pkl', 'wb') as fw:
            pickle.dump(doc_index, fw)

    return doc_index

In [20]:
with open('w2v_indexed_base.pkl', 'rb') as f:
    w2v_base = pickle.load(f)

In [44]:
w2v_base = save_w2v_base(files_list, prepro_files, model)

Функция измерения близости между векторами

In [21]:
from gensim import matutils
import numpy as np 

def similarity(v1, v2):
    v1_norm = matutils.unitvec(np.array(v1))
    v2_norm = matutils.unitvec(np.array(v2))
    return np.dot(v1_norm, v2_norm)

Функция поиска по Word2Vec

In [22]:
def search_w2v(query, model, w2v_base, n_results):
    
    query_vec = get_w2v_vectors(model, preprocessing(query))
    
    similarities = {}
    
    for doc in w2v_base:
        sim = similarity(query_vec, doc['vec'])
        similarities[sim] = doc['index']
        
    results = [re.split('/Friends - season [0-9]/Friends - ', similarities[sim].strip('.ru.txt'))[1]
               for sim in sorted(similarities, reverse=True)[:n_results]]
    
    return results

In [37]:
def search(query, search_method, n_results=10):

    if search_method == 'inverted_index':
        search_result = search_inv_index(query, inverted_index, term_doc_matrix, files_length, n_results)

    elif search_method == 'word2vec':
        search_result = search_w2v(query, model, w2v_base, n_results)
        
    else:
        raise TypeError('unsupported search method')
        
    return search_result

In [49]:
model['ывшршыа']

  """Entry point for launching an IPython kernel.


KeyError: "word 'ывшршыа' not in vocabulary"

In [38]:
search('рождественские каникулы', 'inverted_index')

[('season 7/Friends - 7x10 - The One With The Holiday Armadillo',
  9.774760701134205),
 ("season 6/Friends - 6x19 - The One With Joey's Fridge", 7.8319086214455185),
 ('season 3/Friends - 3x10 - The One Where Rachel Quits', 5.600988808869542),
 ("season 2/Friends - 2x09 - The One With Phoebe's Dad", 4.78695819559373),
 ('season 1/Friends - 1x17 - The One With Two Parts (2)', 4.140265683391886),
 ("season 4/Friends - 4x03 - The One With The 'Cuffs", 4.120980621964985),
 ('season 1/Friends - 1x16 - The One With Two Parts (1)', 4.053326905862368),
 ('season 4/Friends - 4x10 - The One With The Girl From Poughkeepsie',
  4.02566179468818),
 ('season 6/Friends - 6x12 - The One With The Joke', 3.4605118646078226),
 ('season 6/Friends - 6x09 - The One Where Ross Got High', 3.4152600172747283)]

In [41]:
search('рождественские каникулы', 'word2vec')

['7x10 - The One With The Holiday Armadillo',
 '2x22 - The One With The Two Parties',
 "2x09 - The One With Phoebe's Dad",
 '4x10 - The One With The Girl From Poughkeepsie',
 '1x09 - The One Where Underdog Gets Away',
 '6x10 - The One With The Routine',
 '7x11 - The One With All The Cheesecakes',
 '6x09 - The One Where Ross Got High',
 '3x10 - The One Where Rachel Quits',
 "7x02 - The One With Rachel's Book"]