### Import libraries

In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from gensim import corpora, models
import collections
import numpy as np
import smart_open
import pymorphy2
import random
import gensim
import json
import os

Using TensorFlow backend.


### Corpus preparation

In [2]:
def get_percentile_counts(tokenized_lengths):
    """Print an 11-step, equal-width breakdown of document lengths.

    Prints the minimum and the maximum of ``tokenized_lengths`` and then, for
    ``i = 0..10``, the lower border ``min + i * (max - min) / 10`` followed by
    ``"Count: "`` and the number of entries inside the closed interval between
    that border and the next one.

    Notes:
        * Both borders are inclusive, so a value lying exactly on a border is
          reported in two neighbouring intervals.
        * The interval mask is multiplied by the lengths before the non-zero
          count, so entries equal to 0 are never counted.
        * Despite the name, the intervals are equal-width slices of the value
          range rather than percentiles.

    Args:
        tokenized_lengths: numeric array of lengths; element-wise comparison
            and multiplication are applied to it.
    """
    lowest = np.min(tokenized_lengths)
    highest = np.max(tokenized_lengths)
    print(lowest)
    print(highest)

    span = highest - lowest
    for step in range(11):
        lower_border = step * span / 10 + lowest
        upper_border = (step + 1) * span / 10 + lowest
        print(lower_border)
        inside = (tokenized_lengths >= lower_border) & (tokenized_lengths <= upper_border)
        print("Count: ", np.count_nonzero(tokenized_lengths * inside))

def get_russian_lemma(token, lemmatizer):
    """Return the normal form of ``token`` as reported by ``lemmatizer``.

    The token is lower-cased, handed to ``lemmatizer.parse`` and the
    ``normal_form`` of the first returned parse is used.
    """
    parses = lemmatizer.parse(token.lower())
    first_parse = parses[0]
    return first_parse.normal_form

def get_lemmatized_sequence(sequence, lemmatizer):
    """Lemmatize every token and join the results with single spaces.

    Each token goes through ``get_russian_lemma``; lemmas that are empty or
    consist of whitespace only are left out of the result.

    Returns:
        str: the surviving lemmas separated by ``' '``.
    """
    candidates = (get_russian_lemma(token, lemmatizer) for token in sequence)
    return ' '.join(lemma for lemma in candidates if lemma.strip() != "")

def get_lemmatized_document(fullfilename, lemmatizer):
    """Read a text file and return its content as one lemmatized string.

    Line breaks are replaced with spaces, the text is tokenized with
    ``gensim.utils.simple_preprocess`` and the tokens are passed to
    ``get_lemmatized_sequence``. The file is opened in text mode with the
    platform default encoding.
    """
    with open(fullfilename, 'r') as source:
        flat_text = source.read().replace('\n', ' ')
        tokens = gensim.utils.simple_preprocess(flat_text)
        return get_lemmatized_sequence(tokens, lemmatizer)

def build_lemmatized_corpora(path, targetfilename, lemmatizer):
    """Lemmatize every file in ``path`` and write a line-aligned corpus.

    Two files are produced: ``targetfilename + '.cor'`` holds one lemmatized
    document per line and ``targetfilename + '.cfn'`` holds the matching
    source file name on the same line number. The number of processed files
    is printed at the end.

    Args:
        path: directory whose entries are lemmatized.
        targetfilename: path prefix of the two output files.
        lemmatizer: object forwarded to ``get_lemmatized_document``.
    """
    # The counter was never initialised, so the first "+=" raised UnboundLocalError.
    files_processed = 0
    with open(targetfilename+'.cor', 'w') as corporafile, open(targetfilename+'.cfn', 'w') as filenamefile:
        # Sorted listing keeps the output order reproducible between runs.
        for filename in sorted(os.listdir(path)):
            # os.path.join avoids a doubled separator when ``path`` already ends with one.
            line = get_lemmatized_document(os.path.join(path, filename), lemmatizer)
            print(line, file=corporafile)
            print(filename, file=filenamefile)
            files_processed += 1
    print("Total files processed: ", files_processed)

def read_corpus(fname, tokens_only=False, preserve=False):
    """Lazily read a one-document-per-line corpus file.

    Args:
        fname: path handed to ``smart_open.smart_open``.
        tokens_only: yield ``gensim.utils.simple_preprocess(line)`` for each line.
        preserve: yield the text of each line without its trailing newline;
            takes precedence over ``tokens_only``.

    Yields:
        ``str`` when ``preserve`` is set, a token list when ``tokens_only`` is
        set, otherwise a ``TaggedDocument`` tagged with ``[line_index]``.
    """
    # NOTE(review): the stream may hand back bytes or str depending on how the
    # installed smart_open treats ``encoding`` — confirm for the version in use.
    with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            if preserve:
                # Decode only when the line really is bytes; calling .decode()
                # on a str raised AttributeError.
                text = line.decode() if isinstance(line, bytes) else line
                # Remove just the newline: a blind [:-1] swallowed the last
                # real character of a final line that has no newline.
                yield text[:-1] if text.endswith('\n') else text
            elif tokens_only:
                yield gensim.utils.simple_preprocess(line)
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line), [i])


def split_corpus(full_corpus_filenames, full_corpus, split_fraction):
    """Split a corpus and its file names into train and test parts.

    Every document whose index is a multiple of 4 goes to the test part, all
    others go to the train part. ``split_fraction`` only feeds a random mask
    whose values are never consulted (just its length is used), so it does not
    influence the split.

    Returns:
        tuple: ``(train_corpus_filenames, train_corpus,
        test_corpus_filenames, test_corpus)``.
    """
    # Only the length of this mask is used below.
    random_mask = np.random.rand(len(full_corpus_filenames)) < 1 - split_fraction
    total = len(random_mask)

    train_docs, train_names = [], []
    test_docs, test_names = [], []

    for index in range(total):
        goes_to_test = index % 4 == 0
        docs_bucket = test_docs if goes_to_test else train_docs
        names_bucket = test_names if goes_to_test else train_names
        docs_bucket.append(full_corpus[index])
        names_bucket.append(full_corpus_filenames[index])

    return train_names, train_docs, test_names, test_docs

### Training models

In [3]:
def TrainLDAModel(train_corpus):
    """Fit an LDA model on tokenized documents.

    The number of topics is read from the module-level ``lda_topics``; the
    model is trained with 20 passes.

    Args:
        train_corpus: tokenized documents (each a list of tokens).

    Returns:
        tuple: ``(ldamodel, dictionary)`` where ``dictionary`` is the
        id <-> term mapping built from ``train_corpus``.
    """
    # id <-> term mapping for the training documents
    id2word = corpora.Dictionary(train_corpus)

    # bag-of-words representation of every document
    bow_corpus = list(map(id2word.doc2bow, train_corpus))

    lda = gensim.models.ldamodel.LdaModel(
        bow_corpus, num_topics=lda_topics, id2word=id2word, passes=20
    )
    return lda, id2word

def TrainLSIModel(train_corpus):
    """Fit an LSI model on tokenized documents.

    The number of topics is read from the module-level ``lsi_topics``.

    Args:
        train_corpus: tokenized documents (each a list of tokens).

    Returns:
        tuple: ``(lsimodel, dictionary)`` where ``dictionary`` is the
        id <-> term mapping built from ``train_corpus``.
    """
    # id <-> term mapping for the training documents
    id2word = corpora.Dictionary(train_corpus)

    # bag-of-words representation of every document
    bow_corpus = list(map(id2word.doc2bow, train_corpus))

    lsi = gensim.models.lsimodel.LsiModel(
        bow_corpus, num_topics=lsi_topics, id2word=id2word
    )
    return lsi, id2word

def TrainTFIDFModel(train_corpus):
    """Fit a TF-IDF model on tokenized documents.

    Args:
        train_corpus: tokenized documents (each a list of tokens).

    Returns:
        tuple: ``(tfidfmodel, dictionary)`` where ``dictionary`` is the
        id <-> term mapping built from ``train_corpus``.
    """
    # id <-> term mapping for the training documents
    id2word = corpora.Dictionary(train_corpus)

    # bag-of-words representation of every document
    bow_corpus = list(map(id2word.doc2bow, train_corpus))

    tfidf = gensim.models.TfidfModel(bow_corpus)
    return tfidf, id2word

def TrainW2VModel(train_corpus, dimensionality, iterations):
    """Train a Word2Vec model on tokenized documents.

    Args:
        train_corpus: tokenized documents, passed to Word2Vec as sentences.
        dimensionality: forwarded as ``size``.
        iterations: forwarded as ``iter``.

    Returns:
        The trained ``gensim.models.Word2Vec`` instance.
    """
    return gensim.models.Word2Vec(train_corpus, size=dimensionality, iter=iterations)

def TrainP2VModel(train_corpus, dimensionality, iterations):
    """Train a Doc2Vec (``dm=1``) model on tokenized documents.

    Every document is tagged with its position in ``train_corpus``.

    Args:
        train_corpus: tokenized documents (each a list of tokens).
        dimensionality: forwarded as ``size``.
        iterations: forwarded as ``iter``.

    Returns:
        The trained ``gensim.models.doc2vec.Doc2Vec`` instance.
    """
    corpus_tagged = [
        gensim.models.doc2vec.TaggedDocument(tokens, [index])
        for index, tokens in enumerate(train_corpus)
    ]

    # Passing the documents to the constructor lets gensim build the vocabulary
    # and train with the right example count and epochs. A bare
    # ``model.train(corpus_tagged)`` is rejected by gensim releases that require
    # ``total_examples``/``epochs``.
    return gensim.models.doc2vec.Doc2Vec(
        documents=corpus_tagged, dm=1, size=dimensionality, min_count=2, iter=iterations
    )

### Assembling vectors for corpus documents

In [4]:
def GetVectors(model, dictionary, dimensionality, corpus):
    """Turn every document into a dense vector of length ``dimensionality``.

    Each document is converted with ``dictionary.doc2bow`` and transformed by
    ``model[...]``; the resulting ``(index, value)`` pairs are scattered into
    a zero-filled list.

    Args:
        model: object indexable with a bag-of-words, yielding ``(index, value)`` pairs.
        dictionary: object providing ``doc2bow``.
        dimensionality: length of every produced vector; indices returned by
            the model must be smaller than this.
        corpus: iterable of tokenized documents.

    Returns:
        list[list]: one dense vector per document, in corpus order.
    """
    vectors = []
    # Iterate the documents directly (no index bookkeeping); the former
    # ``dim_sum`` accumulator was never read and has been removed.
    for doc in corpus:
        dense = [0] * dimensionality
        for index, value in model[dictionary.doc2bow(doc)]:
            dense[index] = value
        vectors.append(dense)
    return vectors

def GetAverageW2Vector(model, doc, dimensionality):
    """Average the word vectors of all in-vocabulary words of ``doc``.

    Args:
        model: a Word2Vec-like model exposing its vectors as ``model.wv``, or
            a keyed-vectors object that itself offers ``vocab`` and ``[word]``.
        doc: iterable of tokens.
        dimensionality: length of the word vectors.

    Returns:
        list: the element-wise mean of the matched word vectors; a zero vector
        when no token of ``doc`` is in the vocabulary.
    """
    # Accept both a full model (vectors live in ``.wv``) and bare keyed vectors.
    keyed_vectors = getattr(model, 'wv', model)

    docvector = [0] * dimensionality
    wordcount = 0
    for word in doc:
        if word in keyed_vectors.vocab:
            wordcount += 1
            docvector = [x + y for x, y in zip(docvector, keyed_vectors[word])]

    # Nothing matched: return the zero vector instead of dividing by zero.
    if wordcount == 0:
        return docvector
    return [x / wordcount for x in docvector]

def GetAverageW2VectorsCorpus(model, corpus, dimensionality=None):
    """Compute the averaged word vector of every document in ``corpus``.

    Args:
        model: word-vector model forwarded to ``GetAverageW2Vector``.
        corpus: iterable of tokenized documents.
        dimensionality: length of the word vectors. When omitted it is taken
            from ``model.vector_size`` (assumes the model exposes that
            attribute — confirm for the gensim version in use).

    Returns:
        list: one averaged vector per document, in corpus order.
    """
    # The helper requires the vector length; the previous two-argument call
    # always failed with TypeError.
    if dimensionality is None:
        dimensionality = model.vector_size
    return [GetAverageW2Vector(model, doc, dimensionality) for doc in corpus]

def GetWeightedAverageW2Vector(model, weights, dictionary, doc, dimensionality):
    """Build a document vector as a weighted average of its word vectors.

    Every token that is known to both the word-vector vocabulary and
    ``dictionary`` contributes its vector multiplied by the weight that
    ``weights`` assigns to it for this document (0 when the weight model
    returns no entry for the term). The sum is divided by the number of
    contributing tokens.

    Args:
        model: a Word2Vec-like model exposing its vectors as ``model.wv``, or
            a keyed-vectors object that itself offers ``vocab`` and ``[word]``.
        weights: object indexable with a bag-of-words, yielding
            ``(term_id, weight)`` pairs (e.g. a TF-IDF model).
        dictionary: object providing ``doc2bow``.
        doc: list of tokens.
        dimensionality: length of the word vectors.

    Returns:
        list: the weighted average vector; a zero vector when no token
        contributes.
    """
    # Accept both a full model (vectors live in ``.wv``) and bare keyed vectors.
    keyed_vectors = getattr(model, 'wv', model)

    # term id -> weight of that term within this document
    term_weights = dict(weights[dictionary.doc2bow(doc)])

    docvector = [0] * dimensionality
    wordcount = 0
    for word in doc:
        token_bow = dictionary.doc2bow([word])
        if word not in keyed_vectors.vocab or len(token_bow) == 0:
            continue
        wordcount += 1
        w = term_weights.get(token_bow[0][0], 0)
        docvector = [x + (y * w) for x, y in zip(docvector, keyed_vectors[word])]

    # The original test was inverted (``== 0``): it divided by zero for
    # documents without known words and never averaged the others.
    if wordcount != 0:
        docvector = [x / wordcount for x in docvector]
    return docvector

def GetWeightedAverageW2VectorsCorpus(model, weights, dictionary, corpus, dimensionality):
    """Apply ``GetWeightedAverageW2Vector`` to every document of ``corpus``.

    Returns:
        list: one weighted document vector per document, in corpus order.
    """
    return [
        GetWeightedAverageW2Vector(model, weights, dictionary, document, dimensionality)
        for document in corpus
    ]

def GetP2VectorsCorpus(model, corpus):
    """Infer a vector for every document with ``model.infer_vector``.

    Returns:
        list: the inferred vectors, in corpus order.
    """
    return [model.infer_vector(document) for document in corpus]

### Obtaining simple search query results

In [5]:
def most_similar_docs(docs, model, doc, topn=10):
    """Find the stored documents closest to ``doc``.

    A vector is inferred for ``doc`` with ``model.infer_vector`` and looked up
    through ``model.docvecs.most_similar``; each returned index is mapped to
    the corresponding entry of ``docs``.

    Returns:
        list[tuple]: ``(docs[index], similarity)`` pairs in the order returned
        by the model.
    """
    query_vector = model.infer_vector(doc)
    neighbours = model.docvecs.most_similar([query_vector], topn=topn)
    return [(docs[index], score) for index, score in neighbours]

def most_similar_docs_w2v(docs, model, tfidfmodel, dictionary, vectors, doc, topn=10):
    """Rank pre-computed document vectors by cosine similarity to ``doc``.

    Args:
        docs: names of the indexed documents, aligned with ``vectors``.
        model: word-vector model used to embed the query.
        tfidfmodel: weight model used to embed the query.
        dictionary: dictionary matching ``tfidfmodel``.
        vectors: document vectors of the indexed corpus.
        doc: tokenized query document.
        topn: number of results to return.

    Returns:
        list[tuple]: the ``topn`` ``(name, similarity)`` pairs, best first.
    """
    if len(vectors) == 0:
        return []

    # The query must have the same length as the indexed vectors, so derive it
    # from them instead of hard-coding 200.
    dimensionality = len(vectors[0])
    query_vector = GetWeightedAverageW2VectorsCorpus(
        model, tfidfmodel, dictionary, [doc], dimensionality
    )[0]
    query_row = np.reshape(query_vector, (1, -1))  # loop-invariant

    # Names come from the ``docs`` argument rather than the notebook-level
    # ``corpus_names`` global.
    sims = [
        (docs[i], cosine_similarity(np.reshape(vectors[i], (1, -1)), query_row)[0][0])
        for i in range(len(vectors))
    ]
    return sorted(sims, key=lambda pair: pair[1], reverse=True)[:topn]

def most_similar_docs_lda_lsi_tfidf(docs, model, dictionary, topics, doc, topn=10):
    """Rank the indexed corpus by cosine similarity to ``doc`` in model space.

    The vectors of the indexed documents are recomputed on every call from
    the notebook-level ``corpus`` variable.

    Args:
        docs: names of the indexed documents, aligned with ``corpus``.
        model: LDA / LSI / TF-IDF style model accepted by ``GetVectors``.
        dictionary: dictionary matching ``model``.
        topics: length of the dense vectors built by ``GetVectors``.
        doc: tokenized query document.
        topn: number of results to return.

    Returns:
        list[tuple]: the ``topn`` ``(name, similarity)`` pairs, best first.
    """
    vectors = GetVectors(model, dictionary, topics, corpus)
    query_vector = GetVectors(model, dictionary, topics, [doc])[0]
    query_row = np.reshape(query_vector, (1, -1))  # loop-invariant

    # Names come from the ``docs`` argument rather than the notebook-level
    # ``corpus_names`` global.
    sims = [
        (docs[i], cosine_similarity(np.reshape(vectors[i], (1, -1)), query_row)[0][0])
        for i in range(len(vectors))
    ]
    return sorted(sims, key=lambda pair: pair[1], reverse=True)[:topn]

### Packing results

In [8]:
def pack_similar_documents(docs):
    """Return a new dict with the same keys and values as ``docs``.

    The copy is shallow: the values themselves are not duplicated.
    """
    return {key: docs[key] for key in docs.keys()}

def pack_sims(model_name, id, query_names, query_data, topn=10):
    """Collect the search results of one query document.

    The query is run through ``most_similar_docs_w2v`` using the notebook-level
    ``corpus_names``, ``waw2vmodel``, ``w2vtfidfmodel``, ``w2vdictionary`` and
    ``w2vvectors`` variables.

    Args:
        model_name: key under which the result list is stored.
        id: index of the query inside ``query_names`` / ``query_data``.
        query_names: names of the query documents.
        query_data: tokenized query documents, aligned with ``query_names``.
        topn: number of results to keep.

    Returns:
        dict: ``{'Filename': <query name>, 'Results': {model_name: [...]}}``.
    """
    # Other back-ends (most_similar_docs for p2v, most_similar_docs_lda_lsi_tfidf
    # for lsi / lda / tfidf) can be added to ``search_results`` in the same way.
    search_results = {}
    # Use the ``query_data`` argument; the global ``query_corpus`` was read
    # here before, silently ignoring what the caller passed in.
    search_results[model_name] = most_similar_docs_w2v(
        corpus_names, waw2vmodel, w2vtfidfmodel, w2vdictionary, w2vvectors,
        query_data[id], topn=topn
    )

    results = {}
    results['Filename'] = query_names[id]
    results['Results'] = pack_similar_documents(search_results)
    return results

def pack_it_all(model_name, filename, query_names, query_data, topn=10):
    """Run every query through ``pack_sims`` and dump the results as JSON.

    A progress line is printed per query. The JSON document (non-ASCII
    characters kept as-is) is written to ``filename + '.json'`` in UTF-8,
    followed by a newline.
    """
    packed = []
    for position, query_name in enumerate(query_names):
        packed.append(pack_sims(model_name, position, query_names, query_data, topn))
        print ('Packed {} - {}'.format(position, query_name))

    serialized = json.dumps(packed, ensure_ascii=False)

    with open(filename + '.json', 'w', encoding="utf-8") as out_file:
        out_file.write(serialized + '\n')

### Run

In [9]:
# Directory that holds the corpus files; it is concatenated directly with the
# file names below, hence the trailing separator.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
data_dir = "D:\\Разработка\\Python\\Julius\\data\\"

# Indexed (searched) collection: one document per line plus a line-aligned
# list of names; loaded into `corpus` / `corpus_names` in the "Read corpus" cell.
#mooc_preserved = 'corpus_mooc.txt'
mooc_corpus = 'courses.cor'
mooc_names = 'courses.cfn'

# Query collection, same two-file layout; loaded into `query_corpus` / `query_names`.
#rpd_preserved = 'test_rpds.txt'
rpd_names='docs.cfn'
rpd_corpus='docs.cor'

### Lemmatization

In [10]:
#morph = pymorphy2.MorphAnalyzer()
#%time build_lemmatized_corpora(data_dir, mooc_preserved, morph)
#%time build_lemmatized_corpora(data_dir, rpd_preserved, morph)

### Read corpus

In [11]:
# Indexed collection: token lists and, line-aligned with them, the raw names.
corpus = list(read_corpus(data_dir + mooc_corpus, tokens_only=True))
corpus_names = list(read_corpus(data_dir + mooc_names, preserve=True))

# Query collection, loaded the same way.
query_corpus = list(read_corpus(data_dir + rpd_corpus, tokens_only=True))
query_names = list(read_corpus(data_dir + rpd_names, preserve=True))

# Sanity check of the collection sizes.
print('Length corpus: ', len(corpus))
print('Length query_corpus', len(query_corpus))

Length corpus:  1276
Length query_corpus 29


### Train model

In [22]:
# TF-IDF model and its dictionary; later passed as the word weights when
# document vectors are averaged.
print('Training time TF-IDF model')
%time tfidfmodel, tfidfdictionary = TrainTFIDFModel(corpus)

# Word2Vec settings: vector length and number of training iterations.
# `dimensionality` is reused further down when document vectors are built.
dimensionality = 200
iterations = 55
print('Training time Word2Vec model')
%time waw2vmodel = TrainW2VModel(corpus, dimensionality, iterations)

# The remaining models are currently disabled.
#%time p2vmodel = TrainP2VModel(corpus, 50, 55)

# Read as a global by TrainLDAModel.
lda_topics = 25
#%time ldamodel, ldadictionary = TrainLDAModel(corpus)

# Read as a global by TrainLSIModel.
lsi_topics = 25
#%time lsimodel, lsidictionary = TrainLSIModel(corpus)

Training time TF-IDF model
Wall time: 1.4 s
Training time Word2Vec model
Wall time: 1min 2s


In [23]:
# Remove carriage returns from the corpus names: read_corpus(preserve=True)
# drops a single trailing character per line, so a '\r' can survive.
corpus_names = [s.replace('\r', '') for s in corpus_names]

# Vocabulary sizes of the two models, for comparison.
print('length vocabulary w2v:', len(waw2vmodel.wv.vocab))
print('length vocabulary tf-idf:', len(tfidfdictionary))

length vocabulary w2v: 9495
length vocabulary tf-idf: 28811


### Create OUT vectors from the w2v model

In [24]:
# Second keyed-vectors object that shares the vocabulary of the trained model
# but exposes a different weight matrix as its word vectors.
outv = gensim.models.KeyedVectors()
outv.vocab = waw2vmodel.wv.vocab  # same vocabulary object as the trained model
outv.index2word = waw2vmodel.wv.index2word  # same index -> word list
outv.syn0 = waw2vmodel.syn1neg  # different matrix: syn1neg (presumably the negative-sampling output weights — confirm for the gensim version in use)

### "IN - OUT" matrix example

In [25]:
# Word whose neighbours are compared across the two matrices.
test_word = 'язык'
print('Vocab:', test_word)

# Each query takes the word's vector from one object (waw2vmodel = IN,
# outv = OUT) and searches for its nearest neighbours in one of the two.
# Vector from waw2vmodel, neighbours searched in waw2vmodel.
print('IN - IN similar')
display(waw2vmodel.most_similar(positive=[waw2vmodel[test_word]]))
# Vector from outv, neighbours searched in outv.
print('OUT - OUT similar')
display(outv.most_similar(positive=[outv[test_word]]))
# Vector from outv, neighbours searched in waw2vmodel.
print('IN - OUT similar')
display(waw2vmodel.most_similar(positive=[outv[test_word]]))
# Vector from waw2vmodel, neighbours searched in outv.
print('OUT - IN similar')
display(outv.most_similar(positive=[waw2vmodel[test_word]]))

Vocab: язык
IN - IN similar


[('язык', 0.9999999403953552),
 ('синтаксис', 0.4423679709434509),
 ('интерпретатор', 0.41489899158477783),
 ('грамматик', 0.38827142119407654),
 ('лексика', 0.36520522832870483),
 ('лисп', 0.3629491329193115),
 ('java', 0.33973807096481323),
 ('грамматика', 0.33232033252716064),
 ('python', 0.3297003507614136),
 ('семантика', 0.3292355239391327)]

OUT - OUT similar


[('язык', 1.0),
 ('markup', 0.8441022038459778),
 ('письменность', 0.8416407108306885),
 ('modeling', 0.8401807546615601),
 ('полувычислимый', 0.8394925594329834),
 ('иероглиф', 0.837383508682251),
 ('турбо', 0.8345221281051636),
 ('эскизный', 0.8336510062217712),
 ('сокращённо', 0.8334447741508484),
 ('smalltalk', 0.8317399024963379)]

IN - OUT similar


[('фортран', 0.2721642851829529),
 ('sh', 0.24759642779827118),
 ('sml', 0.22450587153434753),
 ('autolisp', 0.22380654513835907),
 ('dcl', 0.21880140900611877),
 ('полувычислимый', 0.21604400873184204),
 ('tsg', 0.21088218688964844),
 ('ocl', 0.20522364974021912),
 ('автоматный', 0.19804765284061432),
 ('английский', 0.19609320163726807)]

OUT - IN similar


[('русский', 0.1406673789024353),
 ('английский', 0.12973621487617493),
 ('си', 0.11544454097747803),
 ('ocl', 0.11124130338430405),
 ('фортран', 0.09091351181268692),
 ('java', 0.08889391273260117),
 ('python', 0.08672577887773514),
 ('pascal', 0.08471836149692535),
 ('tsg', 0.08415255695581436),
 ('пролог', 0.08167275786399841)]

In [26]:
# Build the document vectors of the indexed corpus from the IN matrix
# (the trained model's own word vectors), weighted with TF-IDF.
w2vvectors = GetWeightedAverageW2VectorsCorpus(waw2vmodel, tfidfmodel, tfidfdictionary, corpus, dimensionality)

# Alternative: build the document vectors from the OUT matrix instead.
#w2vvectors = GetWeightedAverageW2VectorsCorpus(outv, tfidfmodel, tfidfdictionary, corpus, dimensionality)

### Export search results

In [27]:
# pack_sims reads these two names as globals; point them at the TF-IDF model
# and dictionary trained above.
w2vtfidfmodel = tfidfmodel
w2vdictionary = tfidfdictionary

# Run every query document and write the top-20 matches per query to
# 'w2v_xattab_in_in_200_i55.json'.
%time pack_it_all(model_name='w2v_xattab_in_in_200_i55', filename='w2v_xattab_in_in_200_i55', query_names=query_names, query_data=query_corpus, topn=20)

Packed 0 - 21_РПД_Теория вероятностей и математическая статистика
Packed 1 - 15_РПД_Защита информации
Packed 2 - 24_ РПД _Алгоритмы и анализ сложности 
Packed 3 - 4_РПД Экономика
Packed 4 - 30_ РПД _Объектно-ориентированный анализ и программирование
Packed 5 - РПД БИ Маг Переговоры
Packed 6 - РПД _Информационные системы и технологии ПИ_09.03.03(2)
Packed 7 - 6_РПД_Линейная алгебра и аналитическая геометрия
Packed 8 - 10_РПД _Базы данных
Packed 9 - 9_РПД_Операционные системы
Packed 10 - 41_1_РПД _Теория автоматов и формальных языков
Packed 11 - 11_РПД _Программирование
Packed 12 - 37_РПД _Программная инженерия
Packed 13 - 15_РПД_Сети и телекоммуникации
Packed 14 - 31_РПД _Программирование на Java
Packed 15 - РПД Менеджмент 38.03.05 очка
Packed 16 - РПД_ теория принятия решений_ маг БИ (вер2)
Packed 17 - 5_РПД _Математический анализ, Дифференциальные и разностные уравнения
Packed 18 - 35_РПД _ Локальные и глобальные вычислительные сети
Packed 19 - 45_2_РПД_Программирование .Net Framework