In [0]:
from google.colab import files
uploaded = files.upload()

Saving quora_question_pairs_rus.csv.zip to quora_question_pairs_rus.csv.zip


In [0]:
import zipfile
import io
import csv

In [0]:
zf = zipfile.ZipFile(io.BytesIO(uploaded['quora_question_pairs_rus.csv.zip']), "r")
zf.extractall()

In [0]:
import pandas as pd
pd.read_csv('quora_question_pairs_rus.csv')[:5]

Unnamed: 0.1,Unnamed: 0,question1,question2,is_duplicate
0,0,Какова история кохинор кох-и-ноор-бриллиант,"что произойдет, если правительство Индии украд...",0
1,1,как я могу увеличить скорость моего интернет-с...,как повысить скорость интернета путем взлома ч...,0
2,2,"почему я мысленно очень одинок, как я могу это...","найти остаток, когда математика 23 ^ 24 матема...",0
3,3,которые растворяют в воде быстро сахарную соль...,какая рыба выживет в соленой воде,0
4,4,астрология: я - луна-колпачок из козерога и кр...,Я тройная луна-козерог и восхождение в козерог...,1


In [0]:
with open('quora_question_pairs_rus.csv', 'r') as f:
  reader = csv.reader(f)
  corpus = list(reader)

In [0]:
corpus.pop(0)

['', 'question1', 'question2', 'is_duplicate']

In [0]:
for doc in corpus:
  doc = doc.pop(3)

In [0]:
for doc in corpus:
  doc = doc.pop(0)

In [0]:
corpus[:5]

[['Какова история кохинор кох-и-ноор-бриллиант',
  'что произойдет, если правительство Индии украдет кохинор кох-и-ноор-алмаз назад'],
 ['как я могу увеличить скорость моего интернет-соединения, используя vpn',
  'как повысить скорость интернета путем взлома через dns'],
 ['почему я мысленно очень одинок, как я могу это решить',
  'найти остаток, когда математика 23 ^ 24 математика разделена на 24 23'],
 ['которые растворяют в воде быстро сахарную соль метан и углеродный диоксид',
  'какая рыба выживет в соленой воде'],
 ['астрология: я - луна-колпачок из козерога и крышка, поднимающая то, что это говорит обо мне',
  'Я тройная луна-козерог и восхождение в козероге, что это говорит обо мне']]

In [0]:
from itertools import chain

In [0]:
corpus = list(chain.from_iterable(corpus))

In [0]:
corpus[:5]

['Какова история кохинор кох-и-ноор-бриллиант',
 'что произойдет, если правительство Индии украдет кохинор кох-и-ноор-алмаз назад',
 'как я могу увеличить скорость моего интернет-соединения, используя vpn',
 'как повысить скорость интернета путем взлома через dns',
 'почему я мысленно очень одинок, как я могу это решить']

# Preprocessing

In [0]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
mystopwords = stopwords.words('russian')
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
def preprocess(data):
  data = [word.lower().strip('!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n') for word in data if word not in mystopwords]

  return data

In [0]:
corpus = preprocess(corpus)

In [0]:
corpus[:5]

['какова история кохинор кох-и-ноор-бриллиант',
 'что произойдет, если правительство индии украдет кохинор кох-и-ноор-алмаз назад',
 'как я могу увеличить скорость моего интернет-соединения, используя vpn',
 'как повысить скорость интернета путем взлома через dns',
 'почему я мысленно очень одинок, как я могу это решить']

# Inverted Index

In [0]:
from collections import defaultdict

In [0]:
lemmas = []
for doc in corpus:
  lemmas.append(doc.split())

In [0]:
lemmas[:3]

[['какова', 'история', 'кохинор', 'кох-и-ноор-бриллиант'],
 ['что',
  'произойдет,',
  'если',
  'правительство',
  'индии',
  'украдет',
  'кохинор',
  'кох-и-ноор-алмаз',
  'назад'],
 ['как',
  'я',
  'могу',
  'увеличить',
  'скорость',
  'моего',
  'интернет-соединения,',
  'используя',
  'vpn']]

In [0]:
with open('Lemmas.txt', 'w', encoding='utf-8') as f:
  f.write(str(lemmas))

In [0]:
termdict = {}

In [0]:
for idx, element in enumerate(lemmas):
  termdict[idx] = element

In [0]:
termdict[10]

['я', 'должен', 'купить', 'tiago']

In [0]:
inv_idx = {}

In [0]:
inv_idx = defaultdict(list)
for k,value in termdict.items():
    for v in value:
        inv_idx[v].append(k)

In [0]:
inv_idx['каникулы']

[35417,
 35418,
 46956,
 47464,
 60238,
 113282,
 152069,
 156476,
 156477,
 157966,
 157967,
 210303,
 229658,
 284283,
 347097,
 423173,
 428202,
 439312,
 482061,
 512449,
 572439,
 678546,
 765026,
 769925]

In [0]:
import json

In [0]:
with open('Inverted_idx.json', 'w', encoding='utf-8') as fp:
  json.dump(inv_idx, fp)

# TF-IDF

In [0]:
from collections import Counter
import numpy as np

In [0]:
DF = {}
N = len(lemmas)
for i in range(N):
    tokens = lemmas[i]
    for w in tokens:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}

for i in DF:
    DF[i] = len(DF[i])

In [0]:
total_vocab_size = len(DF)

In [0]:
total_vocab_size

242396

In [0]:
def doc_freq(word):
    c = 0
    try:
        c = DF[word]
    except:
        pass
    return c

In [0]:
doc = 0

tf_idf = {}

for i in range(N):
    
    tokens = lemmas[i]
    
    counter = Counter(tokens)
    words_count = len(tokens)
    
    for token in np.unique(tokens):
        
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = np.log((N+1)/(df+1))
        
        tf_idf[doc, token] = tf*idf

    doc += 1

In [0]:
tf_idf

{(0, 'история'): 1.610920289566181,
 (0, 'какова'): 0.9707169073793591,
 (0, 'кох-и-ноор-бриллиант'): 2.998383787657047,
 (0, 'кохинор'): 2.759505926400188,
 (1, 'если'): 0.3279965025445213,
 (1, 'индии'): 0.4115829808475412,
 (1, 'кох-и-ноор-алмаз'): 1.389373419377131,
 (1, 'кохинор'): 1.2264470784000834,
 (1, 'назад'): 0.7698235190275403,
 (1, 'правительство'): 0.6933697547382738,
 (1, 'произойдет,'): 0.6173804117532455,
 (1, 'украдет'): 1.2803923912647168,
 (1, 'что'): 0.18376430137876557,
 (2, 'vpn'): 0.9001905391686179,
 (2, 'интернет-соединения,'): 1.389373419377131,
 (2, 'используя'): 0.6867332624528294,
 (2, 'как'): 0.14678551605978518,
 (2, 'могу'): 0.2501845617000761,
 (2, 'моего'): 0.5653536395847069,
 (2, 'скорость'): 0.7215456585972352,
 (2, 'увеличить'): 0.6122357817248782,
 (2, 'я'): 0.1866106056594821,
 (3, 'dns'): 1.298012154774261,
 (3, 'взлома'): 0.9530109122701459,
 (3, 'интернета'): 0.9489632360822658,
 (3, 'как'): 0.16513370556725834,
 (3, 'повысить'): 0.930484709

In [0]:
!pip install ujson

Collecting ujson
[?25l  Downloading https://files.pythonhosted.org/packages/16/c4/79f3409bc710559015464e5f49b9879430d8f87498ecdc335899732e5377/ujson-1.35.tar.gz (192kB)
[K     |█▊                              | 10kB 14.2MB/s eta 0:00:01[K     |███▍                            | 20kB 2.2MB/s eta 0:00:01[K     |█████▏                          | 30kB 3.2MB/s eta 0:00:01[K     |██████▉                         | 40kB 2.1MB/s eta 0:00:01[K     |████████▌                       | 51kB 2.2MB/s eta 0:00:01[K     |██████████▎                     | 61kB 2.7MB/s eta 0:00:01[K     |████████████                    | 71kB 2.5MB/s eta 0:00:01[K     |█████████████▋                  | 81kB 2.4MB/s eta 0:00:01[K     |███████████████▍                | 92kB 2.6MB/s eta 0:00:01[K     |█████████████████               | 102kB 2.5MB/s eta 0:00:01[K     |██████████████████▊             | 112kB 2.5MB/s eta 0:00:01[K     |████████████████████▌           | 122kB 2.5MB/s eta 0:00:01[K     |

In [0]:
import pprint

In [0]:
with open ('TF-IDF.json', 'w', encoding='utf-8') as file:
    pprint.pprint(tf_idf, indent=4, stream=file)
    file.close()

# TF-IDF Matching Score Ranking

In [0]:
def preprocess_query(query):
    query = query.lower()
    query = query.strip('!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n')
    
    return query

In [0]:
def matching_score(k, query):
    preprocessed_query = preprocess_query(query)
    tokens = str(preprocessed_query).split()

#    print("Matching Score")
#    print("\nQuery:", query)
#    print("")
#    print(tokens)
    
    query_weights = {}

    for key in tf_idf:
        
        if key[1] in tokens:
            try:
                query_weights[key[0]] += tf_idf[key]
            except:
                query_weights[key[0]] = tf_idf[key]
    
    query_weights = sorted(query_weights.items(), key=lambda x: x[1], reverse=True)

    print("")
    
    l = []
    
    for i in query_weights[:10]:
        l.append(i[0])
    
#    print(l)
    return(tokens, l)

In [0]:
matching_score(10, "рождественские каникулы")




(['рождественские', 'каникулы'],
 [729631,
  156476,
  163782,
  441804,
  527891,
  584126,
  618607,
  618608,
  778514,
  35417])

# TF-IDF Cosine Similarity Ranking

In [0]:
def cosine_sim(a, b):
    cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return cos_sim

In [0]:
#from itertools import islice

In [0]:
#def split_dict_equally(input_dict, chunks=100):
#    return_list = [dict() for idx in range(chunks)]
#    idx = 0
#    for k,v in input_dict.items():
#        return_list[idx][k] = v
#        if idx < chunks-1:  
#            idx += 1
#        else:
#            idx = 0
#    return return_list

In [0]:
#chunks = split_dict_equally(DF)

In [0]:
#for idx, element in enumerate(chunks[1]):
#  print(idx)

In [0]:
total_vocab = [x for x in DF]

In [0]:
D = np.zeros((N, total_vocab_size)) #session crushes after using all available RAM
for i in tf_idf:
    try:
        ind = total_vocab.index(i[1])
        D[i[0]][ind] = tf_idf[i]
    except:
        pass

In [0]:
def gen_vector(tokens):

    Q = np.zeros((len(total_vocab)))
    
    counter = Counter(tokens)
    words_count = len(tokens)

    query_weights = {}
    
    for token in np.unique(tokens):
        
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = math.log((N+1)/(df+1))

        try:
            ind = total_vocab.index(token)
            Q[ind] = tf*idf
        except:
            pass
    return Q

In [0]:
def cosine_similarity(k, query):
    print("Cosine Similarity")
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))
    
    print("\nQuery:", query)
    print("")
    print(tokens)
    
    d_cosines = []
    
    query_vector = gen_vector(tokens)
    
    for d in D:
        d_cosines.append(cosine_sim(query_vector, d))
        
    out = np.array(d_cosines).argsort()[-k:][::-1]
    
    print("")
    
    print(out)

In [0]:
Q = cosine_similarity(10, "рождественские каникулы")

# Cosine Similarity (SKlearn)

In [0]:
from sklearn.metrics.pairwise import cosine_similarity

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [0]:
documents = []
for idx, element in enumerate(lemmas):
  for el in element:
    documents.append(el)

In [0]:
cv = CountVectorizer()

In [0]:
cv.fit_transform(documents)

<7551870x183728 sparse matrix of type '<class 'numpy.int64'>'
	with 6811249 stored elements in Compressed Sparse Row format>

In [0]:
tf_matrix = cv.transform(documents).toarray() #session crushes after using all available RAM

In [0]:
tf_matrix.shape

In [0]:
tfidfTran = TfidfTransformer(norm="l2")
tfidfTran.fit(tf_matrix)

In [0]:
def idf(n,df):
    result = math.log((n+1.0)/(df+1.0)) + 1
    return result

In [0]:
tfidf_matrix = tfidfTran.transform(tf_matrix)

In [0]:
cos_similarity_matrix = (tfidf_matrix * tfidf_matrix.T).toarray()

In [0]:
cos_similarity_matrix

# Okapi BM25

In [0]:
from math import log

In [0]:
def compute_avdl(documents): 
    N = len(documents)
    sum_text_len = 0
    for text in documents:
        sum_text_len += len(text)
        
    avdl = sum_text_len / N
    avdl = round(avdl, 0)    
    return avdl

def compute_K(dl, avdl):
    k1 = 2.0
    b = 0.75
    return k1 * ((1-b) + b * (float(dl)/float(avdl)))

In [0]:
k1 = 2.0
b = 0.75
N = len(lemmas)
for q in inv_idx:
  n = len(inv_idx[q])

avdl = compute_avdl(lemmas)
dl = len(inv_idx)

fq = 0
for doc in lemmas:
  for word in doc:
        if q == word:
            fq += 1  

K = compute_K(dl, avdl)
IDF = log((N - n + 0.5) / (n + 0.5))
frac = ((k1 + 1) * fq) / (K + fq)

In [0]:
print(IDF * frac)

0.0009799926148190263


# Fasttext

In [0]:
import sys
import gensim
from gensim.models.keyedvectors import KeyedVectors
import zipfile

In [0]:
!wget http://vectors.nlpl.eu/repository/11/181.zip

--2019-10-24 22:06:58--  http://vectors.nlpl.eu/repository/11/181.zip
Resolving vectors.nlpl.eu (vectors.nlpl.eu)... 129.240.189.225
Connecting to vectors.nlpl.eu (vectors.nlpl.eu)|129.240.189.225|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2622716217 (2.4G) [application/zip]
Saving to: ‘181.zip’


2019-10-24 22:09:25 (17.1 MB/s) - ‘181.zip’ saved [2622716217/2622716217]



In [0]:
!unzip 181.zip -d 'fasttext_model'

Archive:  181.zip
  inflating: fasttext_model/meta.json  
  inflating: fasttext_model/model.model  
  inflating: fasttext_model/model.model.vectors_ngrams.npy  
  inflating: fasttext_model/model.model.vectors.npy  
  inflating: fasttext_model/model.model.vectors_vocab.npy  
  inflating: fasttext_model/README   


In [0]:
model_file = './fasttext_model/model.model'

In [0]:
model= KeyedVectors.load(model_file)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
lemmas_new = list(chain.from_iterable(lemmas))

In [0]:
# создаем маски для векторов 
lemmas_vectors = np.zeros((len(lemmas_new), model.vector_size))
vec = np.zeros((model.vector_size,))

In [0]:
for idx, lemma in enumerate(lemmas_new):
    if lemma in model.wv:
        lemmas_vectors[idx] = model.wv[lemma]

  


AttributeError: ignored

# ELMO

In [0]:
!wget http://vectors.nlpl.eu/repository/11/170.zip

--2019-10-24 22:21:22--  http://vectors.nlpl.eu/repository/11/170.zip
Resolving vectors.nlpl.eu (vectors.nlpl.eu)... 129.240.189.225
Connecting to vectors.nlpl.eu (vectors.nlpl.eu)|129.240.189.225|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 474536163 (453M) [application/zip]
Saving to: ‘170.zip’


2019-10-24 22:21:54 (14.3 MB/s) - ‘170.zip’ saved [474536163/474536163]



In [0]:
!mkdir rus_model
!cp 170.zip rus_model
!ls rus_model

170.zip


In [0]:
from zipfile import ZipFile
with ZipFile('rus_model/170.zip', 'r') as zipObj:
  zipObj.extractall('./rus_model')

In [0]:
!rm rus_model/170.zip

In [0]:
!ls rus_model

char.dic     encoder.pkl  README	      word.dic
config.json  meta.json	  token_embedder.pkl


In [0]:
!git clone https://github.com/HIT-SCIR/ELMoForManyLangs

Cloning into 'ELMoForManyLangs'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 174 (delta 0), reused 2 (delta 0), pack-reused 171
Receiving objects: 100% (174/174), 86.67 KiB | 1.84 MiB/s, done.
Resolving deltas: 100% (85/85), done.


In [0]:
!head rus_model/config.json

{"seed": 1, "gpu": 2, "train_path": "/users4/conll18st/raw_text/Russian/ru-20m.raw", "valid_path": null, "test_path": null, "config_path": "/users4/conll18st/elmo/configs/cnn_50_100_512_4096_sample.json", "word_embedding": null, "optimizer": "adam", "lr": 0.001, "lr_decay": 0.8, "model": "/users4/conll18st/elmo/src/final_models/ru.model", "batch_size": 32, "max_epoch": 10, "clip_grad": 5, "max_sent_len": 20, "min_count": 3, "max_vocab_size": 150000, "save_classify_layer": false, "valid_size": 0, "eval_steps": 10000}


In [0]:
!pip install -e ELMoForManyLangs/

Obtaining file:///content/ELMoForManyLangs
Collecting overrides
  Downloading https://files.pythonhosted.org/packages/7a/b2/2cb6a3fc8ee1dc8617e07e476be19723748ddfcce0c6b9db7a5f2d5b9598/overrides-2.0.tar.gz
Building wheels for collected packages: overrides
  Building wheel for overrides (setup.py) ... [?25l[?25hdone
  Created wheel for overrides: filename=overrides-2.0-cp36-none-any.whl size=4222 sha256=c172c1de6129c8b7d0ffdd3a3b045e79f1971a28558ade8a83c95399f1a775b3
  Stored in directory: /root/.cache/pip/wheels/67/ab/57/d68b6dad468ff96b792770a83229451add2b347b0c12a10300
Successfully built overrides
Installing collected packages: overrides, elmoformanylangs
  Running setup.py develop for elmoformanylangs
Successfully installed elmoformanylangs overrides-2.0


In [0]:
!ls ELMoForManyLangs/configs

cnn_0_100_512_4096_sample.json	cnn_50_100_512_4096_sample.json


In [0]:
import json
with open('rus_model/config.json') as json_file:
    data = json.load(json_file)

In [0]:
data['config_path'] = '../ELMoForManyLangs/configs/cnn_50_100_512_4096_sample.json'

In [0]:
with open('rus_model/config.json', 'w') as outfile:
    json.dump(data, outfile)

In [0]:
from ELMoForManyLangs.elmoformanylangs import Embedder
e = Embedder('./rus_model/')

2019-10-24 22:23:27,547 INFO: char embedding size: 3896
2019-10-24 22:23:29,098 INFO: word embedding size: 329681
2019-10-24 22:23:32,030 INFO: Model(
  (token_embedder): ConvTokenEmbedder(
    (word_emb_layer): EmbeddingLayer(
      (embedding): Embedding(329681, 100, padding_idx=3)
    )
    (char_emb_layer): EmbeddingLayer(
      (embedding): Embedding(3896, 50, padding_idx=3893)
    )
    (convolutions): ModuleList(
      (0): Conv1d(50, 32, kernel_size=(1,), stride=(1,))
      (1): Conv1d(50, 32, kernel_size=(2,), stride=(1,))
      (2): Conv1d(50, 64, kernel_size=(3,), stride=(1,))
      (3): Conv1d(50, 128, kernel_size=(4,), stride=(1,))
      (4): Conv1d(50, 256, kernel_size=(5,), stride=(1,))
      (5): Conv1d(50, 512, kernel_size=(6,), stride=(1,))
      (6): Conv1d(50, 1024, kernel_size=(7,), stride=(1,))
    )
    (highways): Highway(
      (_layers): ModuleList(
        (0): Linear(in_features=2048, out_features=4096, bias=True)
        (1): Linear(in_features=2048, out_fe

In [0]:
res = e.sents2elmo(lemmas_new) #session crushes after using all available RAM

In [0]:
len(res)

In [0]:
import json

# Flask

In [0]:
import socket
print(socket.gethostbyname(socket.getfqdn(socket.gethostname())))

from flask import Flask
from flask import url_for, render_template, request, redirect

172.28.0.2


In [0]:
app = Flask(__name__)


def preprocess_query(query):
    query = query.lower()
    query = query.strip('!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n')
    
    return query

  
def matching_score(k, query):
    preprocessed_query = preprocess_query(query)
    tokens = str(preprocessed_query).split()
    
    query_weights = {}

    for key in tf_idf:
        
        if key[1] in tokens:
            try:
                query_weights[key[0]] += tf_idf[key]
            except:
                query_weights[key[0]] = tf_idf[key]
    
    query_weights = sorted(query_weights.items(), key=lambda x: x[1], reverse=True)

    print("")
    
    l = []
    
    for i in query_weights[:10]:
        l.append(i[0])
    
    return tokens, l 

  
@app.route('/')
def index():
    urls = {'main_page': url_for('index'),
           'results_data': url_for('results')}
    return render_template('index.html', urls=urls)
  
@app.route('/results')
def show_results():
    with open('TF-IDF.json', 'r', encoding = 'utf-8') as file:
          tf_idf = file.read()
    input_query = request.args
    input_query = preprocess_query(input_query)
    tokens, l = matching_score(10, input_query)    
    return render_template('results.html', query=query, tokens=tokens, l=l)


if __name__ == '__main__':
    app.run(debug=True) 

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on


OSError: ignored

In [0]:
import threading
threading.Thread(target=app.run, kwargs={'host':'0.0.0.0','port':80}).start() 

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on


 * Running on http://0.0.0.0:80/ (Press CTRL+C to quit)
