In [1]:
from sentence_transformers import SentenceTransformer, util
import os
import csv
import time
import torch
import pickle
from tqdm.notebook import tqdm

In [2]:
# Directory that holds train.csv and the cached artifacts used in this notebook.
# Keep the trailing slash: several cells build paths as `DATASET_DIR + filename`.
DATASET_DIR = "../quora-question-pairs-dataset/"

In [6]:
# Collect every distinct, non-empty question from both columns of the
# question-pair training file.
dataset_path = os.path.join(DATASET_DIR, "train.csv")

unique_questions = set()
# newline="" lets the csv module itself handle line breaks embedded in quoted
# fields; the explicit encoding avoids depending on the platform default codec.
# assumes train.csv is UTF-8 encoded - TODO confirm
with open(dataset_path, encoding="utf-8", newline="") as fIn:
    reader = csv.DictReader(fIn)
    for row in tqdm(reader):
        for key in ('question1', 'question2'):
            que = row[key]
            if que:  # skips empty strings and fields missing from short rows (None)
                unique_questions.add(que)

# List positions are used as corpus ids by the search cells, so the order must
# be reproducible: set iteration order for strings differs between Python
# processes, sorting does not. Embeddings cached on disk must be regenerated
# from the same list they are used with.
corpus_sentences = sorted(unique_questions)

0it [00:00, ?it/s]

In [8]:
# Cache the question list on disk so later sessions can skip re-reading the CSV.
with open(DATASET_DIR + 'corpus_sentences.list', 'wb') as list_file:
    pickle.dump(corpus_sentences, list_file)

In [3]:
# Restore the cached question list written by the pickle.dump cell.
# NOTE: unpickling can execute arbitrary code; only load a file that this
# notebook produced itself.
with open(DATASET_DIR + 'corpus_sentences.list', 'rb') as list_file:
    corpus_sentences = pickle.load(list_file)

In [4]:
# Number of questions in the loaded corpus.
len(corpus_sentences)

537361

In [29]:
# Sentence-embedding model used for the corpus, for search(), and for the
# question/answer matching cells; the German and Chinese queries further down
# are encoded with this same model.
model_name = 'quora-distilbert-multilingual'
model = SentenceTransformer(model_name)

In [16]:
# Embed the whole corpus into a single tensor (progress bar enabled).
# NOTE(review): `num_workers` is presumably accepted only by older
# sentence-transformers releases - confirm against the installed version.
# NOTE(review): the recorded progress bar shows 16793 batches, which is
# 537361 / 32 rounded up; batch_size=1024 would give 525, so the stored output
# does not correspond to this call as written.
corpus_embeddings = model.encode(corpus_sentences, show_progress_bar=True, convert_to_tensor=True, num_workers=4, batch_size=1024)

Batches:   0%|          | 0/16793 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Uncomment after a complete encode run to cache the embeddings on disk:
# torch.save(corpus_embeddings, DATASET_DIR+'corpus_embeddings.pt')

In [6]:
# Load previously cached corpus embeddings instead of re-encoding.
# assumes the file holds embeddings for corpus_sentences in the same order,
# since search() indexes corpus_sentences by the returned corpus_id - TODO confirm
corpus_embeddings = torch.load(DATASET_DIR+'corpus_embeddings.pt')

In [9]:
def search(inp_question, top_k=10):
    """Print the corpus questions most similar to ``inp_question``.

    The query is embedded with the notebook-level ``model`` and compared
    against ``corpus_embeddings`` through ``util.semantic_search``; each hit's
    ``corpus_id`` is mapped back to its text via ``corpus_sentences``.

    Args:
        inp_question: Query text handed to ``model.encode``.
        top_k: Maximum number of hits requested from the search (default 10).

    Returns:
        None. The query, the elapsed encode+search time and one
        ``score<TAB>question`` line per hit are printed.
    """
    # perf_counter is a monotonic clock intended for measuring intervals;
    # time.time() follows the wall clock and can jump if the system time changes.
    start_time = time.perf_counter()
    question_embedding = model.encode(inp_question, convert_to_tensor=True)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    elapsed = time.perf_counter() - start_time

    print("Input question:", inp_question)
    print("Results (after {:.3f} seconds):".format(elapsed))
    # Only one query was submitted, so its hits are the first (and only) entry.
    for hit in hits[0]:
        print("\t{:.3f}\t{}".format(hit['score'], corpus_sentences[hit['corpus_id']]))

In [13]:
# English query against the question corpus.
search("How can I learn Python online?")

Input question: How can I learn Python online?
Results (after 0.981 seconds):
	0.988	How and from where can I learn Python language via video tutorials online?
	0.988	Which is the best and free way to learn Python?
	0.987	What is the best online resources to learn Python?
	0.986	What are the best sources for learning Python online?
	0.983	Which is the best resource to learn Python? Online, books, video tutes?


In [17]:
# Same query in German ("How can I learn Python online?").
search("Wie kann ich Python online lernen?")

Input question: Wie kann ich Python online lernen?
Results (after 1.157 seconds):
	0.989	What are the best sources for learning Python online?
	0.989	What is the best online resources to learn Python?
	0.988	How and from where can I learn Python language via video tutorials online?
	0.987	Which is the best and free way to learn Python?
	0.985	Which is the best resource to learn Python? Online, books, video tutes?


In [10]:
# Same query in Chinese ("How can I learn Python online?").
search("如何在线学习Python")

Input question: 如何在线学习Python
Results (after 0.991 seconds):
	0.987	How can I start learning the developing of websites using Python?
	0.985	Which is the best and free way to learn Python?
	0.983	How and from where can I learn Python language via video tutorials online?
	0.983	What are the best sources for learning Python online?
	0.983	What is the best online resources to learn Python?


## Chat bot test

In [3]:
# Separate model used by the cells in this section to embed the candidate
# answers and the user query.
info_model = SentenceTransformer('msmarco-distilroberta-base-v2')

In [4]:
# Candidate answers for the chat bot; a query is matched directly against the
# embeddings of these sentences.
# NOTE(review): the BERT sentence is garbled ("is a from Transformers is a") and
# the last entry reads "the matching the questions". These strings are embedded,
# so correcting them changes the embeddings and possibly the recorded results.
answer_set = ["This is a forum to answer question related to programming",
            "I am an assistant bot which can answer your questions about this forum",
            "The special thing about this forum is that it uses NLP to prevent redundant questions.",
            "We use BERT to compare and find questions that are similar.",
            "Bidirectional Encoder Representations or BERT is a from Transformers is a Transformer-based machine learning technique for natural language processing pre-training developed by Google.",
            "If your question matches some which already exists on our forum, you will be shown the matching the questions. If your answer is still not available, you can decide and still post your question."
            ]

In [11]:
# Number of candidate answers.
len(answer_set)

6

In [7]:
# Embed all candidate answers at once with info_model, returned as a tensor.
answer_embedding = info_model.encode(answer_set, show_progress_bar=True, convert_to_tensor=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
# Sanity check: the recorded shape is one row per entry of answer_set.
answer_embedding.shape

torch.Size([6, 768])

In [57]:
# Test query for the answer-matching experiment.
query = "Who are you?"

In [58]:
# Embed the query with the same model that embedded answer_set.
query_embedding = info_model.encode(query, convert_to_tensor=True)

In [59]:
# Cosine similarity between the query and every candidate answer.
scores = util.pytorch_cos_sim(query_embedding, answer_embedding)

In [60]:
# Pick the answer with the highest similarity score.
# assumes scores has a single row, so the flat argmax index equals the index
# into answer_set - TODO confirm
# NOTE(review): the recorded output for "Who are you?" is the BERT sentence, not
# the bot's self-description, so direct query-to-answer matching looks unreliable.
answer_set[scores.argmax().item()]

'We use BERT to compare and find questions that are similar.'

## Question answer set

In [28]:
# Canned question -> answer pairs for the forum assistant.
# The keys are what gets embedded and matched against the user's query, so they
# are left exactly as they were; only the user-facing answer texts had their
# typos and garbled wording corrected.
question_answer_set = {
    "What is the forum for?": "This is a forum to answer questions related to programming",
    "Hello": "Hello! How are you today?",
    "Who are you?": "I am an assistant bot which can answer your questions about this forum",
    "What is special about this forum?": "The special thing about this forum is that it uses NLP to prevent redundant questions, even if it is in a different language.",
    "How does the site find similar question?": "We use BERT to compare and find questions that are similar.",
    "What is BERT?": "Bidirectional Encoder Representations from Transformers, or BERT, is a Transformer-based machine learning technique for natural language processing pre-training developed by Google.",
    "How can I post questions on this blog?": "You can enter your question in the ask-question box. If a similar question already exists, we will show you the existing questions and prompt you again.",
    "What if my question already exist?": "If your question matches one that already exists on our forum, you will be shown the matching questions. If your answer is still not available, you can still decide to post your question.",
}

In [117]:
import json

In [121]:
# Export the question/answer pairs as indented JSON in the working directory.
with open("question_answer.json", "w") as json_file:
    json.dump(question_answer_set, json_file, indent=4)

In [31]:
# Number of canned question/answer pairs.
len(question_answer_set)

8

In [43]:
# Known questions in dict insertion order; the corpus_id returned by
# semantic_search is an index into this list.
questions = list(question_answer_set)

In [44]:
# Embed the known questions (not the answers) with `model`, the same model that
# later encodes the user's query in this section.
question_answer_embedding = model.encode(questions, show_progress_bar=True, convert_to_tensor=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [45]:
# Sanity check: the recorded shape is one row per known question.
question_answer_embedding.shape

torch.Size([8, 768])

In [110]:
# Test query; it is not one of the keys of question_answer_set.
query = "How are you"

In [111]:
# Embed the query and rank the known questions against it.
# No top_k is passed, so the library default applies.
question_embedding = model.encode(query, convert_to_tensor=True)
hits = util.semantic_search(question_embedding, question_answer_embedding)

In [112]:
# Keep only the hit list of the single query that was submitted.
# NOTE(review): this cell is not idempotent - running it a second time replaces
# the list with its first hit dict and breaks the cells below. Re-run the
# semantic_search cell first.
hits = hits[0]

In [126]:
# Inspect the hits; in the recorded output they are ordered by descending score.
hits

[{'corpus_id': 2, 'score': 0.8867191},
 {'corpus_id': 5, 'score': 0.8249192},
 {'corpus_id': 4, 'score': 0.82229954},
 {'corpus_id': 6, 'score': 0.8110063},
 {'corpus_id': 7, 'score': 0.7986413},
 {'corpus_id': 3, 'score': 0.7799907},
 {'corpus_id': 1, 'score': 0.7749933},
 {'corpus_id': 0, 'score': 0.71964705}]

In [122]:
# Known question matched by the first (highest-scoring in the recorded output) hit.
questions[hits[0]['corpus_id']]

'Who are you?'

In [114]:
# Look up the canned answer that belongs to the best-matching known question.
question_answer_set[questions[hits[0]['corpus_id']]]

'I am an assistant bot which can answer your questions about this forum'

In [115]:
# NOTE(review): presumably the fallback reply for queries with no good match;
# nothing in this notebook uses it yet - confirm the intended score threshold.
"Sorry, I did not get your question. Can you be more specific?"

'Sorry, I did not get your question. Can you be more specific?'