# Prerequisites

In [None]:
# Mount Google Drive at /content/gdrive (requires the google.colab package).
from google.colab import drive
drive.mount('/content/gdrive')

In [97]:
import os
from os import listdir
from os.path import join
import random
import re
import numpy as np
import json
from operator import itemgetter
import string
from typing import Dict, List, Set, Tuple

In [98]:
# Root of the data folder.
# NOTE(review): this path is relative while Drive is mounted at
# '/content/gdrive', so it resolves only when the working directory is
# '/content' - confirm.
main_path = 'gdrive/MyDrive/bot/data'

# fastText vectors file.
ft_path = join(main_path, 'fasttext/cc.pl.100.bin')

# Vectors generated from open_subtitles.
vectors_path = join(main_path, 'open_subtitles/1MB_vectors')

# JSON dictionary built from the corpus; it consists of pairs
# "word": [line_number, another_line_number, ...].
index_dict_path = join(main_path, 'sentence_similarity/index_dict2.json')

# Corpus containing only the most popular words.
corpus_with_populars_path = join(main_path, 'sentence_similarity/corpus_with_populars2.txt')

## fastText

In [None]:
!pip install fasttext

In [100]:
import fasttext

In [None]:
# Load the fastText model from ft_path; `ft` is used as a global for word
# vectors (get_sentence_vector) and nearest neighbours (synonym helpers).
ft = fasttext.load_model(ft_path)

## Bert

In [None]:
!pip install transformers
!pip install sacremoses

In [104]:
from transformers import AutoTokenizer, AutoModel
from transformers import BertForMaskedLM
from transformers import BertTokenizer
from transformers import pipeline

In [105]:
def get_pred_model(model_name: str = "allegro/herbert-large-cased"):
    """Build a fill-mask pipeline ready for use.

    Args:
        model_name: Name or path of the pretrained checkpoint handed to both
            ``from_pretrained`` calls. Defaults to the checkpoint that was
            previously hard-coded, so existing calls behave as before.

    Returns:
        The "fill-mask" pipeline created from the loaded model and tokenizer.
    """
    # Model and tokenizer are loaded from the same checkpoint name.
    model = BertForMaskedLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return pipeline("fill-mask", model=model, tokenizer=tokenizer)

In [None]:
# Build the fill-mask pipeline once; generate_answer reads it as a global.
bert_model = get_pred_model()

## Morfeusz

In [None]:
!pip install morfeusz2

In [108]:
import morfeusz2
# Shared Morfeusz instance; used below through morf.analyse and morf.generate.
morf = morfeusz2.Morfeusz()

# Source sentence_similarity

In [110]:
# Cache of word -> fastText vector, filled lazily by get_sentence_vector.
word_vectors_dict = {}

In [111]:
def get_sentence_vector(sentence: str) -> np.ndarray:
    """Return the mean fastText vector of the words in a sentence.

    The sentence is split on whitespace and every non-word character and
    underscore is removed from each token. Tokens that end up empty (for
    example a stand-alone "!") are skipped, so they are neither sent to
    fastText nor counted in the divisor. Word vectors are cached in
    word_vectors_dict to speed up later calls.

    The sentence vector is the sum of the word vectors divided by the
    number of words. When no word is left, a zero vector of the model's
    dimension is returned instead of dividing by zero.
    """
    cleaned_tokens = (re.sub(r'[\W_]+', '', token) for token in sentence.split())
    words = [word for word in cleaned_tokens if word]
    if not words:
        return np.zeros(ft.get_dimension())

    word_vectors = []
    for word in words:
        if word not in word_vectors_dict:
            word_vectors_dict[word] = ft.get_word_vector(word)
        word_vectors.append(word_vectors_dict[word])

    return np.sum(np.asarray(word_vectors), axis=0) / len(words)
    

In [36]:
def read_indexes_dict() -> Dict[str, List[int]]:
    """Load the index dictionary stored as JSON at index_dict_path."""
    with open(index_dict_path, mode="r", encoding="utf-8") as json_file:
        return json.load(json_file)

In [37]:
# Index loaded once from JSON; similarity_by_embeddings looks words up in it.
index_dict = read_indexes_dict()

In [44]:
def load_corpus_line_offset() -> List[int]:
    """Read the corpus with popular words once and list where each line starts.

    The file is opened in binary mode, so every offset is a byte position
    (not a count of decoded characters): entry i is the sum of the byte
    lengths of all lines before line i.
    """
    line_offset = []
    offset = 0
    with open(corpus_with_populars_path, "rb") as corpus:
        for line in corpus:
            line_offset.append(offset)
            offset += len(line)
    return line_offset

In [42]:
# Start offset of every corpus line; load_lines_from_corpora seeks with these.
line_offset = load_corpus_line_offset()

In [43]:
def find_cosine_similarity(vector_A: np.ndarray, vector_B: np.ndarray) -> float:
    """Calculate the cosine similarity of two vectors.

    Returns the dot product divided by the product of the two Euclidean
    norms. When that product is zero or not finite (a zero vector or a NaN
    input), 0.0 is returned so callers that rank by similarity never have
    to compare against NaN.
    """
    denominator = np.linalg.norm(vector_A) * np.linalg.norm(vector_B)
    if denominator == 0 or not np.isfinite(denominator):
        return 0.0
    return np.dot(vector_A, vector_B) / denominator

In [95]:
def get_synonyms_with_weights(word: str, k: int = 5) -> Tuple[List[str], List[float]]:
    """Get synonyms of the given word, and their weights, using fastText.

    Args:
        word: Word whose nearest neighbours are requested.
        k: Number of neighbours to request. Defaults to 5, the value that
            was previously hard-coded.

    Returns:
        Two parallel lists: the neighbour words with every character from
        string.punctuation removed, and the weight fastText reported for
        each of them.
    """
    neighbours = ft.get_nearest_neighbors(word, k=k)
    # Build the translation table once instead of once per neighbour.
    strip_punctuation = str.maketrans("", "", string.punctuation)
    synonyms = [neighbour[1].translate(strip_punctuation) for neighbour in neighbours]
    weights = [neighbour[0] for neighbour in neighbours]
    return synonyms, weights


def get_all_synonyms(word: str) -> List[str]:
    """Return only the synonym list of a word, leaving the weights out."""
    synonyms, _weights = get_synonyms_with_weights(word)
    return synonyms


def get_best_synonym(word: str) -> str:
    """Return the synonym that carries the highest weight."""
    synonyms, weights = get_synonyms_with_weights(word)
    # Position of the first largest weight, matching weights.index(max(...)).
    best_position = max(range(len(weights)), key=weights.__getitem__)
    return synonyms[best_position]


def get_random_synonym(word: str) -> str:
    """Pick one of the fastText-generated synonyms of a word at random."""
    candidates = get_all_synonyms(word)
    return random.choice(candidates)


def get_synonyms_for_all_sentence(words: List[str]) -> Set[str]:
    """Collect the synonyms of every word in the 'words' list into one set."""
    unique_synonyms = set()
    for word in words:
        unique_synonyms.update(get_all_synonyms(word))
    return unique_synonyms


In [50]:
def similarity_by_embeddings(input_message: str, words: List[str]) -> str:
    """Find the best answer in the corpora using cosine similarity
    between the input message and lines from the corpora.

    Candidate lines are the corpus lines indexed under the synonyms of
    'words'; the candidate whose sentence vector is most similar to the
    vector of 'input_message' is returned.

    Raises:
        IndexError: If no candidate line was found. The exception type is
            the one the former ``[0]`` lookup raised; only the message is
            now explicit.
    """
    # NOTE(review): the index is queried with whatever
    # get_synonyms_for_all_sentence returns; the input words themselves are
    # not added explicitly - confirm this is intended.
    synonyms = get_synonyms_for_all_sentence(words)
    mini_index_dict = {
        synonym: set(index_dict[synonym])
        for synonym in synonyms
        if synonym in index_dict
    }
    corpora_sentences = load_lines_from_corpora(mini_index_dict)
    if not corpora_sentences:
        raise IndexError("no corpus sentence found for the given words")

    input_message_vector = get_sentence_vector(input_message)
    # max() picks the first highest-scoring sentence without sorting them all.
    return max(
        corpora_sentences,
        key=lambda sentence: find_cosine_similarity(
            input_message_vector, get_sentence_vector(sentence)
        ),
    )


def load_lines_from_corpora(mini_index_dict: Dict[str, List[int]]) -> Set[str]:
    """Load the corpus lines whose numbers appear in the given index.

    Args:
        mini_index_dict: Maps words to collections of line numbers. Only
            the values are used; any iterable of ints works.

    Returns:
        The set of lines read, exactly as returned by readline(). Lines
        that cannot be read or decoded are skipped.

    Raises:
        IndexError: If a line number is outside line_offset.
    """
    # Merge the line numbers first so a line shared by several words is read
    # only once, and visit them in ascending order.
    line_numbers = sorted(set().union(*mini_index_dict.values()))
    corpora_sentences = set()
    with open(corpus_with_populars_path, "r", encoding="utf-8") as file:
        for line_number in line_numbers:
            # NOTE(review): line_offset is built by load_corpus_line_offset
            # from a binary read (byte offsets) but is used here with a
            # text-mode seek; consider reading in binary mode and decoding.
            file.seek(line_offset[line_number])
            try:
                corpora_sentences.add(file.readline())
            except (UnicodeDecodeError, OSError):
                # Best effort: skip a line that cannot be read or decoded,
                # without swallowing KeyboardInterrupt or SystemExit.
                continue
    return corpora_sentences


In [49]:
# Example call: the message means "I really like reading magazines"; the
# second argument lists its key words.
similarity_by_embeddings(
    "Bardzo lubię czytać czasopisma", ["lubię", "czytać", "czasopisma"]
)


'Naprawdę lubisz czytać komiksy !\n'

# Source word_generator

In [112]:
def find_all_subjects_and_verbs(sentence: str) -> Tuple[List[str], List[str]]:
    """Collect subjects and verbs from the Morfeusz analysis of a sentence.

    The first ':'-separated segment of each interpretation's tag decides the
    category: "subst" marks a subject, any of the verb markers below marks a
    verb. Subjects are taken from element 0 of the interpretation and verbs
    from element 1 (presumably the surface form and the lemma - confirm
    against the Morfeusz documentation).
    """
    verb_markers = {
        "verb",
        "refl",
        "nonrefl",
        "perf",
        "imperf",
        "imperf.perf",
        "praet",
        "inf",
        "fin",
    }
    found_subjects = []
    found_verbs = []
    for _start, _end, interpretation in morf.analyse(sentence):
        leading_tag = interpretation[2].split(":")[0]
        if leading_tag == "subst":
            found_subjects.append(interpretation[0])
            continue
        if leading_tag in verb_markers:
            found_verbs.append(interpretation[1])
    return found_subjects, found_verbs


In [113]:
def get_first_person(verb: str) -> str:
    """Return the first generated form of a verb tagged first person singular.

    A generated form qualifies when its tag (element 2) contains both "sg"
    and "pri". An empty string is returned when no form qualifies.
    """
    first_person_forms = (
        form[0]
        for form in morf.generate(verb)
        if "sg" in form[2] and "pri" in form[2]
    )
    return next(first_person_forms, "")


In [114]:
def get_noun_and_verb(sentence: str) -> Tuple[str, str]:
    """Get a random noun synonym and a random first-person verb for a sentence.

    Returns:
        A (noun, verb) pair. The noun is a random synonym of one of the
        sentence's nouns, or "." when none is available. The verb is the
        first person singular form of one of the sentence's verbs, or
        "jestem" when none is available.
    """
    nouns, verbs = find_all_subjects_and_verbs(sentence)
    # get_first_person returns "" when it finds no matching form; drop those
    # so a list holding only empty strings still triggers the fallback verb.
    verbs_first = [form for form in map(get_first_person, verbs) if form]
    # A synonym can be "" once its punctuation has been stripped; drop those
    # for the same reason.
    nouns_synonyms = [synonym for synonym in map(get_random_synonym, nouns) if synonym]
    random_noun = random.choice(nouns_synonyms) if nouns_synonyms else "."
    random_verb = random.choice(verbs_first) if verbs_first else "jestem"
    return random_noun, random_verb


In [129]:
def generate_answer(sentence: str) -> str:
    """Generate an answer with the Bert model from a noun synonym and a
    verb in first person singular, filling one mask at a time.
    """
    noun, verb = get_noun_and_verb(sentence)
    mask = bert_model.tokenizer.mask_token

    # Step 1: mask between verb and noun; take the first returned prediction.
    middle_filled = bert_model(f"{verb} {mask} {noun}")[0]["sequence"]
    # Step 2: mask in front; the second returned prediction (index 1) is used.
    # NOTE(review): index 1 rather than 0 looks deliberate - confirm.
    front_filled = bert_model(f"{mask} {middle_filled}")[1]["sequence"]
    # Step 3: mask at the end, followed by a full stop; first prediction again.
    return bert_model(f"{front_filled} {mask}.")[0]["sequence"]


In [None]:
# Demo question: "Do you like eating chocolate?"
generate_answer("Czy lubisz jeść czekoladę?")

In [None]:
# Demo question: "Would you like to be able to fly?"
generate_answer("Czy chciałbyś umieć latać?")

In [None]:
# Demo question: "Where are you going on holiday?"
generate_answer("Gdzie jedziesz na wakacje?")

In [None]:
# Demo question: "What time do you wake up?"
generate_answer("O której się budzisz?")