In [1]:
!pip install pymorphy2 -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m609.1 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for docopt (setup.py) ... [?25l[?25hdone


In [2]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random
import pymorphy2

In [3]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
df = pd.read_csv('/content/drive/MyDrive/МФТИ/nlp2/chatbot/Data/dialogues.csv')
df

Unnamed: 0,Q,A,Context
0,What was I doing? Your Uncle Marshall was tak...,"(on the phone) hey, so you know how I've alwa...",What was I doing? Your Uncle Marshall was tak...
1,"Hey, you wanna do something tonight?","Okay, meet me at the bar in fifteen minutes, ...","Hey, you wanna do something tonight? | Hey."
2,Hey.,Where's your suit!? Just once when I say suit...,Hey. | I did that one time.
3,I did that one time.,It was a blazer!,"I did that one time. | You know, ever since ..."
4,"You know, ever since college it's been Marsha...",I see what this is about. Have you forgotten ...,"You know, ever since college it's been Marsha..."
...,...,...,...
4849,"In the spring of 2013, Barney and Robin were j...",The Ho-be-gone Sleep System,"In the spring of 2013, Barney and Robin were j..."
4850,My son.,"Ted, wait.",My son. | now you don't have to go to the ba...
4851,Coat rack?,And Liddy would be all...,Coat rack? | Really?
4852,catastrophic,"Mind-bl*wing entertainment, like a naked f*re...",catastrophic | I told you no clown.


In [5]:
def lemma_tokenizer(text):
    lemmatizer = pymorphy2.MorphAnalyzer()
    token_text = word_tokenize(text)
    lemma_text = [lemmatizer.parse(token)[0].normal_form for token in token_text]
    return lemma_text

vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=5024, tokenizer=lemma_tokenizer)
matrix_tfidf = vectorizer.fit_transform(df[['Q', 'A', 'Context']].apply(lambda x: ' '.join(x.astype(str)), axis=1))



In [6]:
def get_indices_by_cosine_similarity(text, matrix, top_k=5, relevant=True):
    query_vector = vectorizer.transform([text])
    cosine_similarities = cosine_similarity(query_vector, matrix).flatten()
    sorted_indices = np.argsort(cosine_similarities, axis=0)
    relevant_indices = sorted_indices[::-1][:top_k] if relevant else sorted_indices[:top_k]
    return relevant_indices

def get_most_relevant(text, matrix, text_list, top_k=2):
    inds = get_indices_by_cosine_similarity(text, matrix, top_k=top_k, relevant=True)
    return [text_list[ind] for ind in inds]

def get_least_relevant(text, matrix, text_list, top_k=2):
    inds = get_indices_by_cosine_similarity(text, matrix, top_k=top_k, relevant=False)
    return [text_list[ind] for ind in inds]

def get_relevant_phrase(text, matrix, text_list, relevantness=1.0, rel_random=0):
    cosine_similarities = cosine_similarity(vectorizer.transform([text]), matrix).flatten()
    relevant_indices = np.argsort(cosine_similarities, axis=0)
    k_random = random.random() * rel_random
    relevantness = min(1, relevantness + k_random)
    ind = relevant_indices[int((len(relevant_indices) - 1) * relevantness)]
    return text_list[ind]

In [7]:
count_labels = 3
df["label"] = 0
data_frames = [df]

for i in range(1, count_labels):
    df_new = df.copy()
    relevantness = 1 - i / (count_labels - 1)
    df_new.loc[df_new.index, "label"] = i

    df_new["A"] = df_new["A"].apply(get_relevant_phrase, args=(matrix_tfidf, df_new['A']), relevantness=relevantness, rel_random=0.05)
    data_frames.append(df_new)

In [8]:
combined_df = pd.concat(data_frames, axis=0)

In [9]:
combined_df.to_csv("Barney.csv", index_label=False)