In [None]:
import json
import os

import numpy as np
import pandas as pd


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# Функция для преобработки строк
def preprocess_sentence(sentence):
   # Tokenize
   tokens = word_tokenize(sentence.lower())
   
   # Убираем слова-паразиты (stopwords)
   stop_words = set(stopwords.words('english'))

   tokens = [token for token in tokens if token not in stop_words]
   
   # Лематизация
   lemmatizer = WordNetLemmatizer()
   tokens = [lemmatizer.lemmatize(token) for token in tokens]
   
   return ' '.join(tokens)


def get_most_similar_sentence(user_input, df):
    """Fit a TF-IDF model on the ``phrase`` column and return it with its matrix.

    Args:
        user_input: Not used by this function.
        df: DataFrame with a ``phrase`` column holding the documents to index.

    Returns:
        A ``(tfidf_matrix, vectorizer)`` tuple: the result of
        ``fit_transform`` on the phrases and the fitted ``TfidfVectorizer``.

    NOTE(review): the documents are taken from ``df['phrase']`` as-is, not from
    a preprocessed column — confirm this is the intended input.
    """
    tfidf_settings = {
        'max_df': 0.3,            # ignore terms found in more than 30% of the documents
        'stop_words': 'english',  # remove English stop words
        'lowercase': True,        # convert everything to lower case
        'use_idf': True,          # weight term frequencies by inverse document frequency
        'norm': 'l2',             # L2-normalize every document vector
        'smooth_idf': True,       # smoothed idf, prevents divide-by-zero errors
    }
    documents = df['phrase'].tolist()
    vectorizer = TfidfVectorizer(**tfidf_settings)

    # Learn the vocabulary and build the document-term matrix in one pass
    tfidf_matrix = vectorizer.fit_transform(documents)
    return tfidf_matrix, vectorizer

In [None]:
def search(tfidf_matrix, model, request, top_k=5):
    """Rank the indexed documents against a text query.

    Args:
        tfidf_matrix: Sparse document-term matrix, one row per document.
        model: Fitted vectorizer; only ``model.transform`` is used, and it must
            produce vectors in the same feature space as ``tfidf_matrix``.
        request: Query text.
        top_k: Maximum number of results to return (default 5, the value that
            used to be hard-coded). Values below 1 yield empty results.

    Returns:
        ``(indices, sim)``: row positions of the best-matching documents in
        descending score order, and the matching scores. Each score is the dot
        product of the query vector with a document row (cosine similarity
        when both are L2-normalized).
    """
    query_vector = model.transform([request])

    # Use the sparse-aware `@` operator: np.dot does not understand scipy
    # sparse containers and only works by accident for some of them.
    scores = (query_vector @ tfidf_matrix.T).toarray().ravel()

    top_k = max(int(top_k), 0)
    # Sort once and reuse the order for the scores so both outputs stay aligned.
    indices = np.argsort(scores)[::-1][:top_k]
    return indices, scores[indices]


def print_result(request_content, indices, X, sim):
    """Print the query followed by one line per search hit.

    Args:
        request_content: The query text that was searched for.
        indices: Row positions of the hits, best first.
        X: DataFrame with a ``url_picture`` column; its rows must be in the
            same order as the documents the positions refer to.
        sim: Similarity scores, aligned with ``indices``.
    """
    print('\nSearch: ' + request_content)
    print('\nBest Results')

    pictures = X['url_picture']
    for position, score in zip(indices, sim):
        # The hits are row positions, so look them up with .iloc; .loc would
        # treat them as index labels and fail or return the wrong row whenever
        # the frame does not carry the default 0..n-1 index.
        print('id = {0:5d} - phrase = {1} - similarity = {2}'.format(
            position, pictures.iloc[position], score))
        
            

In [None]:
def input_json(file_name, base_dir='C:\\Visual Studio programs'):
    """Build a query string from the leading messages of a JSON dialogue file.

    The file must contain a ``dialogue`` list whose items have ``message`` and
    ``share_photo`` keys. Messages are concatenated, each followed by a space,
    until the first item whose ``share_photo`` is not ``False``.

    Args:
        file_name: Name of the JSON file, resolved against ``base_dir``.
        base_dir: Directory the file is looked up in. Defaults to the
            location that used to be hard-coded.

    Returns:
        The concatenated messages. If the file cannot be opened, is not valid
        JSON, or lacks the expected structure, a notice is printed and the
        result of asking the user again via ``talking()`` is returned.
    """
    path = os.path.join(base_dir, file_name)
    try:
        # JSON is UTF-8 by specification; 'utf-8-sig' additionally tolerates a
        # BOM and avoids depending on the platform's locale encoding.
        with open(path, encoding='utf-8-sig') as f:
            data_j = json.load(f)

        messages = ''
        for turn in data_j['dialogue']:
            # `!= False` (not `is not False`) is kept on purpose so that the
            # comparison semantics of the stored value are unchanged.
            if turn['share_photo'] != False:
                break
            messages += turn['message'] + ' '
        return messages
    except (OSError, ValueError, KeyError, TypeError):
        # Only recoverable input problems are handled here (missing/unreadable
        # file, malformed JSON, unexpected structure); KeyboardInterrupt and
        # other unrelated errors are no longer swallowed.
        print('Файла не существует! Либо вы его не подгрузили')
        return talking()

In [None]:
def talking():
    """Prompt the user and return the text to search for.

    The user may type either a free-text query or the name of a JSON dialogue
    file. Input is stripped of surrounding whitespace. Only input that *ends*
    with ``.json`` (case-insensitive) is treated as a file name and handed to
    ``input_json``; a query that merely mentions ".json" somewhere is searched
    as text.

    Returns:
        The query string: either the typed text or the text produced by
        ``input_json`` from the named file.
    """
    print('''
        Если у вас файл json то введите его название 
        Например: file_with_dia.json
        Если вы хотите ввести запрос (описание картинки) то введите его
        Например: i want to see a burger
        Если вы хотите ввести запрос, диалог как то ещё, сообщите разработчику
        ''')

    arg = input().strip()
    if arg.lower().endswith('.json'):
        return input_json(arg)
    return arg

In [None]:
# Load the phrase table from a semicolon-separated CSV file.
# The path is an absolute Windows location; adjust it when running elsewhere.
# Later cells read the 'phrase' and 'url_picture' columns of this frame.
df = pd.read_csv('C:\\Visual Studio programs\\rdata1.csv', sep=';')

In [None]:
# Add a column holding the result of preprocess_sentence for every phrase.
# NOTE(review): no visible cell reads 'phrase_after_preprocess'; confirm
# whether the TF-IDF step was meant to use this column.
df['phrase_after_preprocess'] = df['phrase'].apply(preprocess_sentence)

In [None]:
# Fit the TF-IDF model on the phrase table and keep both the matrix and the vectorizer.
# The first argument is not used by get_most_similar_sentence; an empty
# string is passed as a placeholder.
tfidf_matrix, vectorizer = get_most_similar_sentence('', df)

In [None]:
# Ask the user for a query (free text or the name of a JSON dialogue file),
# rank the indexed phrases against it and print the best matches.
user_input = talking()
ind, sim = search(tfidf_matrix, vectorizer, user_input)
print_result(user_input, ind, df, sim)