<!-- ## Подготовка данных -->

## Импорты

In [1]:
import pandas as pd

import nltk
import spacy
import re
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

from collections import defaultdict

from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse
import pickle
import os

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from Levenshtein import distance as levenshtein_distance


nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vsevolod026/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/vsevolod026/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Подготовка данных

### Код для выполнения

#### Импорт

In [3]:
file_path = "data/IMDB Dataset.csv"
df = pd.read_csv(file_path)


df = df[["review"]]
df = df.reset_index(drop=True)


duplicates_count = df.duplicated(subset=["review"]).sum()
print(f"Number of duplicate reviews: {duplicates_count}")

df = df.drop_duplicates(subset=["review"]).reset_index(drop=True)

Number of duplicate reviews: 418


#### Очистка

In [2]:
nlp = spacy.load("en_core_web_sm")


def preprocess_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()

    tokens = nltk.word_tokenize(text)

    stop_words = set(stopwords.words("english"))
    tokens = [token.lower() for token in tokens if token.lower() not in stop_words]

    doc = nlp(" ".join(tokens))
    tokens = [token.lemma_ for token in doc]

    tokens = [re.sub(r"\W+", "", token) for token in tokens]
    tokens = [token for token in tokens if token]

    return " ".join(tokens)


In [75]:
preprocess_text("Hello my name is Tony Montana. </br> Im political tourist from Cuba")


'hello name tony montana I m political tourist cuba'

In [None]:
output_file_path = "data/cleaned_IMDB_dataset.csv"

df["processed_review"] = df["review"].apply(preprocess_text)

df.to_csv(output_file_path, index=True)


  text = BeautifulSoup(text, "html.parser").get_text()


#### инвертированный индекс

In [12]:
tokenized_documents = df["processed_review"].apply(lambda x: x.split()).tolist()

inverted_index = defaultdict(list)

for doc_id, tokens in enumerate(tokenized_documents):
    for token in set(tokens):
        inverted_index[token].append(doc_id)

print("Inverted Index created:")
for token, doc_ids in list(inverted_index.items())[:10]:
    print(f"{token}: {doc_ids}")


with open("data/inverted_index.pkl", "wb") as file:
    pickle.dump(inverted_index, file)

Inverted Index created:
experience: [0, 12, 41, 69, 71, 79, 93, 101, 122, 124, 135, 163, 182, 235, 254, 256, 267, 271, 310, 327, 328, 350, 411, 421, 424, 435, 476, 485, 503, 514, 518, 591, 595, 611, 672, 693, 699, 705, 715, 727, 759, 762, 768, 784, 786, 790, 795, 814, 837, 860, 870, 875, 883, 884, 888, 915, 944, 953, 972, 979, 991, 995, 1013, 1039, 1041, 1069, 1105, 1115, 1139, 1143, 1242, 1247, 1272, 1275, 1294, 1328, 1334, 1357, 1401, 1430, 1433, 1436, 1443, 1453, 1508, 1512, 1513, 1527, 1533, 1539, 1563, 1578, 1584, 1587, 1606, 1626, 1660, 1663, 1686, 1710, 1712, 1726, 1760, 1817, 1831, 1858, 1883, 1910, 1957, 1960, 2004, 2016, 2022, 2028, 2101, 2109, 2155, 2190, 2200, 2208, 2248, 2264, 2285, 2302, 2354, 2372, 2383, 2384, 2389, 2430, 2453, 2482, 2489, 2526, 2532, 2545, 2556, 2567, 2568, 2570, 2584, 2586, 2593, 2608, 2622, 2634, 2638, 2641, 2674, 2680, 2687, 2699, 2727, 2736, 2738, 2740, 2779, 2811, 2819, 2839, 2864, 2869, 2888, 2904, 2916, 2925, 2935, 2937, 2938, 2962, 2973, 2995, 3

### Код для импорта 

In [13]:
tokenized_documents = df["processed_review"].apply(lambda x: x.split()).tolist()

cleaned_dataset_path = "data/cleaned_IMDB_dataset.csv"
df = pd.read_csv(cleaned_dataset_path)
print("Cleaned IMDB dataset loaded successfully.")

with open("data/inverted_index.pkl", "rb") as file:
    inverted_index = pickle.load(file)

print("Inverted index loaded successfully.")

corpus_tokens = set(inverted_index.keys())
print("Corpus tokens initialized")

Cleaned IMDB dataset loaded successfully.
Inverted index loaded successfully.
Corpus tokens initialized


## TF-IDF

### Код для запуска

1. Мы тренируем TF-IDF. Получаем 2 артефакта. разряженный список, по которому можно делать векторный поиск и сам натренированный алгоритм
2. Сама функция поиска состоит из следующих шагов:
- Преобразование текста в токены аналогично преобразованию, которое мы применяли к документам
- Для каждого токена из запроса ищем токен из корпуса документов, применяя расстояние левинштейна, чтобы защититься от опечаток
- По инвертированному индексу ищем все документы в которых есть хотя бы один токен из запроса
- Для всех найденных документов проверяем сходство по TF-IDF и выводим 10 лучших вариантов

In [None]:
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df["processed_review"])

model_path = "data/tfidf_model.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, "wb") as model_file:
    pickle.dump(tfidf_vectorizer, model_file)
print(f"TF-IDF model saved to: {model_path}")

sparse_matrix_path = "data/tfidf_sparse_matrix.npz"
scipy.sparse.save_npz(sparse_matrix_path, tfidf_matrix)
print(f"Sparse matrix saved to: {sparse_matrix_path}")


TF-IDF model saved to: data/tfidf_model.pkl
Sparse matrix saved to: data/tfidf_sparse_matrix.npz


### Код для импорта

In [5]:
tfidf_model_path = "data/tfidf_model.pkl"
with open(tfidf_model_path, "rb") as model_file:
    tfidf_vectorizer = pickle.load(model_file)
print("TF-IDF model loaded successfully.")

tfidf_sparse_matrix_path = "data/tfidf_sparse_matrix.npz"
tfidf_matrix = scipy.sparse.load_npz(tfidf_sparse_matrix_path)
print("TF-IDF sparse matrix loaded successfully.")


TF-IDF model loaded successfully.
TF-IDF sparse matrix loaded successfully.


## Функция для ранжирования

In [6]:
from sklearn.metrics.pairwise import cosine_similarity
from Levenshtein import distance as levenshtein_distance


def get_preprocessed_tokens(document):
    return preprocess_text(document).split()


def correct_token(query_tokens, corpus_tokens):
    corrected_tokens = []
    for token in query_tokens:
        closest_match = min(corpus_tokens, key=lambda t: levenshtein_distance(token, t))
        corrected_tokens.append(closest_match)
    return corrected_tokens


def find_documents(corrected_tokens, inverted_index):
    matched_docs = set()
    for token in corrected_tokens:
        if token in inverted_index:
            matched_docs.update(inverted_index[token])
    return list(matched_docs)


def tf_idf_similarity_function(query, matched_docs):
    query_vector = tfidf_vectorizer.transform([query])

    doc_vectors = tfidf_matrix[matched_docs]

    similarities = cosine_similarity(query_vector, doc_vectors).flatten()
    return similarities


# 4. Main search function
def search(query, inverted_index, corpus_tokens, similarity_function, *args, **kwargs):
    query_tokens = get_preprocessed_tokens(query)
    query_cleaned = " ".join(query_tokens)

    corrected_tokens = correct_token(query_tokens, corpus_tokens)

    matched_docs = find_documents(corrected_tokens, inverted_index)

    similarities = similarity_function(query_cleaned, matched_docs, *args, **kwargs)

    top_indices = np.argsort(similarities)[::-1][:10]
    ranked_docs = [(matched_docs[idx], similarities[idx]) for idx in top_indices]

    return ranked_docs


top_documents = search(
    query="amazing movie",
    inverted_index=inverted_index,
    corpus_tokens=corpus_tokens,
    similarity_function=tf_idf_similarity_function,
)

print("Top 10 results:")
for doc_id, similarity in top_documents:
    print(f"Document ID: {doc_id}, Similarity: {similarity}")
    print(df.loc[doc_id, "review"])


Top 10 results:
Document ID: 18273, Similarity: 0.4574404114383907
Mani sir as usual brings out another amazing story with Kannathil Muthamittal. Such an amazing relationship between parents and child is brought out in a beautiful fashion. Mani Sir as usual without much special effects and not much outdoor shoots.(In fact this was the only movie where he went outside India ever..that too just to sri lanka).Mani's class is written all over the movie...and to add to it ARR's music..which is just amazing...Vellai Pookal is one of my most fav songs ever... Maddy,who is what he is in the film industry has impressed a lot too. Starting from alaipayuthey ,to kannathil to ayutha ezuthu to guru.. Mani ratnam has showed to the world what a versatile actor Maddy is. Simran has been really good too. She has showed that she can act too in non-glamorous and character roles. In all an amazing movie. Sad that the tamil public could not appreciate this gr8 movie and it bombed at the box-office....
Docu

## Glove

In [14]:
def load_glove_model(file_path):
    """
    Load GloVe model from a file.
    Args:
        file_path (str): Path to the GloVe model file (e.g., 'glove.6B.50d.txt').
    Returns:
        dict: A dictionary mapping words to their GloVe embeddings.
    """
    glove_model = {}
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            parts = line.split()
            word = parts[0]
            embedding = np.array(parts[1:], dtype=float)
            glove_model[word] = embedding
    return glove_model


def document_to_vector(words, glove_model):
    embeddings = [glove_model[word] for word in words if word in glove_model]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros((50))


glove_file_path = "data/glove.6B.50d.txt"
glove = load_glove_model(glove_file_path)


document_vectors = []
for tokens in tokenized_documents:
    document_vectors.append(document_to_vector(tokens, glove))
document_vectors = np.array(document_vectors)

In [36]:
input_text = "This is a simple test"
embedding = document_to_vector(input_text, glove)


print("GloVe Embedding:", embedding)


GloVe Embedding: [-0.03686     0.75582875  0.76508506  0.58474687  0.50978062 -0.11958975
  0.24396625 -0.61615125 -0.24093987 -0.05985969 -0.09420925  0.26297606
 -0.69521062 -0.48871412  0.28641341  0.03732463 -0.37501987  0.0801185
 -0.37139312 -0.34715    -0.31390673 -0.042878    0.42755581  0.2995705
  0.06056513 -1.08568019 -0.39526625  0.04601656  0.20413812 -0.21705837
  2.749675    0.14222144 -0.26805012  0.5845475  -0.08239794 -0.6834675
  0.33371375 -0.42074687  0.24567887 -0.22065     0.66057607 -0.0789435
 -0.09266925  0.02766412  0.04214122  0.18513188  0.22095281 -0.2764525
  0.37738662  0.62757937]


In [18]:
def query_to_vector(query, glove):
    query_tokens = get_preprocessed_tokens(query)
    return document_to_vector(query_tokens, glove)


def glove_similarity_function(query, matched_docs):
    query_vector = query_to_vector(query, glove)

    doc_vectors = document_vectors[matched_docs]

    similarities = cosine_similarity([query_vector], doc_vectors).flatten()

    return similarities


top_documents = search(
    query="brings out another amazing story",
    inverted_index=inverted_index,
    corpus_tokens=corpus_tokens,
    similarity_function=glove_similarity_function,
)

print("Top 10 results:")
for doc_id, similarity in top_documents:
    print(f"Document ID: {doc_id}, Similarity: {similarity}")
    print(df.loc[doc_id, "review"])


Top 10 results:
Document ID: 9154, Similarity: 0.9675494242332053
So far only the first episode has been shown, and a great fuss has been made about the lesbian sex scenes. But for those who bother to look past that they will find an incredibly beautiful love story and one that has in this episode ended in an upsetting climax/cliffhanger. I have found the story so powerful that I have been inspired to read the novel on which this fantastic series has been based.
Document ID: 24074, Similarity: 0.9644505428217939
This is the finest film ever made to deal with the subject of AIDS. It's a documentary about two men living with and dying of this illness. The film is beautiful, heartbreaking, funny, and incredibly moving. Above all, it is an amazing true love story. Be sure to have a few hankies ready before you watch this movie---you will need them. Extraordinary.
Document ID: 18152, Similarity: 0.9631051728569089
I love horror films, but I think they work way better when they hide a dramat

## Выводим результат

In [19]:
queries = [
    "amazing movie with great acting",
    "terrible plot and bad acting",
    "a masterpiece with stunning visuals",
    "boring and too slow-paced",
    "funniest comedy of the year",
    "best action scenes ever",
    "a touching story of friendship",
    "overrated and not worth the hype",
    "underrated gem with brilliant writing",
    "romantic movie with a happy ending",
    "horrifying and thrilling experience",
    "predictable plot but good performances",
    "great soundtrack and memorable dialogues",
    "beautiful cinematography and compelling characters",
    "the worst movie I have ever seen",
    "great sequel that surpassed the original",
    "mediocre film with poor direction",
    "amazing performance by the lead actor",
    "a thought-provoking and emotional drama",
    "visually stunning but lacks depth",
    "exciting and action-packed adventure",
    "terribly written with cringe-worthy dialogue",
    "family-friendly movie with heartwarming moments",
    "a unique and original story",
    "excellent adaptation of the book",
    "forgettable and uninspired movie",
    "a gripping thriller with unexpected twists",
    "classic film that stands the test of time",
    "disappointing ending but overall enjoyable",
    "a movie that changed my perspective",
]


def write_results_to_file(output_file, queries, tfidf_results, glove_results):
    with open(output_file, "w", encoding="utf-8") as f:
        for i, query in enumerate(queries):
            f.write("**********\n")
            f.write(f"**{query}**\n")
            f.write("**********\n\n")

            f.write("tfIdf:\n")
            for rank, (doc_id, similarity) in enumerate(tfidf_results[i], start=1):
                review_text = df.iloc[doc_id]["review"]
                f.write(
                    f"{rank}) {review_text.strip()} (Similarity: {similarity:.4f})\n"
                )
            f.write("------------\n\n")

            f.write("glove:\n")
            for rank, (doc_id, similarity) in enumerate(glove_results[i], start=1):
                review_text = df.iloc[doc_id]["review"]
                f.write(
                    f"{rank}) {review_text.strip()} (Similarity: {similarity:.4f})\n"
                )
            f.write("\n")


tfidf_results = []
glove_results = []

for query in queries:
    tfidf_top_documents = search(
        query=query,
        inverted_index=inverted_index,
        corpus_tokens=corpus_tokens,
        similarity_function=tf_idf_similarity_function,
    )[:5]
    tfidf_results.append(tfidf_top_documents)

    glove_top_documents = search(
        query=query,
        inverted_index=inverted_index,
        corpus_tokens=corpus_tokens,
        similarity_function=glove_similarity_function,
    )[:5]
    glove_results.append(glove_top_documents)

output_file = "query_results.txt"
write_results_to_file(output_file, queries, tfidf_results, glove_results)

print(f"Results written to {output_file}")


Results written to query_results.txt


## Оцениваем по метрикам

### Подготовливаем данные

In [None]:
import random


def write_random_reviews_to_file(output_file, queries, df):
    with open(output_file, "w", encoding="utf-8") as f:
        for i, query in enumerate(queries):
            f.write("-------\n")
            f.write(f'"query": "{query}",\n')

            random_reviews = df.sample(n=8, random_state=i)
            review_ids = random_reviews.index.tolist()
            f.write(f'"ids": {review_ids}\n')

            for doc_id in review_ids:
                review_text = df.loc[doc_id, "review"]
                f.write(f"{doc_id}: {review_text.strip()}\n")

            f.write("-------\n\n")


output_file = "queries_with_random_reviews.txt"

write_random_reviews_to_file(output_file, queries, df)

print(f"Random reviews written to {output_file}")


Random reviews written to queries_with_random_reviews.txt


### Функции для оценивания

In [37]:
def precision(reviews_ranged, reviews_scored):
    k = sum(reviews_scored.values())
    reviews_top_k = reviews_ranged[:k]
    relevance_top_k = map(lambda id: reviews_scored[id], reviews_top_k)
    relevant_in_k = sum(relevance_top_k)
    return relevant_in_k / k


precision([1, 2, 3], {1: 1, 2: 1, 3: 0})

1.0

In [None]:
def ndcg(reviews_ranged, reviews_scored):
    dcg = 0.0
    for i, review_id in enumerate(reviews_ranged):
        relevance = reviews_scored.get(review_id, 0)
        dcg += relevance / np.log2(i + 2)

    ideal_relevances = sorted(reviews_scored.values(), reverse=True)[:k]
    idcg = sum(rel / np.log2(i + 2) for i, rel in enumerate(ideal_relevances))

    if idcg == 0:
        return 0.0

    ndcg = dcg / idcg
    return ndcg


reviews_ranged = [166, 27909, 35746, 1872, 12703, 16108, 26783, 25131]
reviews_scored = {
    166: 1,
    27909: 1,
    35746: 0,
    1872: 0,
    12703: 0,
    16108: 0,
    26783: 0,
    25131: 0,
}

ndcg_score = ndcg(reviews_ranged, reviews_scored)
print(f"NDCG: {ndcg_score:.4f}")


NDCG: 1.0000


In [54]:
def mean_reciprocal_rank(reviews_ranged, reviews_scored):
    for i, review_id in enumerate(reviews_ranged):
        relevance = reviews_scored.get(review_id, 0)
        if relevance == 1:
            return 1 / (i + 1)

    return 0.0


### Оценка

In [47]:
import json5

with open("train_data.json", "r") as file:
    data_test = json5.load(file)

for entry in data_test:
    ids = entry["ids"]
    ids_int_key = {int(key): value for key, value in ids.items()}
    entry["ids"] = ids_int_key
data_test[0]

{'query': 'amazing movie with great acting',
 'ids': {166: 1,
  27909: 1,
  35746: 1,
  1872: 0,
  12703: 0,
  16108: 0,
  26783: 0,
  25131: 0}}

In [None]:
def get_ranged_ids(similarity_func, query, ids):
    scores = similarity_func(query, ids)
    id_score_dict = {ids[i]: scores[i] for i in range(len(ids))}
    key_func = lambda id: id_score_dict[id]
    return sorted(ids, key=key_func)


def get_metrics(data_test):
    methods = [
        (glove_similarity_function, "glove"),
        (tf_idf_similarity_function, "tf-idf"),
    ]
    results = {
        "tf-idf": {"ndcg": 0, "mrr": 0, "map": 0},
        "glove": {"ndcg": 0, "mrr": 0, "map": 0},
    }
    num_entries = len(data_test)

    for entry_test in data_test:
        query = entry_test["query"]
        reviews_scored = entry_test["ids"]
        ids = list(reviews_scored.keys())

        for similarity_function, method_name in methods:
            reviews_ranged = get_ranged_ids(similarity_function, query, ids)

            ndcg_value = ndcg(reviews_ranged, reviews_scored)
            mrr = mean_reciprocal_rank(reviews_ranged, reviews_scored)
            map_score = precision(reviews_ranged, reviews_scored)

            results[method_name]["ndcg"] += ndcg_value
            results[method_name]["mrr"] += mrr
            results[method_name]["map"] += map_score

    for method_name in results:
        for metric in results[method_name]:
            results[method_name][metric] /= num_entries

    return results


get_metrics(data_test)

{'tf-idf': {'ndcg': 0.6191576550098508,
  'mrr': 0.5284832451499119,
  'map': 0.3024691358024691},
 'glove': {'ndcg': 0.5767705148166244,
  'mrr': 0.42993827160493836,
  'map': 0.23086419753086418}}