In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def clean_metadata(df: pd.DataFrame) -> pd.Series:
    """Build one free-text document per row from genres, keywords and overview.

    The ``genres`` and ``keywords`` columns are expected to hold comma-separated
    strings; each is split on ``","``, the pieces are stripped and re-joined
    with single spaces. The two results and ``overview`` are then concatenated
    with a single space between them.

    Any non-string cell (for example NaN) in any of the three columns is
    treated as an empty string, so a missing overview no longer erases the
    genres and keywords of that row.

    Side effect: ``df`` is modified in place. The columns ``genres_str``,
    ``keywords_str`` and ``combined_metadata`` are added or overwritten.

    Args:
        df: Frame with ``genres``, ``keywords`` and ``overview`` columns.

    Returns:
        The ``combined_metadata`` column of ``df``.
    """

    def extract_names(field):
        # Anything that is not a string (NaN, None, numbers) contributes no text.
        if isinstance(field, str):
            names = [name.strip() for name in field.split(',')]
            return " ".join(names)
        return ""

    def text_or_empty(value):
        return value if isinstance(value, str) else ""

    df["genres_str"] = df["genres"].apply(extract_names)
    df["keywords_str"] = df["keywords"].apply(extract_names)

    # Sanitise the overview *before* concatenating: str + NaN yields NaN for the
    # whole row, which would discard the genre and keyword text as well.
    overview_text = df["overview"].apply(text_or_empty)

    df["combined_metadata"] = (
        df["genres_str"] + " " +
        df["keywords_str"] + " " +
        overview_text
    )
    return df["combined_metadata"]

In [3]:
def vectorize_metadata(df: pd.DataFrame, column: str = "combined_metadata") -> tuple:
    """Turn one text column of ``df`` into a TF-IDF document-term matrix.

    A fresh ``TfidfVectorizer`` is configured with English stop-word removal
    and a vocabulary capped at 5000 features, then fitted on the column and
    used to transform it in a single ``fit_transform`` call.

    Args:
        df: Frame containing the text to vectorize.
        column: Name of the column holding one document per row.

    Returns:
        A ``(matrix, vectorizer)`` pair: the output of ``fit_transform`` on the
        column, and the fitted vectorizer that produced it.
    """
    vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
    documents = df[column]
    return vectorizer.fit_transform(documents), vectorizer

In [4]:
# def combine_similarity_matrix(tfidf_matrix: np.ndarray) -> np.ndarray:
#     similarity_matrix = cosine_similarity(tfidf_matrix)
#     return similarity_matrix

In [5]:
# def combine_top_k_similarity_matrix(tfidf_matrix: sparse.csr_matrix, k: int = 10) -> sparse.csr_matrix:
#     n_samples = tfidf_matrix.shape[0]
#     similarity_matrix = sparse.lil_matrix((n_samples, n_samples))
#
#     for i in range(n_samples):
#         scores = cosine_similarity(tfidf_matrix[i], tfidf_matrix).flatten()
#
#         top_k_indices = np.argpartition(scores, -(k+1))[-(k+1):]
#         top_k_indices = top_k_indices[top_k_indices != i][:k]
#
#         for j in top_k_indices:
#             similarity_matrix[i, j] = scores[j]
#
#     return similarity_matrix.tocsr()

In [6]:
def combine_top_k_similarity_matrix(tfidf_matrix: sparse.csr_matrix, k: int = 10, batch_size: int = 1000) -> sparse.csr_matrix:
    """Build a sparse matrix holding each row's ``k`` most cosine-similar rows.

    Entry ``(i, j)`` of the result is the cosine similarity between rows ``i``
    and ``j`` of ``tfidf_matrix`` when ``j`` is among the ``k`` rows most
    similar to ``i`` (``i`` itself excluded) and the similarity is non-zero.
    All other entries are left unset. The result is not symmetric in general.

    Rows are processed in batches; each batch materialises a dense
    ``batch_size x n_samples`` float64 block, so ``batch_size`` is the knob
    that bounds peak memory.

    Args:
        tfidf_matrix: Feature matrix with one sample per row. It is copied and
            never modified.
        k: Maximum number of neighbours kept per row. Values larger than
            ``n_samples - 1`` are clamped instead of raising.
        batch_size: Number of rows scored per batch.

    Returns:
        An ``n_samples x n_samples`` CSR matrix of float64 similarities.

    Raises:
        ValueError: If ``k`` is negative or ``batch_size`` is not positive.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Private float64 CSR copy: the in-place scaling below must not touch the
    # caller's matrix.
    normalized = sparse.csr_matrix(tfidf_matrix, dtype=np.float64, copy=True)
    n_samples = normalized.shape[0]

    # A row has at most n_samples - 1 neighbours once itself is excluded.
    neighbours = min(k, n_samples - 1)
    if neighbours <= 0:
        return sparse.csr_matrix((n_samples, n_samples), dtype=np.float64)

    # L2-normalise every row once, so each batch only needs a dot product
    # instead of re-normalising the full matrix per batch.
    squared_norms = np.asarray(normalized.multiply(normalized).sum(axis=1)).ravel()
    row_norms = np.sqrt(squared_norms)
    row_norms[row_norms == 0.0] = 1.0  # all-zero rows stay all-zero
    normalized.data /= np.repeat(row_norms, np.diff(normalized.indptr))
    normalized_t = normalized.T.tocsr()

    row_parts, col_parts, val_parts = [], [], []

    for start_idx in range(0, n_samples, batch_size):
        end_idx = min(start_idx + batch_size, n_samples)
        scores = normalized[start_idx:end_idx].dot(normalized_t).toarray()

        # Mask each row's similarity with itself *before* selecting, so the
        # top-k never has to be repaired afterwards.
        local_rows = np.arange(end_idx - start_idx)
        scores[local_rows, local_rows + start_idx] = -np.inf

        # argpartition leaves the `neighbours` largest scores of every row in
        # the last `neighbours` columns (in no particular order).
        top_cols = np.argpartition(scores, -neighbours, axis=1)[:, -neighbours:]
        top_vals = scores[local_rows[:, None], top_cols]

        # Zero similarities carry no information; keep the matrix free of
        # explicitly stored zeros.
        keep = top_vals != 0
        global_rows = np.broadcast_to((local_rows + start_idx)[:, None], top_cols.shape)
        row_parts.append(global_rows[keep])
        col_parts.append(top_cols[keep])
        val_parts.append(top_vals[keep])

        print(f"Обработано {end_idx}/{n_samples}")

    rows = np.concatenate(row_parts)
    cols = np.concatenate(col_parts)
    vals = np.concatenate(val_parts)
    return sparse.csr_matrix((vals, (rows, cols)), shape=(n_samples, n_samples))

In [7]:
def save_similarity_matrix(similarity_matrix: "np.ndarray | sparse.spmatrix", path: str) -> None:
    """Pickle ``similarity_matrix`` to ``path``.

    The parent directory of ``path`` is created when it does not exist yet, so
    a missing output folder cannot make the save fail after the matrix has
    already been computed. An existing file at ``path`` is overwritten.

    Args:
        similarity_matrix: Object to serialise; the notebook passes the sparse
            matrix returned by ``combine_top_k_similarity_matrix``.
        path: Destination file path.

    Note:
        The file is a pickle. Only load it back from locations you trust,
        because unpickling can execute arbitrary code.
    """
    parent_dir = os.path.dirname(path)
    if parent_dir:  # empty for a bare file name in the working directory
        os.makedirs(parent_dir, exist_ok=True)

    with open(path, "wb") as f:
        pickle.dump(similarity_matrix, f)

In [8]:
if __name__ == "__main__":
    # Pipeline driver: load the data -> build text documents -> TF-IDF ->
    # top-k similarity matrix -> pickle to disk.

    # Only the first 300000 rows are used. `nrows` stops parsing at that point
    # instead of reading the whole CSV and discarding the remainder.
    df = pd.read_csv("../data/TMDB_movie_dataset_v11.csv", nrows=300000)

    # clean_metadata returns the combined text column; it is assigned
    # explicitly so the next step can rely on df["combined_metadata"].
    metadata_series = clean_metadata(df)
    df["combined_metadata"] = metadata_series
    print("Создание metadata.")
    print(df["combined_metadata"].head())

    print("Векторизация текста.")
    tfidf_matrix, tfidf_vectorizer = vectorize_metadata(df, column="combined_metadata")

    # This message is printed before the (long) similarity computation starts;
    # the per-batch progress lines from combine_top_k_similarity_matrix follow it.
    print("Сохранение матрицы сходства.")
    matrix = combine_top_k_similarity_matrix(tfidf_matrix, k=10)
    save_similarity_matrix(matrix, "../models/similarity_matrix.pkl")
    print("Матрица сходства сохранена.")

Создание metadata.
0    Action Science Fiction Adventure rescue missio...
1    Adventure Drama Science Fiction rescue future ...
2    Drama Action Crime Thriller joker sadism chaos...
3    Action Adventure Fantasy Science Fiction futur...
4    Science Fiction Action Adventure new york city...
Name: combined_metadata, dtype: object
Векторизация текста.
Сохранение матрицы сходства.
Обработано 1000/300000
Обработано 2000/300000
Обработано 3000/300000
Обработано 4000/300000
Обработано 5000/300000
Обработано 6000/300000
Обработано 7000/300000
Обработано 8000/300000
Обработано 9000/300000
Обработано 10000/300000
Обработано 11000/300000
Обработано 12000/300000
Обработано 13000/300000
Обработано 14000/300000
Обработано 15000/300000
Обработано 16000/300000
Обработано 17000/300000
Обработано 18000/300000
Обработано 19000/300000
Обработано 20000/300000
Обработано 21000/300000
Обработано 22000/300000
Обработано 23000/300000
Обработано 24000/300000
Обработано 25000/300000
Обработано 26000/300000
Об