In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import pickle
from scipy import sparse
from google.colab import drive
from tqdm import tqdm

In [2]:
def clean_metadata(df: pd.DataFrame) -> pd.Series:
    """Build one free-text metadata string per row from genres, keywords and overview.

    The frame is modified in place: the columns ``genres_str``,
    ``keywords_str`` and ``combined_metadata`` are added (or overwritten).

    Args:
        df: Frame with ``genres``, ``keywords`` and ``overview`` columns.
            ``genres`` and ``keywords`` are read as comma-separated strings;
            any non-string value (e.g. NaN/None) is treated as empty.

    Returns:
        The ``combined_metadata`` column: ``"<genres> <keywords> <overview>"``.
    """

    def extract_names(field):
        # Turn "a, b ,c" into "a b c"; anything that is not a string becomes "".
        if isinstance(field, str):
            return " ".join(name.strip() for name in field.split(','))
        return ""

    df["genres_str"] = df["genres"].apply(extract_names)
    df["keywords_str"] = df["keywords"].apply(extract_names)

    # Fill missing overviews BEFORE concatenating: string + NaN evaluates to
    # NaN, so filling afterwards would wipe the genres and keywords of every
    # row whose overview is missing.
    overview = df["overview"].fillna("")

    df["combined_metadata"] = (
        df["genres_str"] + " " +
        df["keywords_str"] + " " +
        overview
    )
    return df["combined_metadata"]

In [3]:
def vectorize_metadata(df: pd.DataFrame, column: str = "combined_metadata") -> tuple:
    """Fit a TF-IDF model on one text column and vectorize that column.

    Args:
        df: Frame holding the text to vectorize.
        column: Name of the text column; defaults to ``"combined_metadata"``.

    Returns:
        A ``(matrix, vectorizer)`` pair: the TF-IDF matrix produced by
        ``fit_transform`` and the fitted ``TfidfVectorizer`` itself.
    """
    # English stop words are removed and the vocabulary is capped at 3000 terms.
    vectorizer = TfidfVectorizer(max_features=3000, stop_words="english")
    documents = df[column]
    matrix = vectorizer.fit_transform(documents)
    return matrix, vectorizer

In [4]:
def build_knn_similarity_with_progress(tfidf_matrix, k=10, batch_size=1000) -> sparse.csr_matrix:
    """Build a sparse (n, n) matrix of each row's nearest-neighbour cosine similarities.

    A brute-force cosine ``NearestNeighbors`` model is fitted on
    ``tfidf_matrix`` and queried in batches (with a tqdm progress bar).
    Entry ``(i, j)`` of the result is ``1 - cosine_distance(i, j)`` for the
    neighbours ``j`` kept for row ``i``; a row never lists itself.

    Args:
        tfidf_matrix: Feature matrix with one row per item.
        k: Number of neighbours to keep per row. When the matrix has
            ``k`` rows or fewer, every row keeps all ``n - 1`` other rows.
        batch_size: Number of rows queried per ``kneighbors`` call.

    Returns:
        CSR matrix of shape ``(n, n)``; it is empty when ``k == 0`` or when
        there are fewer than two rows.

    Raises:
        ValueError: If ``k`` is negative or ``batch_size`` is less than 1.
    """
    if k < 0:
        raise ValueError("k must be non-negative")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    n = tfidf_matrix.shape[0]
    # Ask for one extra neighbour (the row itself), but never for more
    # neighbours than there are fitted rows: kneighbors rejects that.
    n_neighbors = min(k + 1, n)
    if n_neighbors < 2:
        # Nothing to search for: no rows, a single row, or k == 0.
        return sparse.csr_matrix((n, n))
    per_row = n_neighbors - 1

    model = NearestNeighbors(n_neighbors=n_neighbors, metric="cosine", algorithm="brute", n_jobs=-1)
    model.fit(tfidf_matrix)

    row_parts, col_parts, sim_parts = [], [], []

    for start in tqdm(range(0, n, batch_size), total=(n + batch_size - 1) // batch_size, desc="Поиск соседей"):
        end = min(start + batch_size, n)
        batch_distances, batch_indices = model.kneighbors(tfidf_matrix[start:end])

        row_ids = np.arange(start, end)
        # Drop the self-match by index rather than by position: with duplicate
        # or all-zero rows the query row is not guaranteed to come back first.
        drop = batch_indices == row_ids[:, None]
        # If a row did not get itself back at all (ties), drop its farthest
        # neighbour instead so that every row keeps exactly `per_row` entries.
        drop[~drop.any(axis=1), -1] = True
        keep = ~drop

        # Boolean masking flattens row by row, which lines up with np.repeat.
        row_parts.append(np.repeat(row_ids, per_row))
        col_parts.append(batch_indices[keep])
        sim_parts.append(1 - batch_distances[keep])

    rows = np.concatenate(row_parts)
    cols = np.concatenate(col_parts)
    data = np.concatenate(sim_parts)
    return sparse.csr_matrix((data, (rows, cols)), shape=(n, n))

In [5]:
def save_similarity_matrix(similarity_matrix: np.ndarray, path: str) -> None:
    """Pickle ``similarity_matrix`` to the file at ``path``.

    The file is opened in binary write mode, so existing content is replaced.
    The object is pickled as-is; note that the driver in this notebook passes
    the scipy sparse matrix returned by the KNN step, not a dense
    ``np.ndarray`` as the annotation suggests.

    Args:
        similarity_matrix: Object to serialize.
        path: Destination file path.
    """
    with open(path, mode="wb") as out_file:
        pickler = pickle.Pickler(out_file)
        pickler.dump(similarity_matrix)
In [6]:
# Driver: load the TMDB dump from Google Drive, build the combined metadata
# text, vectorize it with TF-IDF, compute a k-nearest-neighbour cosine
# similarity matrix and pickle it back to Drive.
if __name__ == "__main__":
    # Colab-only step: mounts Google Drive at /content/drive.
    drive.mount('/content/drive')
    df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/TMDB_movie_dataset_v11.csv")

    # Keep only the 125000 rows with the highest "popularity" and renumber
    # them 0..n-1 so positional row numbers match the frame's index.
    df = df.sort_values("popularity", ascending=False).head(125000).reset_index(drop=True)

    # clean_metadata already writes df["combined_metadata"] in place, so the
    # assignment below is redundant but harmless.
    metadata_series = clean_metadata(df)
    df["combined_metadata"] = metadata_series
    # Message: "Creating metadata."
    print("Создание metadata.")
    print(df["combined_metadata"].head())

    # Message: "Vectorizing text."
    print("Векторизация текста.")
    tfidf_matrix, tfidf_vectorizer = vectorize_metadata(df, column="combined_metadata")

    # Message: "Saving similarity matrix." NOTE(review): this is printed
    # before the matrix is built, not right before it is saved.
    print("Сохранение матрицы сходства.")

    # Top-10 neighbours per row, using the function's default batch size.
    matrix = build_knn_similarity_with_progress(tfidf_matrix, k=10)
    save_similarity_matrix(matrix, "/content/drive/My Drive/Colab Notebooks/similarity_matrix.pkl")
    # Message: "Similarity matrix saved."
    print("Матрица сходства сохранена.")