In [1]:
import json
import re
from typing import Optional

import nltk
import numpy as np
import pandas as pd
import wikipedia
from nltk.stem.snowball import SnowballStemmer
from tqdm.notebook import tqdm

tqdm.pandas()

In [2]:
# Directory holding the raw parquet inputs. File names are appended to it by plain
# string concatenation, so the trailing slash is required.
DATA_PATH = "../data_raw/"

In [3]:
def _read_raw(file_name: str) -> pd.DataFrame:
    """Read one parquet file located under ``DATA_PATH``."""
    return pd.read_parquet(DATA_PATH + file_name)


# The rating/tag timestamps are dropped right after loading.
movies_raw: pd.DataFrame = _read_raw("movies.pq")  # movie names and genres
ratings_raw: pd.DataFrame = _read_raw("ratings.pq").drop(columns="timestamp")  # users' ratings
tags_raw: pd.DataFrame = _read_raw("tags.pq").drop(columns="timestamp")  # users' tags for movies
links_raw: pd.DataFrame = _read_raw("links.pq")  # ids from different datasets
tmdb_movies_raw: pd.DataFrame = _read_raw("tmdb_5000_movies.pq")  # tmdb movies data
tmdb_credits_raw: pd.DataFrame = _read_raw("tmdb_5000_credits.pq")  # tmdb movies' creators data
genome_data: pd.DataFrame = _read_raw("genome.pq")  # movies' tags and relevance of those

### TMDB preprocessing

In [None]:
# Keep only the TMDB columns used further down. The explicit `.copy()` matters: assigning
# into a bare column-subset of `tmdb_movies_raw` raises SettingWithCopyWarning and, depending
# on the pandas version, may not update the frame we think it does.
tmdb_movies: pd.DataFrame = tmdb_movies_raw[
    [
        "id",
        "title",
        "genres",
        "release_date",
        "production_countries",
        "runtime",
        "revenue",
        "popularity",
        "vote_average",
        "vote_count",
    ]
].copy()

# `genres` and `production_countries` are JSON arrays of objects; the object hook replaces
# every object with its "name", so each cell becomes a plain list of names.
tmdb_movies["genres"] = tmdb_movies["genres"].map(
    lambda raw: json.loads(raw, object_hook=lambda obj: obj["name"])
)
tmdb_movies["production_countries"] = tmdb_movies["production_countries"].map(
    lambda raw: json.loads(raw, object_hook=lambda obj: obj["name"])
)

# Keep only the part of the date before the first "-" (the year for "YYYY-MM-DD" values).
# Missing dates end up as the string form of the missing marker (e.g. "None").
tmdb_movies["release_date"] = tmdb_movies["release_date"].map(lambda date: str(date).split("-", maxsplit=1)[0])

tmdb_movies = tmdb_movies.rename(columns={"id": "movie_id", "release_date": "year"})

In [None]:
# Work on an explicit copy so that the column assignments below never write into a view of
# `tmdb_credits_raw` (the original code triggered SettingWithCopyWarning twice: once after
# the column subset and once after the boolean filter).
tmdb_credits: pd.DataFrame = tmdb_credits_raw[["movie_id", "cast", "crew"]].copy()

# `cast` is a JSON array of objects; keep the names of the first ten entries only.
tmdb_credits["cast"] = tmdb_credits["cast"].map(
    lambda raw: json.loads(raw, object_hook=lambda obj: obj["name"])[:10]
)

# From `crew`, keep just the names of the members whose job is "Director".
tmdb_credits["crew"] = tmdb_credits["crew"].map(
    lambda raw: [member["name"] for member in json.loads(raw) if member["job"] == "Director"]
)

# Movies without any director are dropped; the remaining column is renamed accordingly.
tmdb_credits = tmdb_credits[tmdb_credits["crew"].map(len) >= 1].rename(columns={"crew": "director"})

In [None]:
# Inner join: only movies that survived the director filter in `tmdb_credits` remain.
tmdb_data: pd.DataFrame = pd.merge(tmdb_movies, tmdb_credits, on="movie_id")

### Wiki data

In [None]:
# Link `movies_raw` to the TMDB ids through `links_raw`, then attach the TMDB movie columns.
# `title` and `genres` exist on both sides, so pandas suffixes them with `_x` / `_y`.
movies: pd.DataFrame = (
    movies_raw.merge(links_raw, on="movieId")[["movieId", "tmdbId", "title", "genres"]]
    .merge(tmdb_movies, left_on="tmdbId", right_on="movie_id")
)

In [9]:
# functions from https://towardsdatascience.com/movielens-1m-deep-dive-part-i-8acfeda1ad4


def get_wikipedia_page_name(movie_title: str) -> str:
    """Return the first Wikipedia search hit for ``movie_title``.

    Args:
        movie_title: Text passed to ``wikipedia.search``.

    Returns:
        The first title in the search results, or an empty string when the
        search returns nothing.
    """
    search_hits: list[str] = wikipedia.search(movie_title)
    return search_hits[0] if search_hits else ""


def get_movie_plot(page_name: str) -> Optional[str]:
    """Extract the "Plot" section of a Wikipedia page.

    If ``page_name`` is a disambiguation page, the options whose title contains
    "film" are tried in order and the first one that loads is used.

    Args:
        page_name: Exact Wikipedia page title (auto-suggest is disabled).

    Returns:
        The text between the "Plot ==" heading and the next heading that starts
        with a capital letter (newlines removed), or ``None`` when the page name
        is empty, the page cannot be loaded, no "film" option is usable, or the
        page has no such section.
    """
    # An empty name is what the title search yields when nothing matched;
    # there is nothing to fetch, so skip the network call.
    if not page_name:
        return None

    movie_page_content: Optional[str] = None
    try:
        movie_page_content = str(wikipedia.page(page_name, auto_suggest=False).content)
    except wikipedia.DisambiguationError as disamberror:
        for option in disamberror.options:
            if "film" not in option:
                continue
            try:
                movie_page_content = str(wikipedia.page(option, auto_suggest=False).content)
            except (wikipedia.DisambiguationError, wikipedia.PageError, KeyError):
                # This candidate cannot be loaded; try the next "film" option.
                continue
            # Stop at the first usable candidate instead of letting later ones overwrite it.
            break
    except (wikipedia.PageError, KeyError):
        return None

    if movie_page_content is None:
        return None

    # Newlines are stripped so that the lazy group can span the whole section.
    re_groups = re.search("Plot ==(.*?)=+ [A-Z]", movie_page_content.replace("\n", ""))
    if re_groups:
        return re_groups.group(1)
    return None

In [None]:
# Resolve a Wikipedia page name for every `title_x` (the title coming from `movies_raw`),
# then extract the plot section from that page. Both steps go through the `wikipedia`
# client once per row and show a tqdm progress bar.
movies["wikipedia_page_name"] = movies["title_x"].progress_apply(get_wikipedia_page_name)
movies["movie_plot"] = movies["wikipedia_page_name"].progress_apply(get_movie_plot)
# `get_movie_plot` returns None when no plot could be extracted; report how many rows that is.
print(f'There are {movies["movie_plot"].isna().sum()} NaN movie plots')

In [None]:
# Keep the right-hand (`tmdb_movies`) title as the single `title` column.
movies = movies.rename(columns={"title_y": "title"}).drop(columns=["title_x"])

In [None]:
# `genres_x` is a "|"-separated string, `genres_y` is already a list of names.
pipe_separated_genres = movies["genres_x"].map(lambda raw: raw.split("|"))
listed_genres = movies["genres_y"].map(list)

# Union of both sources without duplicates. `sorted` makes the order reproducible:
# iterating a set of strings depends on per-process hash randomisation, so a bare
# `list(set(...))` can yield a different order on every kernel restart.
movies["genres"] = (pipe_separated_genres + listed_genres).map(lambda genres: sorted(set(genres)))

# Rebind instead of `inplace=True` so the cell does not mutate a frame other names may hold.
movies = movies.drop(columns=["genres_x", "genres_y"])

In [None]:
def remove_no_genres(genres_list: list[str]) -> list[str]:
    """Drop the "(no genres listed)" placeholder when real genres are present.

    Args:
        genres_list: Genre names of one movie.

    Returns:
        A new list without the placeholder if the input contains it together
        with at least one other entry; otherwise the input list itself.
    """
    placeholder = "(no genres listed)"
    if len(genres_list) <= 1 or placeholder not in genres_list:
        return genres_list
    return list(filter(lambda name: name != placeholder, genres_list))


def remove_foreign(genres_list: list[str]) -> list[str]:
    """Remove the "Foreign" pseudo-genre from a movie's genres.

    Args:
        genres_list: Genre names of one movie.

    Returns:
        The input list itself when it has no "Foreign" entry;
        ``["(no genres listed)"]`` when the list has at most one entry and
        contains "Foreign"; otherwise a new list without "Foreign".
    """
    if "Foreign" not in genres_list:
        return genres_list
    if len(genres_list) <= 1:
        return ["(no genres listed)"]
    return [name for name in genres_list if name != "Foreign"]


# Order matters: the placeholder is removed first, so a movie left with only "Foreign"
# is then mapped back to the placeholder by `remove_foreign`.
movies.loc[:, "genres"] = movies["genres"].apply(lambda genres: remove_foreign(remove_no_genres(genres)))

In [None]:
def remove_duplicate_music(genres_list: list[str]) -> list[str]:
    """Fold the "Music" genre into "Musical".

    Args:
        genres_list: Genre names of one movie.

    Returns:
        The input list itself when it has no "Music" entry; otherwise a new
        list without "Music" that contains "Musical" (appended at the end if it
        was not already present).
    """
    if "Music" not in genres_list:
        return genres_list
    without_music = [name for name in genres_list if name != "Music"]
    if "Musical" not in genres_list:
        without_music.append("Musical")
    return without_music


def remove_duplicate_scifi(genres_list: list[str]) -> list[str]:
    """Fold the "Science Fiction" genre into "Sci-Fi".

    Args:
        genres_list: Genre names of one movie.

    Returns:
        The input list itself when it has no "Science Fiction" entry; otherwise
        a new list without "Science Fiction" that contains "Sci-Fi" (appended
        at the end if it was not already present).
    """
    if "Science Fiction" not in genres_list:
        return genres_list
    remaining = list(filter(lambda name: name != "Science Fiction", genres_list))
    return remaining if "Sci-Fi" in genres_list else remaining + ["Sci-Fi"]


# Merge the two spellings of each genre into one: "Music" into "Musical" and
# "Science Fiction" into "Sci-Fi".
movies.loc[:, "genres"] = movies["genres"].apply(
    lambda genres: remove_duplicate_scifi(remove_duplicate_music(genres))
)

In [None]:
# Attach `production_countries` from `tmdb_data`, matching on the TMDB id on both sides.
# Inner join: only movies present in `tmdb_data` (i.e. with at least one director) remain.
# NOTE(review): `movies` already carries `movie_id` and `production_countries` from the
# earlier merge with `tmdb_movies`, so pandas suffixes the overlapping names with `_x`/`_y`
# here — confirm that the duplicated `production_countries_x`/`_y` columns are intended.
production_country = tmdb_data[["movie_id", "production_countries"]]
movies = movies.merge(production_country, left_on="tmdbId", right_on="movie_id")

In [None]:
# The page name was only needed to fetch the plot; `movie_id_y` duplicates `movie_id_x`
# after the previous merge, so keep a single `movie_id` (the TMDB id).
movies = movies.drop(columns=["wikipedia_page_name", "movie_id_y"]).rename(columns={"movie_id_x": "movie_id"})

# Add cast and director. The join key must be the TMDB id on both sides: `movieId` is the
# id used by `movies_raw`/`links_raw`, which is a different id space from `tmdb_data.movie_id`,
# so joining those two would pair unrelated movies. Only the columns `movies` does not have
# yet are brought in, which also avoids a second round of `_x`/`_y` duplicates.
movies = movies.merge(tmdb_data[["movie_id", "cast", "director"]], on="movie_id")

### Сводим фильмы с тегами

In [None]:
# A (movie, tag) row from the genome data is kept only if its relevance is strictly
# greater than this value.
TAG_RELEVANCE_THRESHOLD = 0.3

In [None]:
# One row per (movie, tag) pair whose relevance exceeds the threshold.
# Note: this rebinds `movies`; the frame built in the cells above is no longer reachable
# under that name from here on.
movies = movies_raw.merge(genome_data, on="movieId").loc[
    lambda frame: frame["relevance"] > TAG_RELEVANCE_THRESHOLD
]

In [None]:
# Distinct tags that survived the relevance filter, in order of first appearance.
all_tags = pd.Series(movies["tag"].unique())

In [None]:
stemmer = SnowballStemmer("english")

# `SnowballStemmer.stem` works on a single word, while many tags consist of several words,
# so every word of a tag is stemmed separately and the stems are joined back with spaces.
# The stems are computed once per distinct tag and then mapped onto the rows, instead of
# re-stemming the same tag for every (movie, tag) row.
stem_by_tag = {tag: " ".join(stemmer.stem(word) for word in tag.split()) for tag in movies.tag.unique()}
stemmed_tags = movies.tag.map(stem_by_tag)
unique_stammed = stemmed_tags.unique()

In [None]:
# The tagger model has to be available locally before `nltk.pos_tag` can be called.
nltk.download("averaged_perceptron_tagger")
# Split each tag on whitespace and tag the resulting words with their part of speech.
tags_with_pos = all_tags.apply(lambda tag: nltk.pos_tag(tag.split()))

Получается много пересекающихся тегов — в том числе потому, что многие близкие по смыслу теги записаны в несколько слов, с опечатками и т.д. Нужно придумать, что с этим делать.  
Идея: посчитать частоту встречаемости отдельных слов (возможно, и до, и после стемминга).

In [None]:
# Every tag as a list of its whitespace-separated words ...
tags_list = [tag.split() for tag in all_tags]
# ... and all of those words flattened into one list (duplicates kept for counting).
words = [word for tag_words in tags_list for word in tag_words]

In [None]:
# Distinct words (sorted by numpy) together with the number of times each one occurs.
unique_words, counter_words = np.unique(words, return_counts=True)
word_counts = {word: count for word, count in zip(unique_words, counter_words)}

Возможно, время выставления оценок и тегов (столбец `timestamp`) тоже стоит учитывать: так мы сможем отслеживать изменение предпочтений пользователя. С другой стороны, для MVP это явно не поможет (это скорее улучшение для уже существующих пользователей). Поэтому пока этот столбец отбрасываем, а позже решим, как его использовать.