# Brief Outline: Exploiting Movie-Movie Similarity

## Movie Information

### Vectorize Movie Information
The movie information has a lot of text and numerical information.

### Aggregate

### Approximate Nearest Neighbour with Annoy Index

## Use the movies a user has seen to predict most similar movies

In [12]:
# !pip install --upgrade torch

In [21]:
# !pip install sentence-transformers -qq

In [31]:
import pickle
import random
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd
import torch
from annoy import AnnoyIndex
from sentence_transformers import SentenceTransformer
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer, BertModel, BertTokenizer

tqdm.pandas()

In [14]:
def read(ds: str, data_dir=Path("../data/ext/od-challenge")):
    """Load the pickled dataset ``<data_dir>/<ds>.pickle`` and return the object.

    Args:
        ds: dataset name, i.e. the file stem of the pickle to load.
        data_dir: directory that contains the pickle files.

    Returns:
        Whatever object was stored in the pickle.

    NOTE: unpickling can execute arbitrary code, so only use this on
    trusted files.
    """
    pickle_path = data_dir / f"{ds}.pickle"
    return pickle.loads(pickle_path.read_bytes())


# Challenge datasets, each unpickled from the default `read` directory.
aggs, teams, movies, labels = (
    read(ds=name) for name in ("aggs", "teams", "movies", "labels")
)

# Train/test CSVs from the intermediate data directory.
data_dir = Path("../data/intermediate/")
train = pd.read_csv(data_dir / "train.csv")
test = pd.read_csv(data_dir / "test.csv")

In [28]:
# def write(df, ds, data_dir=Path("../data/intermediate")):
#     df.to_csv(data_dir / f"{ds}.csv", index=False)


# write(aggs, ds="aggs")
# write(teams, ds="teams")
# write(movies, ds="movies")
# write(labels, ds="labels")

In [17]:
# `df` is another name for `movies` (no copy is made), so columns added to
# `df` later in the notebook are visible through `movies` as well.
df = movies
df  # notebook display of the frame

Unnamed: 0,movie_id,title,genres,year,synopsis
0,114709,Toy Story,"{Comedy, Animation, Children, Fantasy, Adventure}",1995,A boy called Andy Davis (voice: John Morris) u...
1,113497,Jumanji,"{Children, Fantasy, Adventure}",1995,The film begins in 1869 in the town of Brantfo...
2,113277,Heat,"{Action, Crime, Thriller}",1995,An inbound Los Angeles Blue Line train pulls i...
3,114319,Sabrina,"{Comedy, Romance}",1995,"Sabrina Fairchild (Julia Ormond), is the Larra..."
4,112302,Tom and Huck,"{Children, Adventure}",1995,The film opens with Injun Joe (Eric Schweig) a...
...,...,...,...,...,...
4102,3606756,Incredibles 2,"{Children, Animation, Action, Adventure}",2018,Agent Rick Dicker (Jonathan Banks) is intervie...
4103,5463162,Deadpool 2,"{Comedy, Action, Sci-Fi}",2018,After successfully working as the mercenary De...
4104,3778644,Solo: A Star Wars Story,"{Children, Action, Sci-Fi, Adventure}",2018,"In this second 'Star Wars' stand-alone, spin-o..."
4105,5095030,Ant-Man and the Wasp,"{Comedy, Action, Sci-Fi, Fantasy, Adventure}",2018,The film opens in 1987 as Hank Pym (Michael Do...


In [18]:
# Build one short natural-language sentence per movie from its title, release
# year and genres; these sentences are what gets embedded further down.
description = []
for _, info in df.iterrows():
    # Sort the genres so the sentence is reproducible: when `genres` is a set
    # of strings its iteration order can change between interpreter runs.
    genres = ", ".join(sorted(info["genres"]))
    text = f"The movie **{info['title']}** was released in {info['year']}. It was mainly known for the following genres: {genres}"
    description.append(text)

# Assign the list positionally. Wrapping it in a default-indexed Series would
# align on index labels and produce NaN for any frame whose index is not
# exactly 0..n-1.
df["description"] = description

In [22]:
# Sentence-embedding model used to encode the movie descriptions
# (the index built from its output assumes 768-dimensional vectors — TODO confirm).
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

In [47]:
def get_description_index():
    """Return the Annoy index over the movie-description embeddings.

    If ``../models/vector_index/description.ann`` exists it is loaded from
    disk. Otherwise every entry of ``df.description`` is encoded with
    ``model``, added to a new angular-distance index under its row position,
    and the index is built, saved to that path and returned.

    Returns:
        AnnoyIndex: item ``i`` holds the embedding of the ``i``-th row of ``df``.
    """
    vector_dir = Path("../models/vector_index/")
    fname = "description.ann"
    index_path = vector_dir / fname
    sz = 768  # Length of item vector that will be indexed
    if index_path.exists():
        u = AnnoyIndex(sz, "angular")
        u.load(str(index_path))  # super fast, will just mmap the file
        return u

    # takes about 10 minutes to run locally on a slow CPU
    embeddings = df.description.progress_apply(model.encode)

    # Use `sz` here: the original passed an undefined name, so this branch
    # raised NameError whenever the cached index file was missing.
    t = AnnoyIndex(sz, "angular")
    for i, vector in enumerate(embeddings):
        t.add_item(i, vector)
    # Keep at least one tree even when there are fewer than 10 items.
    t.build(max(1, len(embeddings) // 10))
    # Make sure the target directory exists before saving into it.
    vector_dir.mkdir(parents=True, exist_ok=True)
    t.save(str(index_path))
    return t


# Module-level handle to the description index; `similar_movies` queries it.
t = get_description_index()

# Demo: Movie-Movie Recommendation

In [179]:
random.seed(37)  # fixed seed so the demo picks the same movie on every run
# NOTE: despite its name, `idx` holds a movie_id (it is drawn from
# movies.movie_id), not a row index.
idx = random.choice(movies.movie_id.tolist())


def similar_movies(movie_id, n_neighbours=3):
    """Return the movies whose descriptions are closest to that of ``movie_id``.

    The Annoy index ``t`` stores items by row position, so the query and the
    result lookup both use positions (``iloc``) instead of index labels.

    Args:
        movie_id: value from ``movies.movie_id`` identifying the query movie.
        n_neighbours: how many similar movies to return. The default of 3
            matches what the previously hard-coded query produced.

    Returns:
        tuple: ``(description, neighbours, distances)`` where ``description``
        is the query movie's description, ``neighbours`` is a copy of the
        matching rows of ``movies`` (nearest first) and ``distances`` is the
        list of index distances for those rows.

    Raises:
        IndexError: if ``movie_id`` does not occur in ``movies``.
    """
    matches = np.flatnonzero((movies.movie_id == movie_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie_id {movie_id!r} not found in movies")
    pos = int(matches[0])

    # Ask for one extra hit because the query item is normally among the
    # results, then drop it explicitly rather than assuming it comes first.
    indices, distances = t.get_nns_by_item(
        pos, n_neighbours + 1, include_distances=True
    )
    hits = [(i, d) for i, d in zip(indices, distances) if i != pos][:n_neighbours]
    neighbour_positions = [i for i, _ in hits]
    neighbour_distances = [d for _, d in hits]

    # Return a copy so callers can add columns without touching `movies`
    # or triggering pandas chained-assignment warnings.
    return (
        movies.iloc[pos]["description"],
        movies.iloc[neighbour_positions].copy(),
        neighbour_distances,
    )


# `idx` is the randomly chosen movie_id. The function defined in this cell is
# `similar_movies`; the previous call used a name that is not defined here.
input_movie, recommended_movies, distances = similar_movies(idx)
print(recommended_movies.movie_id, distances)
# print(input_movie, "\n---------\n", recommended_movies)

751     99253
752    103956
830     87050
Name: movie_id, dtype: int64 [0.3303639888763428, 0.4755329191684723, 0.6874837875366211]


Next, we use the movie-movie recommendation to recommend movies to our user - as per the original statement. 
In order to do this, we find movies similar to the movies the user has already seen and recommend those. 

The assumption is that all users have liked (rating >= 3.5) at least 2 movies, 1 each across the train and test splits. 

**Note on Hit Ratio**:
> A hit is counted if any of the movies we recommend is among the movies the user rated at or above the rating threshold in test.

In [134]:
# Sanity check: user_ids that occur in test but have no rows in train.
# Recommendations are built from each user's train rows.
set(test.user_id.unique()) - set(train.user_id.unique())

set()

In [195]:
def user_hits(predicted_movies: List[int], seen_movies: List[int]):
    """Tell whether at least one predicted movie also occurs in ``seen_movies``.

    Args:
        predicted_movies: movie ids that were recommended.
        seen_movies: movie ids to check the recommendations against.

    Returns:
        bool: True when the two collections share at least one id.
    """
    overlap = set(predicted_movies).intersection(seen_movies)
    return bool(overlap)


def get_seen_movies(test, user_id, threshold=3.0):
    """Return the movie ids that ``user_id`` rated at or above ``threshold``.

    Args:
        test: ratings frame with ``user_id``, ``movie_id`` and ``rating`` columns.
        user_id: the user whose ratings are inspected.
        threshold: minimum rating (inclusive) for a movie to be returned.

    Returns:
        list: ``movie_id`` values in row order; empty when nothing qualifies.
    """
    # Vectorized selection instead of iterrows(): iterrows builds one Series
    # per row, which upcasts integer movie ids to float on an all-numeric frame.
    mask = (test["user_id"] == user_id) & (test["rating"] >= threshold)
    return test.loc[mask, "movie_id"].tolist()


def recommend_movies(user_id, train, threshold=3.0, k=10):
    """Recommend up to ``k`` unseen movies similar to those in the user's history.

    For every movie the user has in ``train``, the nearest movies are fetched
    with ``similar_movies`` and scored as ``1 - distance``. Movies the user
    already has in ``train`` are dropped, and the remaining candidates are
    ranked by that score, highest first.

    Args:
        user_id: the user to recommend for.
        train: ratings frame with ``user_id`` and ``movie_id`` columns.
        threshold: accepted for interface compatibility; the rating filter it
            was meant for is not applied, so every movie in the user's train
            history is used as a query.
        k: maximum number of movie ids to return.

    Returns:
        A numpy array with at most ``k`` distinct movie ids in ranked order,
        or an empty list when the user has no rows in ``train``.
    """
    user_history = train[train.user_id == user_id]

    frames = []
    for movie_id in user_history.movie_id:
        _, movie_df, distances = similar_movies(movie_id)
        # assign() returns a new frame, so the frame handed back by
        # similar_movies is never mutated.
        frames.append(movie_df.assign(similarity=[1 - d for d in distances]))

    # Nothing to concatenate: the user has no history in train.
    if not frames:
        return []
    candidates = pd.concat(frames)

    # Keep only movies that are not already in the user's history.
    unseen = ~candidates.movie_id.isin(user_history.movie_id.unique())
    ranked = candidates[unseen].sort_values(by="similarity", ascending=False)
    return ranked.movie_id.unique()[:k]


def calc_hit_rate(split, train, k=10, threshold=3.0):
    """Return the fraction of users in ``split`` with at least one hit.

    A user counts as a hit when any movie recommended from their ``train``
    history is among the movies they rated at or above ``threshold`` in
    ``split``.

    Args:
        split: ratings frame to evaluate on, with ``user_id``, ``movie_id``
            and ``rating`` columns.
        train: ratings frame used to build the recommendations.
        k: number of recommendations per user.
        threshold: minimum rating for a movie in ``split`` to count as a hit.

    Returns:
        float: hits divided by the number of distinct users in ``split``;
        ``0.0`` when ``split`` contains no users.
    """
    user_ids = split.user_id.unique()
    # Avoid dividing by zero on an empty split.
    if len(user_ids) == 0:
        return 0.0

    hits = 0
    for user_id in tqdm(user_ids):
        recommended = recommend_movies(
            user_id=user_id, train=train, threshold=threshold, k=k
        )
        liked = get_seen_movies(split, user_id, threshold=threshold)
        hits += user_hits(predicted_movies=recommended, seen_movies=liked)

    return hits / len(user_ids)


# Hit rate on the test split, with recommendations built from the train split.
calc_hit_rate(test, train)

100%|████████████████████████████████████████████████████████████████████████████████| 608/608 [01:33<00:00,  6.50it/s]


0.31743421052631576

## Synopsis Embedding Illustration

In [49]:
# class SynopsisProc:
#     def __init__(self):
#         self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
#         self.model = BertModel.from_pretrained("bert-base-uncased")

#     def embed(self, paragraphs):
#         encoded_input = self.tokenizer(
#             paragraphs, padding=True, truncation=True, return_tensors="pt"
#         )
#         output = self.model(**encoded_input)
#         embedding = output.pooler_output
#         return embedding

In [78]:
# sp = SynopsisProc()

In [52]:
# synopsis_embedding = df['synopsis'].progress_apply(lambda x: sp.embed(x))