[Dataset Link](https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata?select=tmdb_5000_movies.csv)

In [71]:
import pandas as pd
df= pd.read_csv("../datasets/tmdb_5000_movies.csv")
df.head()
df.iloc[0]

budget                                                          237000000
genres                  [{"id": 28, "name": "Action"}, {"id": 12, "nam...
homepage                                      http://www.avatarmovie.com/
id                                                                  19995
keywords                [{"id": 1463, "name": "culture clash"}, {"id":...
original_language                                                      en
original_title                                                     Avatar
overview                In the 22nd century, a paraplegic Marine is di...
popularity                                                     150.437577
production_companies    [{"name": "Ingenious Film Partners", "id": 289...
production_countries    [{"iso_3166_1": "US", "name": "United States o...
release_date                                                   2009-12-10
revenue                                                        2787965087
runtime                               

In [72]:
import json

def concat_genre_keywords_overview(row:pd.Series):
    title = row["title"]
    genres =json.loads(row["genres"])
    keywords = json.loads(row["keywords"])
    overview = row["overview"]

    genres= " ".join("".join(genre["name"].split()) for genre in genres)
    keywords = " ".join("".join(keyword["name"].split()) for keyword in keywords)

    return "Genres of %s are %s. Some keywords for this movie are %s. A quick overview of this movie is %s" % (title, genres, keywords, overview)

In [73]:
df["movie_details"] = df.apply(concat_genre_keywords_overview, axis=1)
df.head()["movie_details"]

0    Genres of Avatar are Action Adventure Fantasy ...
1    Genres of Pirates of the Caribbean: At World's...
2    Genres of Spectre are Action Adventure Crime. ...
3    Genres of The Dark Knight Rises are Action Cri...
4    Genres of John Carter are Action Adventure Sci...
Name: movie_details, dtype: object

In [74]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(df["movie_details"])

In [75]:
movie2idx = pd.Series(df.index, index=df["title"])
test_movie_idx = movie2idx["Avatar"]
query_vec = tfidf_matrix[test_movie_idx]
query_vec.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]], shape=(1, 28285))

In [96]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_movies(query: str, n: int):
    query_vec = tfidf.transform([query])
    cosine_sim = cosine_similarity(query_vec, tfidf_matrix)
    scores = cosine_sim.flatten()

    top_n_movie_indices = (-scores).argsort()

    recommendations = {
        df.iloc[idx]["title"]: {
            "similarity_score": f"{int((float(scores[idx]) / 1) * 100)}%"
        }
        for idx in top_n_movie_indices 
        if scores[idx] > 0
    }

    return {
        "query": query,
        "recommendations": dict(list(recommendations.items())[:n])
    }


In [97]:
import pprint

search_query = input("Enter your favourite movie to get recommendations: ")
pprint.pprint(recommend_movies(search_query,10))

{'query': 'Space',
 'recommendations': {"Ender's Game": {'similarity_score': '26%'},
                     'Gravity': {'similarity_score': '27%'},
                     'Lifeforce': {'similarity_score': '24%'},
                     'Lost in Space': {'similarity_score': '37%'},
                     'Space Chimps': {'similarity_score': '39%'},
                     'Space Dogs': {'similarity_score': '32%'},
                     'Space Pirate Captain Harlock': {'similarity_score': '46%'},
                     'Treasure Planet': {'similarity_score': '29%'},
                     'You Only Live Twice': {'similarity_score': '29%'},
                     'Zathura: A Space Adventure': {'similarity_score': '31%'}}}
