# Simple Content-Based Recommendation System

In [1]:
# Imports
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import display, HTML

In [2]:
# Combine the movie plots with the user's query to create a corpus.
def form_corpus(movies: pd.DataFrame, query: str) -> list:
    """
    Combines the plots of already-loaded movie data with the user query to
    form a single corpus for data preprocessing.

    Args:
        `movies`: A DataFrame containing movie data. It must have a "Plot"
        column; the DataFrame itself is not modified.
        `query`: The user's query.

    Returns:
        A new list containing every movie plot (in row order) followed by the
        user's query as the last element. Missing plots are represented by
        empty strings.

    Raises:
        KeyError: If `movies` has no "Plot" column.
    """
    # Missing plots are replaced with "" instead of being dropped: the corpus
    # position of each plot must keep matching its row position in `movies`,
    # and a NaN entry is not a valid document for a text vectorizer.
    corpus = movies["Plot"].fillna("").astype(str).to_list()

    # The query always goes last so callers can address it as corpus[-1]
    corpus.append(query)
    return corpus

In [3]:
# Vectorize the corpus into TF-IDF vectors and get cosine similarity between
# each movie plot and the user's query.
def calculate_cos_similarities(corpus: list):
    """
    Scores every movie plot against the user's query using TF-IDF vectors and
    cosine similarity.

    Args:
        `corpus`: A list of movie plots with the user's query as its last
        element.

    Returns:
        A flattened NumPy array with one cosine similarity per movie plot, in
        the same order as the plots appear in `corpus`.
    """
    # Fit TF-IDF on the plots and the query together
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(corpus)

    # The last row is the query; every row before it is a movie plot
    plot_vectors = vectors[:-1]
    query_vector = vectors[-1]

    # Compare each plot against the query and collapse the result to 1-D
    scores = cosine_similarity(plot_vectors, query_vector)
    return scores.flatten()

In [4]:
# Get the top movie recommendations for the user according to their query.
def find_recommendations(query: str, n: int = 5,
                         movies_path: str = "movie_plots.csv"):
    """
    Finds the top `n` movie recommendations based on the contents of `query`.

    Args:
        `query`: A string containing a description of the user's preferences.
        `n`: The number of most similar movies to find. Must be non-negative;
        if it exceeds the number of movies, all movies are returned.
        `movies_path`: Path to the .csv file of movie data. The file must
        provide "Title" and "Plot" columns.

    Returns:
        A DataFrame with "Title" and "Plot" columns containing the top `n`
        movie recommendations, most similar first, with a fresh 0-based index.

    Raises:
        ValueError: If `n` is negative.
    """
    # Reject a negative count up front: used as a slice bound it would
    # silently discard the best matches instead of limiting the result.
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    # Load in the movies dataset
    movies = pd.read_csv(movies_path)

    # Form the corpus and get cosine similarities
    corpus = form_corpus(movies, query)
    cos_sims = calculate_cos_similarities(corpus)

    # Get the top n matches. Reverse first and then take a prefix: slicing the
    # tail with [-n:] turns into [-0:] for n == 0, which selects every movie.
    top_idxs = cos_sims.argsort()[::-1][:n]
    top_movies = movies.iloc[top_idxs][["Title", "Plot"]]

    # Return the recommendations
    return top_movies.reset_index(drop=True)

In [5]:
# Display the movie recommendations neatly.
def display_movies(movies):
    """
    Neatly displays a DataFrame of movie recommendations using HTML.

    Titles and plots are HTML-escaped before being inserted into the markup,
    so characters such as `<`, `>` and `&` in the data are shown literally
    instead of being interpreted as HTML.

    Args:
        `movies`: A DataFrame of recommended movies with "Title" and "Plot"
        columns.

    Returns:
        None; displays each movie in `movies`, neatly displaying each
        movie's title and description.
    """
    # Imported locally so this cell only relies on the notebook's existing
    # top-level imports (`display` and `HTML`).
    from html import escape

    html_content = """<style>
    .movie-container {
        font-family: Arial, sans-serif;
        margin-bottom: 20px;
        padding: 10px;
        border: 1px solid #ddd;
        border-radius: 5px;
        background-color: #f9f9f9;
    }
    .movie-title {
        font-size: 18px;
        font-weight: bold;
        color: #333;
    }
    .movie-plot {
        font-size: 14px;
        color: #666;
        margin-top: 5px;
    }
    </style>"""

    html_content += "<h1 style='color:#dae2f0'>Top Movie Recommendations</h1>"

    # Build one block per movie. str() mirrors the f-string's implicit
    # conversion for non-string cells; escape() keeps the movie data from
    # being parsed as markup.
    cards = [
        f"""
        <div class='movie-container'>
            <div class='movie-title'>{escape(str(title))}</div>
            <div class='movie-plot'>{escape(str(plot))}</div>
        </div>
        """
        for title, plot in zip(movies["Title"], movies["Plot"])
    ]
    html_content += "".join(cards)

    display(HTML(html_content))

### Enter your query in the cell below, along with the number of recommendations you would like.

In [6]:
# Enter your query in the string below: a free-text description of the kind
# of movie you want. It is passed to `find_recommendations` in the next cell.
query = "I love romantic comedies with lots of drama!"

# Enter the number of recommendations you would like. It is passed to
# `find_recommendations` as `n`, the number of movies to return.
matches = 3

In [7]:
# Look up the top `matches` movies for `query`. The bare `result` on the last
# line makes the notebook show the returned DataFrame as the cell's output.
result = find_recommendations(query, matches)
result

Unnamed: 0,Title,Plot
0,Oh Doctor!,"As described in Exhibitors Herald, a film maga..."
1,Caught in a Cabaret,Chaplin plays a waiter who fakes being a Greek...
2,Romance,"As described in a film publication,[2] a youth..."


In [8]:
# Render the recommendations from the previous cell as styled HTML.
display_movies(result)