In [1]:
# Dataset URL
# https://www.kaggle.com/datasets/parasharmanas/movie-recommendation-system/data

In [21]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import en_core_web_sm


# Load SpaCy NLP model (ensure it's installed)
nlp = en_core_web_sm.load()  # Use medium model for better word similarity

# Function to extract unique genres from dataset
def get_unique_genres(data):
    """
    Extracts unique genres from the dataset.
    :param data: DataFrame containing movie genres.
    :return: Set of unique genres.
    """
    unique_genres = set()
    for genre_list in data['genres'].dropna():
        unique_genres.update(genre_list.split('|'))
    return unique_genres

def extract_genres(user_input, unique_genres, top_n=3):
    """
    Extracts top N genres from user input using NLP (word similarity + lemmatization).
    :param user_input: Raw user text.
    :param unique_genres: Set of unique genres from dataset.
    :param top_n: Number of genres to extract.
    :return: List of top N extracted genres.
    """
    doc = nlp(user_input.lower())  # Process user input
    extracted_genres = []

    # Convert dataset genres into NLP tokens for comparison
    genre_tokens = {genre: nlp(genre.lower()) for genre in unique_genres}

    # Compute similarity scores for each word in the user input
    similarity_scores = []
    for token in doc:
        token_lemma = token.lemma_  # Get base form of the word

        # Check for exact matches first
        if token_lemma in unique_genres:
            similarity_scores.append((token_lemma, 1.0))
            continue

        # Compare token similarity to known genres
        for genre, nlp_genre in genre_tokens.items():
            if token.has_vector:
                similarity = token.similarity(nlp_genre)
                similarity_scores.append((genre, similarity))

    # Sort by similarity score in descending order and select top N unique genres
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    top_genres = []
    seen = set()

    for genre, score in similarity_scores:
        if genre not in seen:
            top_genres.append(genre)
            seen.add(genre)
        if len(top_genres) == top_n:
            break

    return top_genres if top_genres else ["Unknown"]

# Load datasets
def load_data(movie_file, rating_file):
    """
    Load movie and rating datasets.
    :param movie_file: Path to the CSV file containing movies and their genres.
    :param rating_file: Path to the CSV file containing user ratings.
    :return: Merged DataFrame with movie information, ratings, and unique genres.
    """
    movies = pd.read_csv(movie_file)
    ratings = pd.read_csv(rating_file)

    # Compute average rating per movie
    avg_ratings = ratings.groupby('movieId')['rating'].mean().reset_index()
    
    # Merge average ratings with movie data
    movie_data = movies.merge(avg_ratings, on='movieId', how='left').fillna({'rating': 0})
    
    # Extract unique genres from dataset
    unique_genres = get_unique_genres(movie_data)
    
    return movie_data, ratings, unique_genres

# Compute TF-IDF vectors
def compute_tfidf(data, text_column):
    """
    Compute TF-IDF vectors for a given text column.
    :param data: DataFrame containing movie genres.
    :param text_column: Column name to transform into TF-IDF vectors.
    :return: TF-IDF matrix and vectorizer.
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(data[text_column].fillna(""))
    return tfidf_matrix, vectorizer

# Content-Based Recommendation (Genres + Ratings)
def recommend_content_based(user_input, data, tfidf_matrix, vectorizer, unique_genres, top_n=5):
    """
    Recommend movies based on extracted genres and content similarity.
    :param user_input: User's input description.
    :param data: DataFrame containing movie information.
    :param tfidf_matrix: Precomputed TF-IDF matrix.
    :param vectorizer: TF-IDF vectorizer.
    :param unique_genres: Set of unique genres.
    :param top_n: Number of recommendations to return.
    :return: List of recommended movie titles with adjusted scores.
    """
    extracted_genres = extract_genres(user_input, unique_genres)
    genre_string = " ".join(extracted_genres)
    print(f"Extracted Genres: {genre_string}")

    user_tfidf = vectorizer.transform([genre_string])
    similarity_scores = cosine_similarity(user_tfidf, tfidf_matrix).flatten()
    
    # Incorporate ratings into the scoring
    adjusted_scores = similarity_scores * (data['rating'] / 5)  # Normalize ratings to [0,1]
    
    top_indices = adjusted_scores.argsort()[-top_n:][::-1]
    recommendations = [(data.iloc[i]['title'], adjusted_scores[i]) for i in top_indices]
    return recommendations

def recommend_hybrid(user_id, data, ratings, tfidf_matrix, vectorizer, top_n=5):
    """
    Hybrid recommendation combining content-based filtering and user ratings.
    :param user_id: ID of the user.
    :param data: DataFrame containing movie information.
    :param ratings: DataFrame containing user ratings.
    :param tfidf_matrix: Precomputed TF-IDF matrix.
    :param vectorizer: TF-IDF vectorizer.
    :param top_n: Number of recommendations to return.
    :return: List of recommended movie titles.
    """
    # Filter ratings for the specific user
    user_ratings = ratings[ratings['userId'] == user_id]

    # Merge user-rated movies with movie data
    user_movies = user_ratings.merge(data, on='movieId', how='left')

    if user_movies.empty:
        print("No ratings found for this user. Defaulting to content-based recommendations.")
        return recommend_content_based(
            input("Enter movie description preferences: "), data, tfidf_matrix, vectorizer, get_unique_genres(data), top_n
        )

    # Compute weighted similarity based on user's rated movies
    weighted_scores = {}
    for _, row in user_movies.iterrows():
        if 'rating' not in row or pd.isna(row['rating']):  # Handle missing ratings
            row['rating'] = 3.0  # Default to neutral rating if missing

        movie_tfidf = vectorizer.transform([row['combined_text']])
        similarity_scores = cosine_similarity(movie_tfidf, tfidf_matrix).flatten()

        weighted_scores[row['movieId']] = similarity_scores * row['rating']

    # Sum up similarity scores weighted by ratings
    total_scores = np.sum(list(weighted_scores.values()), axis=0)

    top_indices = total_scores.argsort()[-top_n:][::-1]
    recommendations = [(data.iloc[i]['title'], total_scores[i]) for i in top_indices]
    
    return recommendations

In [None]:
# Interactive execution in Jupyter Notebook
if __name__ == "__main__":
    movie_file = 'Data/movies.csv'
    rating_file = 'Data/ratings.csv'
    
    # Load data
    data, ratings, unique_genres = load_data(movie_file, rating_file)

    # Combine title and weighted genres for TF-IDF processing
    data['combined_text'] = data['title'] + " " + data['genres'] * 3  # Weighted genres
    tfidf_matrix, tfidf_vectorizer = compute_tfidf(data, 'combined_text')
    
    # User chooses recommendation type
    user_choice = input("Enter '1' for content-based or '2' for hybrid recommendation: ")
    if user_choice == '1':
        user_input = input("Enter movie description preferences: ")
        recommendations = recommend_content_based(user_input, data, tfidf_matrix, tfidf_vectorizer, unique_genres)
    elif user_choice == '2':
        user_id = int(input("Enter your user ID: "))
        recommendations = recommend_hybrid(user_id, data, ratings, tfidf_matrix, tfidf_vectorizer)
    else:
        print("Invalid choice.")
        recommendations = []

    # Display recommendations
    print("Recommended Movies:")
    for title, score in recommendations:
        print(f"{title} (Score: {score:.2f})")

Recommended Movies:
Children of Men (2006) (Score: 468.05)
Fire From Below (2009) (Score: 456.74)
Day After Tomorrow, The (2004) (Score: 445.39)
Jumper (2008) (Score: 435.52)
Batman (1943) (Score: 435.37)
