In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Reader, Dataset, SVD
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse

In [3]:
def load_data():
    # File paths (update these based on your local path)
    ratings_path = 'ml-100k/u.data'
    movies_path = 'ml-100k/u.item'
    
    # Load ratings
    ratings = pd.read_csv(ratings_path, sep='\t', names=['userId', 'movieId', 'rating', 'timestamp'])
    
    # Load movies
    movies_columns = ['movieId', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 
                      'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 
                      'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 
                      'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
    movies = pd.read_csv(movies_path, sep='|', names=movies_columns, encoding='latin-1')
    
    # Create a 'genres' column by combining genre flags
    genre_cols = movies_columns[5:]
    movies['genres'] = movies[genre_cols].apply(lambda x: ' '.join([col for col in genre_cols if x[col] == 1]), axis=1)
    
    # Select relevant columns
    movies = movies[['movieId', 'title', 'genres']]
    
    # Merge ratings and movies
    data = pd.merge(ratings, movies, on='movieId')
    
    return ratings, movies, data

In [4]:
def build_content_based_model(movies):
    # Initialize TF-IDF Vectorizer
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(movies['genres'])
    
    # Compute cosine similarity matrix
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    return cosine_sim

def get_content_based_recommendations(title, movies, cosine_sim, top_n=10):
    # Get the index of the movie that matches the title
    idx = movies[movies['title'].str.contains(title, case=False, na=False)].index
    if len(idx) == 0:
        return "Movie not found!"
    idx = idx[0]
    
    # Get pairwise similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort movies based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get top N similar movies (excluding the movie itself)
    sim_scores = sim_scores[1:top_n+1]
    movie_indices = [i[0] for i in sim_scores]
    
    return movies['title'].iloc[movie_indices].tolist()

In [5]:
def build_collaborative_model(ratings):
    # Prepare data for Surprise
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
    
    # Split into train and test sets
    trainset, testset = train_test_split(data, test_size=0.25, random_state=42)
    
    # Train SVD model
    algo = SVD()
    algo.fit(trainset)
    
    # Evaluate model
    predictions = algo.test(testset)
    print("Collaborative Filtering RMSE:", rmse(predictions))
    
    return algo, trainset

def get_collaborative_recommendations(user_id, algo, movies, ratings, top_n=10):
    # Get all movie IDs
    all_movie_ids = movies['movieId'].unique()
    
    # Get movies already rated by the user
    rated_movies = ratings[ratings['userId'] == user_id]['movieId'].tolist()
    
    # Predict ratings for unrated movies
    predictions = []
    for movie_id in all_movie_ids:
        if movie_id not in rated_movies:
            pred = algo.predict(user_id, movie_id)
            predictions.append((movie_id, pred.est))
    
    # Sort by predicted rating
    predictions.sort(key=lambda x: x[1], reverse=True)
    
    # Get top N movie IDs
    top_movie_ids = [x[0] for x in predictions[:top_n]]
    
    # Return movie titles
    return movies[movies['movieId'].isin(top_movie_ids)]['title'].tolist()

In [6]:
def get_hybrid_recommendations(user_id, title, movies, ratings, cosine_sim, algo, top_n=10):
    # Get content-based recommendations
    content_recs = get_content_based_recommendations(title, movies, cosine_sim, top_n=20)
    if isinstance(content_recs, str):
        return content_recs
    
    # Get movie IDs for content-based recommendations
    content_movie_ids = movies[movies['title'].isin(content_recs)]['movieId'].tolist()
    
    # Predict ratings for these movies using collaborative filtering
    predictions = []
    for movie_id in content_movie_ids:
        pred = algo.predict(user_id, movie_id)
        predictions.append((movie_id, pred.est))
    
    # Sort by predicted rating
    predictions.sort(key=lambda x: x[1], reverse=True)
    
    # Get top N movie IDs
    top_movie_ids = [x[0] for x in predictions[:top_n]]
    
    # Return movie titles
    return movies[movies['movieId'].isin(top_movie_ids)]['title'].tolist()

In [11]:
if __name__ == "__main__":
    # Load data
    ratings, movies, data = load_data()
    
    # Build content-based model
    cosine_sim = build_content_based_model(movies)
    
    # Build collaborative model
    algo, trainset = build_collaborative_model(ratings)
    
    # Example: Content-based recommendations
    print("\nContent-Based Recommendations for 'Kick':")
    print(get_content_based_recommendations('Kick', movies, cosine_sim))
    
    # Example: Collaborative recommendations for user ID 1
    print("\nCollaborative Recommendations for User ID 1:")
    print(get_collaborative_recommendations(1, algo, movies, ratings))
    
    # Example: Hybrid recommendations for user ID 1 based on 'Toy Story'
    print("\nHybrid Recommendations for User ID 1 based on 'Kick':")
    print(get_hybrid_recommendations(1, 'Kick', movies, ratings, cosine_sim, algo))

RMSE: 0.9445
Collaborative Filtering RMSE: 0.9444806865851452

Content-Based Recommendations for 'Kick':
['Eat Drink Man Woman (1994)', 'Ed Wood (1994)', "What's Eating Gilbert Grape (1993)", 'Welcome to the Dollhouse (1995)', 'Swingers (1996)', 'Citizen Ruth (1996)', 'As Good As It Gets (1997)', 'Deconstructing Harry (1997)', 'Wag the Dog (1997)', 'Adventures of Priscilla, Queen of the Desert, The (1994)']

Collaborative Recommendations for User ID 1:
['Secrets & Lies (1996)', 'L.A. Confidential (1997)', "Schindler's List (1993)", "One Flew Over the Cuckoo's Nest (1975)", 'Close Shave, A (1995)', 'To Kill a Mockingbird (1962)', 'Fantasia (1940)', 'North by Northwest (1959)', 'Killing Fields, The (1984)', 'Rear Window (1954)']

Hybrid Recommendations for User ID 1 based on 'Kick':
['Eat Drink Man Woman (1994)', 'Ed Wood (1994)', "What's Eating Gilbert Grape (1993)", 'Welcome to the Dollhouse (1995)', 'Swingers (1996)', 'Citizen Ruth (1996)', 'As Good As It Gets (1997)', 'Apartment, The