In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [2]:

# Read the two CSV tables from the dataset folder.
movies_df = pd.read_csv("Dataset_Movie/movies_metadata.csv")   # movie details
ratings_df = pd.read_csv("Dataset_Movie/ratings_small.csv")    # user-movie ratings


  movies_df = pd.read_csv("Dataset_Movie/movies_metadata.csv")  # Movie details


 #### Collaborative Filtering (User-Based)
 This method recommends movies based on what similar users have liked.

Steps:<br>
- Create a user-item matrix from the ratings dataset.

- Normalize the ratings by subtracting each user’s mean rating.

- Compute cosine similarity between users.

- Find the top 5 most similar users for a given user.

- Recommend movies that these similar users have rated highly.

In [4]:
# Reshape the long ratings table into a wide matrix:
# one row per userId, one column per movieId, cells hold the rating
# (NaN where the user has not rated the movie).
user_item_matrix = ratings_df.pivot(
    values="rating",
    index="userId",
    columns="movieId",
)

In [5]:
# Preview the first five rows of the user-item matrix
user_item_matrix.head(5)

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,4.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,4.0,...,,,,,,,,,,
5,,,4.0,,,,,,,,...,,,,,,,,,,


In [6]:
# Mean-center every user's ratings: subtract the row (per-user) mean from each
# rating in that row. Unrated cells stay NaN.
centered_ratings = user_item_matrix.sub(
    user_item_matrix.mean(axis="columns"),
    axis="index",
)

In [7]:
# Replace the remaining NaN cells (unrated movies) with 0 so that the matrix
# can be passed to cosine_similarity.
centered_ratings = centered_ratings.fillna(value=0)

In [8]:
# Pairwise cosine similarity between the users' centered rating vectors,
# wrapped in a DataFrame labelled by userId on both axes.
user_similarity = cosine_similarity(centered_ratings)
user_sim_df = pd.DataFrame(
    data=user_similarity,
    index=user_item_matrix.index,
    columns=user_item_matrix.index,
)

In [9]:
# Preview the first five rows of the user-user similarity table
user_sim_df.head(5)

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,0.00362,-0.002274,0.0,-0.070321,0.0,0.042632,0.0,...,0.0,0.0,0.018643,0.001031,0.0,0.0,0.0,0.044095,0.0,-0.013096
2,0.0,1.0,-0.001852,-0.004854,0.012639,0.0,0.042691,0.021066,0.011109,-0.007989,...,-0.018248,-0.021546,0.018902,-0.058952,0.028515,-0.106828,-0.007999,-0.041628,-0.090233,0.056258
3,0.0,-0.001852,1.0,0.018594,-0.025903,-0.0632,0.0549,0.026488,-0.036187,0.038021,...,0.044297,0.019581,0.070702,0.030669,0.143705,0.096713,0.027451,0.089297,-0.009815,0.062276
4,0.00362,-0.004854,0.018594,1.0,0.010801,0.019224,0.057519,0.05543,-0.010442,0.005126,...,0.011978,0.006569,0.027687,0.092092,0.021334,0.040833,0.018428,0.028642,0.019848,0.032749
5,-0.002274,0.012639,-0.025903,0.010801,1.0,-0.005843,-0.015075,-0.038886,0.013708,0.0305,...,0.046134,0.001903,0.00162,0.036819,-0.038269,-0.019537,-0.071721,0.00376,-0.029455,-0.036814


In [10]:
def find_similar_users(user_id, top_n=5):
    """Return the ids of the users most similar to ``user_id``.

    Similarities are read from the ``user_id`` column of the module-level
    ``user_sim_df`` frame and ranked from highest to lowest.

    Args:
        user_id: Label of the target user; must be a column of ``user_sim_df``
            (a ``KeyError`` is raised otherwise).
        top_n: Maximum number of neighbours to return.

    Returns:
        list: Up to ``top_n`` user ids, most similar first, never including
        ``user_id`` itself.
    """
    # Remove the user by label instead of slicing off the first sorted entry:
    # the self-similarity is not guaranteed to rank first (it is 0 for a user
    # whose centered rating vector is all zeros, and it can tie with another
    # user at 1.0), so a positional ``[1:]`` could discard a real neighbour
    # and return the user as their own neighbour.
    neighbours = user_sim_df[user_id].drop(labels=user_id, errors="ignore")
    return neighbours.sort_values(ascending=False).head(top_n).index.tolist()

#### Content-Based Filtering (Using TF-IDF)
This method recommends movies based on their content (descriptions, genres, or other metadata); this notebook uses only the `overview` text.

Steps:<br>
- Extract movie overview (description) text.

- Convert text into numerical vectors using TF-IDF (Term Frequency-Inverse Document Frequency).

- Compute cosine similarity between movies based on their descriptions.

- Recommend movies similar to what the user has watched.

In [12]:
# A missing overview becomes an empty string so that the TF-IDF vectorizer
# receives text for every row.
movies_df["overview"] = movies_df["overview"].fillna(value="")

In [13]:
# Keep only rows without a missing value in any column, then renumber the rows
# 0..n-1 so positions line up with the rows of the TF-IDF / similarity matrices.
# Reassigning (rather than inplace=True) keeps the cell safe to re-run, and
# drop=True avoids inserting the old index as an extra 'index' column
# (or 'level_0' when the cell is executed a second time).
# NOTE(review): dropna() considers every column, although only 'id', 'title'
# and 'overview' are used below — confirm that discarding those rows is intended.
movies_df = movies_df.dropna().reset_index(drop=True)

In [14]:
# Turn each movie overview into a TF-IDF vector (English stop words removed);
# row i of tfidf_matrix corresponds to row i of movies_df.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(raw_documents=movies_df["overview"])

In [15]:
# Movie-to-movie cosine similarity of the TF-IDF overview vectors;
# content_sim[i][j] compares row i and row j of movies_df.
content_sim = cosine_similarity(X=tfidf_matrix, Y=tfidf_matrix)

### Hybrid recommendation function 
We now combine Collaborative Filtering and Content-Based Filtering using a weighted approach.

Steps:<br>
- Find similar users using collaborative filtering.

- Get content-based similarity scores for movies.

- Combine both scores using a weight factor alpha:

- `final_score = alpha * collaborative_score + (1 - alpha) * content_score`

In [17]:

# Map each value of movies_df['id'] to its row position, which is also its
# row/column position in content_sim. If an id occurs more than once, the last
# position wins.
# NOTE(review): keys keep whatever dtype pandas inferred for 'id'; lookups with
# movieId values from the ratings table only hit if both use the same type and
# id space — confirm.
movie_id_to_index = dict(zip(movies_df["id"], range(len(movies_df))))


In [18]:
def hybrid_recommend(user_id, movie_id, alpha=0.5):
    """Score ``movie_id`` for ``user_id`` by blending two signals.

    * Collaborative score: the mean rating that the users returned by
      ``find_similar_users(user_id)`` gave to ``movie_id`` in
      ``user_item_matrix``; ``0`` when the movie is not a column of the matrix
      or none of those users rated it.
    * Content score: the mean ``content_sim`` value between ``movie_id`` and
      every movie the user has rated that is present in ``movie_id_to_index``;
      ``0`` when there is no such movie.

    Args:
        user_id: Label of the target user in ``user_item_matrix``.
        movie_id: Id of the candidate movie (a key of ``movie_id_to_index``).
        alpha: Weight of the collaborative score; the content score gets
            ``1 - alpha``.

    Returns:
        ``alpha * collab_score + (1 - alpha) * content_score``, or ``0`` when
        ``movie_id`` is not a key of ``movie_id_to_index``.

    Raises:
        KeyError: If ``user_id`` is unknown to the similarity / rating tables.
    """
    # Users most similar to the target user (also validates user_id).
    similar_users = find_similar_users(user_id)

    # Without a row in the content-similarity matrix no hybrid score exists.
    if movie_id not in movie_id_to_index:
        return 0
    movie_idx = movie_id_to_index[movie_id]

    # Collaborative part: average only the target movie's column instead of
    # averaging every column of the matrix on each call.
    collab_score = 0
    if movie_id in user_item_matrix.columns:
        neighbour_mean = user_item_matrix.loc[similar_users, movie_id].mean()
        # The mean is NaN when no similar user rated the movie; a NaN here
        # would turn the final score into NaN and make ranking unreliable.
        if not np.isnan(neighbour_mean):
            collab_score = neighbour_mean

    # Movies the target user has rated (NaN > 0 is False, so unrated are skipped).
    user_ratings = user_item_matrix.loc[user_id]
    watched_movies = user_ratings[user_ratings > 0].index

    # Content part: similarity between the target movie and each rated movie
    # that has a row in the similarity matrix.
    target_row = content_sim[movie_idx]
    content_scores = [
        target_row[movie_id_to_index[m]]
        for m in watched_movies
        if m in movie_id_to_index
    ]
    content_score = np.mean(content_scores) if content_scores else 0

    # NOTE(review): collab_score is on the rating scale while content_score is
    # a cosine similarity, so for equal alpha the collaborative term
    # presumably dominates — confirm whether rescaling is wanted.
    return alpha * collab_score + (1 - alpha) * content_score


In [19]:
def recommend_movies(user_id, top_n=5):
    """Return the titles of the ``top_n`` best-scoring movies for ``user_id``.

    Every id in ``movies_df['id']`` that the user has not rated yet is scored
    with ``hybrid_recommend`` and the highest scores win.

    Args:
        user_id: Label of the target user in ``user_item_matrix``.
        top_n: Number of titles to return.

    Returns:
        list: Movie titles, best score first (at most ``top_n`` entries).

    Raises:
        KeyError: If ``user_id`` is not a row of ``user_item_matrix``.
    """
    # Movies the user already rated must not be recommended back to them
    # (same "rating > 0" criterion that hybrid_recommend uses for watched movies).
    user_ratings = user_item_matrix.loc[user_id]
    already_rated = set(user_ratings[user_ratings > 0].index)

    # dict.fromkeys removes duplicate ids while keeping first-seen order.
    candidate_ids = [
        movie_id
        for movie_id in dict.fromkeys(movies_df["id"])
        if movie_id not in already_rated
    ]
    scores = {movie_id: hybrid_recommend(user_id, movie_id) for movie_id in candidate_ids}

    def _rank_key(item):
        # NaN compares False with everything and would make the sort order
        # arbitrary, so rank a NaN score below every real score.
        score = item[1]
        return float("-inf") if pd.isna(score) else score

    # Sort movies by highest hybrid score
    recommended_movies = sorted(scores.items(), key=_rank_key, reverse=True)[:top_n]

    # One id -> title lookup table (first row wins for duplicate ids) instead
    # of scanning the whole frame once per recommended movie.
    titles_by_id = movies_df.drop_duplicates(subset="id").set_index("id")["title"]
    return [titles_by_id.loc[movie_id] for movie_id, _ in recommended_movies]

    

In [20]:
# Top recommendations (default top_n) for the user with id 1
recommend_movies(user_id=1)

['GoldenEye',
 'Friday',
 'From Dusk Till Dawn',
 'Blue in the Face',
 'Mighty Morphin Power Rangers: The Movie']

Above are the movies recommended to the user with id 1.