In [2]:
import pandas as pd
# Load datasets
ratings = pd.read_csv('/kaggle/input/the-movies-dataset/ratings.csv')
movies = pd.read_csv('/kaggle/input/the-movies-dataset/movies_metadata.csv', low_memory=False) #he whole file is read in one go rather than in chunks.

# Display samples
print("Ratings:\n", ratings.head())
print("Movies Metadata:\n", movies[['title', 'genres', 'overview']].head())


Ratings:
    userId  movieId  rating   timestamp
0       1      110     1.0  1425941529
1       1      147     4.5  1425942435
2       1      858     5.0  1425941523
3       1     1221     5.0  1425941546
4       1     1246     5.0  1425941556
Movies Metadata:
                          title  \
0                    Toy Story   
1                      Jumanji   
2             Grumpier Old Men   
3            Waiting to Exhale   
4  Father of the Bride Part II   

                                              genres  \
0  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   
1  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...   
2  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...   
3  [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...   
4                     [{'id': 35, 'name': 'Comedy'}]   

                                            overview  
0  Led by Woody, Andy's toys live happily in his ...  
1  When siblings Judy and Peter discover an encha...  
2  A family wedding reignites t

In [3]:
import ast

def parse_genres(genres_str):
    try:
        genres_list = ast.literal_eval(genres_str)  # convert string to list of dicts, built in.
        return [genre['name'] for genre in genres_list]  # extract names
    except Exception as e:
        return []  # return empty list if parsing fails
#creating a new column named parsed genres in movies dataframe
movies['parsed_genres'] = movies['genres'].apply(parse_genres) #apply the function each row

print(movies[['title', 'parsed_genres']].head(10))


                         title                       parsed_genres
0                    Toy Story         [Animation, Comedy, Family]
1                      Jumanji        [Adventure, Fantasy, Family]
2             Grumpier Old Men                   [Romance, Comedy]
3            Waiting to Exhale            [Comedy, Drama, Romance]
4  Father of the Bride Part II                            [Comedy]
5                         Heat    [Action, Crime, Drama, Thriller]
6                      Sabrina                   [Comedy, Romance]
7                 Tom and Huck  [Action, Adventure, Drama, Family]
8                 Sudden Death       [Action, Adventure, Thriller]
9                    GoldenEye       [Adventure, Action, Thriller]


In [None]:
# print(movies.head())
# print('Blank Space ')
# print(ratings.head())

In [4]:
#in this step we ensure that the id values are numeric 
movies['id']=pd.to_numeric(movies['id'],errors='coerce') #errors ='coerce' will convert invalid values in NaN
ratings['movieId'] = pd.to_numeric(ratings['movieId'], errors='coerce')

#new dataframe
merged_df=pd.merge(movies,ratings,left_on='id',right_on='movieId')
print(merged_df.head(3))

   adult                              belongs_to_collection    budget  \
0  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
1  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
2  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   

                                              genres  \
0  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   
1  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   
2  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   

                               homepage     id    imdb_id original_language  \
0  http://toystory.disney.com/toy-story  862.0  tt0114709                en   
1  http://toystory.disney.com/toy-story  862.0  tt0114709                en   
2  http://toystory.disney.com/toy-story  862.0  tt0114709                en   

  original_title                                           overview  ...  \
0      Toy Story  Led by Woody, Andy's toys live happily in his ...  ...   
1      Toy St

In [5]:
missing_overview = merged_df[merged_df['overview'].isna()]
print(missing_overview.head(3))
#checking for NaN values

         adult belongs_to_collection budget genres homepage        id  \
685437   False                   NaN      0     []      NaN  145925.0   
2543054  False                   NaN      0     []      NaN  134368.0   
2543055  False                   NaN      0     []      NaN  134368.0   

           imdb_id original_language   original_title overview  ... tagline  \
685437   tt0107575                it  Mille bolle blu      NaN  ...     NaN   
2543054  tt0122642                en    One Tough Cop      NaN  ...     NaN   
2543055  tt0122642                en    One Tough Cop      NaN  ...     NaN   

                   title  video vote_average vote_count  parsed_genres  \
685437   Mille bolle blu  False          6.0        1.0             []   
2543054    One Tough Cop  False          3.0        3.0             []   
2543055    One Tough Cop  False          3.0        3.0             []   

         userId movieId rating   timestamp  
685437   255325  145925    3.0  1466989379  
254

In [6]:
#creting a new dataframe for recommendations
movie_list=merged_df[['title','genres','overview','tagline']].drop_duplicates(subset='title')
#now we remove rows with missing overviews
movie_list=movie_list.dropna(subset='overview')

print(movie_list.head(10))

                       title  \
0                  Toy Story   
374                  Jumanji   
497                     Heat   
2842               GoldenEye   
4161        Cutthroat Island   
18049                 Casino   
27212  Sense and Sensibility   
27489             Four Rooms   
42747             Get Shorty   
43370                Copycat   

                                                  genres  \
0      [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   
374    [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...   
497    [{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...   
2842   [{'id': 12, 'name': 'Adventure'}, {'id': 28, '...   
4161   [{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...   
18049  [{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...   
27212  [{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...   
27489  [{'id': 80, 'name': 'Crime'}, {'id': 35, 'name...   
42747  [{'id': 35, 'name': 'Comedy'}, {'id': 53, 'nam...   
43370  [{'id': 18, 'name': 'Drama'}, {'id': 53,

In [7]:
movie_list = movie_list.reset_index(drop=True)
print(movie_list.head(5))

              title                                             genres  \
0         Toy Story  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   
1           Jumanji  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...   
2              Heat  [{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...   
3         GoldenEye  [{'id': 12, 'name': 'Adventure'}, {'id': 28, '...   
4  Cutthroat Island  [{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...   

                                            overview  \
0  Led by Woody, Andy's toys live happily in his ...   
1  When siblings Judy and Peter discover an encha...   
2  Obsessive master thief, Neil McCauley leads a ...   
3  James Bond must unmask the mysterious head of ...   
4  Morgan Adams and her slave, William Shaw, are ...   

                                             tagline  
0                                                NaN  
1          Roll the dice and unleash the excitement!  
2                           A Los Angeles Crime Saga 

In [9]:
#applying tools 
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer: remove stopwords, convert to lowercase automatically
tfidf = TfidfVectorizer(stop_words='english')

# Transform the overview column into a TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(movie_list['overview'])

# Check shape of matrix
print(tfidf_matrix.shape)


(7281, 29584)


In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity matrix from the TF-IDF matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

print(cosine_sim.shape)  # Should be (num_movies, num_movies)


(7281, 7281)


In [13]:
# Create reverse mapping of titles to index
indices = pd.Series(movie_list.index, index=movie_list['title']).drop_duplicates()

# Define recommendation function
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies (excluding itself)
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return movie_list['title'].iloc[movie_indices]


In [14]:
#testing
# print(get_recommendations('Inception'))
test_titles_1 = [
    "Inception",
    "The Dark Knight",
    "The Godfather",
    "Titanic",
    "The Matrix",
    "Forrest Gump",
    "Interstellar",
    "Pulp Fiction",
    "The Silence of the Lambs",
    
    "The Shawshank Redemption",
    "Spirited Away"
]

for title in test_titles_1:
    print(f"Input: {title}")
    print(get_recommendations(title), end="\n\n")

Input: Inception
2797                  Primeval
43                       Crumb
2512     A History of Violence
1705                    Duplex
6378                  Blockade
1144    When Brendan Met Trudy
3243             Today You Die
2864             Benny's Video
4148          Dreams of a Life
5577                 Maidstone
Name: title, dtype: object

Input: The Dark Knight
426      Batman Returns
37       Batman Forever
158              Batman
2445      Batman Begins
929                 JFK
1577              Q & A
471      Batman & Robin
2047    To End All Wars
3910    Sherlock Holmes
3887         Tight Spot
Name: title, dtype: object

Input: The Godfather
341      The Godfather: Part II
587     The Godfather: Part III
2735           Household Saints
3894          Short Sharp Shock
2177               Violent City
5684          Gang War in Milan
1806            Queen of Hearts
3632            Murder, He Says
2672             The Public Eye
6369    The Condemned of Altona
Name: title, 

In [11]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

# 1. Load SBERT model (light and fast: "all-MiniLM-L6-v2")
model = SentenceTransformer('all-MiniLM-L6-v2')

# 2. Encode all overviews into 384-dimension semantic vectors
overview_embeddings = model.encode(movie_list['overview'], show_progress_bar=True)

# 3. Compute cosine similarity matrix
cosine_sim_sbert = cosine_similarity(overview_embeddings, overview_embeddings)

# 4. Reverse mapping of title to index
indices = pd.Series(movie_list.index, index=movie_list['title']).drop_duplicates()

# 5. Semantic recommendation function
def get_semantic_recommendations(title, cosine_sim=cosine_sim_sbert, top_n=10):
    if title not in indices:
        return f"'{title}' not found in the movie list."

    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]  # skip the movie itself
    movie_indices = [i[0] for i in sim_scores]
    return movie_list['title'].iloc[movie_indices]


Batches:   0%|          | 0/228 [00:00<?, ?it/s]

In [12]:

test_titles = [
    "Inception",
    "The Dark Knight",
    "The Godfather",
    "Titanic",
    "The Matrix",
    "Forrest Gump",
    "Interstellar",
    "Pulp Fiction",
    "The Silence of the Lambs",
    "La La Land",
    "The Shawshank Redemption",
    "Spirited Away"
]

for title in test_titles:
    print(f"Input: {title}")
    print(get_semantic_recommendations(title), end="\n\n")


Input: Inception
433                     Sneakers
5482    The Man Who Wouldn't Die
447               Absolute Power
2519         Kiss Kiss Bang Bang
4829      The Mind of Mr. Soames
3243               Today You Die
1493                       I Spy
4493                    Paranoia
50                       The Net
5545              Dead on Course
Name: title, dtype: object

Input: The Dark Knight
158                   Batman
2445           Batman Begins
426           Batman Returns
471           Batman & Robin
37            Batman Forever
4239                  Recoil
837                  RoboCop
6163    Another Public Enemy
1824            The Enforcer
6201             Double Bang
Name: title, dtype: object

Input: The Godfather
341      The Godfather: Part II
587     The Godfather: Part III
2823         Salvatore Giuliano
6137    The Most Beautiful Wife
2240        The Stolen Children
454             The Devil's Own
2512      A History of Violence
1631          This Man Must Die
2543   

In [15]:
# Install surprise if not installed
# !pip install scikit-surprise

from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
import pandas as pd

# Assuming 'ratings' and 'movie_list' DataFrames are already loaded in your environment

# Use only relevant columns: userId, movieId, rating
ratings_1 = ratings[['userId', 'movieId', 'rating']]

# Define reader with rating scale
reader = Reader(rating_scale=(ratings_1.rating.min(), ratings_1.rating.max()))

# Load data into Surprise format
data = Dataset.load_from_df(ratings_1[['userId', 'movieId', 'rating']], reader)

# Split data into training and test sets
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

# Initialize SVD algorithm
svd = SVD()

# Train the model on the training set
svd.fit(trainset)

# Evaluate on the test set
predictions = svd.test(testset)
rmse = accuracy.rmse(predictions)

print(f"Test RMSE: {rmse}")

# Find top 500 movies by number of ratings (popularity)
top_1000_movie_ids = ratings_1['movieId'].value_counts().head(500).index.tolist()

# Function to get top-N recommendations for a given user limited to top 1000 popular movies
def get_svd_recommendations(user_id, n=10):
    # Limit to top 1000 popular movie IDs
    all_movie_ids = top_1000_movie_ids
    
    # Get movies already rated by user
    rated_movies = ratings_1[ratings_1['userId'] == user_id]['movieId']
    
    # Predict ratings for movies not rated yet
    predictions = []
    for movie_id in all_movie_ids:
        if movie_id not in rated_movies.values:
            pred = svd.predict(user_id, movie_id)
            predictions.append((movie_id, pred.est))
    
    # Sort predictions by estimated rating descending
    predictions.sort(key=lambda x: x[1], reverse=True)
    
    # Get top-n movie IDs
    top_movie_ids = [pred[0] for pred in predictions[:n]]
    
    # Map movie IDs to titles
    recommended_titles = movie_list.loc[movie_list.index.isin(top_movie_ids), 'title']
    
    return recommended_titles

# Example: get recommendations for user with userId=1
print(get_svd_recommendations(user_id=1, n=10))


RMSE: 0.7998
Test RMSE: 0.7998247555712416
541                       Godzilla
778                  Deep Blue Sea
908                      The Beach
912           The Whole Nine Yards
1193              The Sand Pebbles
2858              Perfect Stranger
2997                     Bee Movie
4973    The Emperor's Candlesticks
5618                      Cavegirl
Name: title, dtype: object


In [16]:
def hybrid_recommendations(user_id=None, movie_title=None, n=10):
    # If movie_title is provided, do content-based first
    if movie_title:
        try:
            idx = indices[movie_title]
        except KeyError:
            return f"Movie '{movie_title}' not found in database."

        # Get top 25 similar movies (more than needed, to filter with SVD)
        sim_scores = list(enumerate(cosine_sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:26]
        movie_indices = [i[0] for i in sim_scores]

        candidate_movies = movie_list.iloc[movie_indices]

        # If user_id given, predict rating and personalize
        if user_id:
            preds = []
            for mid in candidate_movies.index:
                movie_id = movie_list.loc[mid, 'movieId'] if 'movieId' in movie_list.columns else None
                if movie_id is not None:
                    pred = svd.predict(user_id, movie_id)
                    preds.append((mid, pred.est))
                else:
                    preds.append((mid, 0))  # default low score if no movieId
                
            preds.sort(key=lambda x: x[1], reverse=True)
            top_indices = [p[0] for p in preds[:n]]
            return movie_list.loc[top_indices, 'title']
        
        # Else just return top n similar movies (content-based only)
        return candidate_movies['title'].head(n)
    
    # If only user_id given, do collaborative filtering recommendation
    elif user_id:
        return get_svd_recommendations(user_id, n)
    
    else:
        return "Please provide a user ID or a movie title for recommendations."


In [17]:
test_movies = [
    "Fight Club", "The Shawshank Redemption", "The Lord of the Rings: The Fellowship of the Ring",
    "The Social Network", "The Notebook", "Avengers: Endgame",
    "Schindler's List", "Her", "The Grand Budapest Hotel", "Joker"
]

for title in test_movies:
    print(f"\nRecommendations for: {title}")
    print(hybrid_recommendations(user_id=123, movie_title=title, n=5))



Recommendations for: Fight Club
4738            In Fear
4867    Father and Guns
347         Raging Bull
440          Angel Baby
1460     The Experiment
Name: title, dtype: object

Recommendations for: The Shawshank Redemption
5795     The Farm: Angola, USA
5645      The Domino Principle
2113             Sherlock, Jr.
4095           The Peach Thief
2488    The 40 Year Old Virgin
Name: title, dtype: object

Recommendations for: The Lord of the Rings: The Fellowship of the Ring
603                             The Lord of the Rings
3689                           The Return of the King
1813    The Lord of the Rings: The Return of the King
1531            The Lord of the Rings: The Two Towers
347                                       Raging Bull
Name: title, dtype: object

Recommendations for: The Social Network
Movie 'The Social Network' not found in database.

Recommendations for: The Notebook
Movie 'The Notebook' not found in database.

Recommendations for: Avengers: Endgame
Movie 'Aveng