In [26]:
# ==== INSTALL REQUIRED PACKAGES ====

!pip install numpy==1.26.4                    # fix numpy for surprise
!pip install pandas scikit-learn seaborn matplotlib scikit-surprise   # core libs



In [27]:
# ==== IMPORTS ====

import pandas as pd                            # data handling
import numpy as np                              # numerical ops
import seaborn as sns                           # visualization
import matplotlib.pyplot as plt                 # plotting

# surprise library models
from surprise import Dataset, Reader, KNNBasic, KNNWithMeans, SVD
from surprise.model_selection import cross_validate



In [28]:
# ==== LOAD DATA ====

MOVIES_CSV = "/content/movies.csv"              # catalogue: movieId, title, genres
RATINGS_CSV = "/content/ratings.csv"            # events: userId, movieId, rating, timestamp

movies = pd.read_csv(MOVIES_CSV)                # one row per movie
ratings = pd.read_csv(RATINGS_CSV)              # one row per (user, movie) rating

movies.head(), ratings.head()                   # preview both frames as a tuple


(   movieId                               title  \
 0        1                    Toy Story (1995)   
 1        2                      Jumanji (1995)   
 2        3             Grumpier Old Men (1995)   
 3        4            Waiting to Exhale (1995)   
 4        5  Father of the Bride Part II (1995)   
 
                                         genres  
 0  Adventure|Animation|Children|Comedy|Fantasy  
 1                   Adventure|Children|Fantasy  
 2                               Comedy|Romance  
 3                         Comedy|Drama|Romance  
 4                                       Comedy  ,
    userId  movieId  rating  timestamp
 0       1        1     4.0  964982703
 1       1        3     4.0  964981247
 2       1        6     4.0  964982224
 3       1       47     5.0  964983815
 4       1       50     5.0  964982931)

In [29]:
# ==== PREPROCESS MOVIES ====

# Keep one row per movieId, then renumber rows 0..n-1 so that a row's index label
# always equals its position. Arrays built row-by-row from this frame (e.g. a
# similarity matrix) can then be addressed with either one without drifting apart
# when duplicates were actually removed.
movies = movies.drop_duplicates(subset='movieId').reset_index(drop=True)

movies['title'] = movies['title'].str.lower()              # lowercase titles
movies['genres'] = movies['genres'].str.lower()            # lowercase genres

# genres arrive pipe-separated; turn them into space-separated tokens
movies['genres'] = movies['genres'].str.replace("|"," ",regex=False)

# free-text description per movie: "<title> <genre tokens>"
movies['tags'] = movies['title'] + " " + movies['genres']

movies.head()                                              # preview processed movies

Unnamed: 0,movieId,title,genres,tags
0,1,toy story (1995),adventure animation children comedy fantasy,toy story (1995) adventure animation children ...
1,2,jumanji (1995),adventure children fantasy,jumanji (1995) adventure children fantasy
2,3,grumpier old men (1995),comedy romance,grumpier old men (1995) comedy romance
3,4,waiting to exhale (1995),comedy drama romance,waiting to exhale (1995) comedy drama romance
4,5,father of the bride part ii (1995),comedy,father of the bride part ii (1995) comedy


In [30]:

# ==== PREPROCESS RATINGS ====

# Drop exact duplicate rows and coerce the three modelling columns in one pass.
ratings = ratings.drop_duplicates().astype(
    {'rating': float, 'movieId': int, 'userId': int}
)

# Keep only ratings inside the closed interval [0.5, 5].
ratings = ratings[ratings['rating'].between(0.5, 5)]

ratings.head()                                             # preview processed ratings


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [31]:
# ==== PREPARE SURPRISE DATA ====

# The rating scale is taken from the data itself rather than hard-coded.
lowest_rating = ratings['rating'].min()
highest_rating = ratings['rating'].max()
reader = Reader(rating_scale=(lowest_rating, highest_rating))

# surprise expects exactly three columns in the order: user, item, rating.
rating_triples = ratings[['userId','movieId','rating']]
data = Dataset.load_from_df(rating_triples, reader)



In [32]:
# ==== MODEL TESTING ====

# Candidate algorithms, keyed by the label used in the progress output and in `results`.
models = {
    # user-based KNN with cosine similarity
    "KNNBasic": KNNBasic(sim_options={'name':'cosine','user_based':True}),
    # same neighbourhood setup, but using the KNNWithMeans variant
    "KNNWithMeans": KNNWithMeans(sim_options={'name':'cosine','user_based':True}),
    # matrix factorization with library defaults
    "SVD": SVD(),
}

# label -> (mean test RMSE, mean test MAE) over the 3 folds
results = {}

for name, model in models.items():
    print(f"Training {name}...")
    cv = cross_validate(model, data, measures=['RMSE','MAE'], cv=3, verbose=False)
    mean_rmse = cv['test_rmse'].mean()
    mean_mae = cv['test_mae'].mean()
    results[name] = (mean_rmse, mean_mae)

results                                                                               # display scores



Training KNNBasic...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Training KNNWithMeans...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Training SVD...


{'KNNBasic': (0.9776585841559652, 0.7530903104919271),
 'KNNWithMeans': (0.9074900245600547, 0.6936216741375419),
 'SVD': (0.8794096448861387, 0.6760157987131045)}

In [33]:
# ==== SELECT BEST MODEL ====

# Each entry is (label, (rmse, mae)); pick the label whose RMSE is smallest.
best_model_name = min(results.items(), key=lambda entry: entry[1][0])[0]
best_model_name                                                  # show best model



'SVD'

In [34]:
# ==== TRAIN BEST MODEL ====

# Cross-validation only fitted on folds; refit the winner on every available rating.
trainset = data.build_full_trainset()
best_model = models[best_model_name]
best_model.fit(trainset)

print("Best model selected:", best_model_name)


Best model selected: SVD


In [35]:
# ==== CONTENT MODEL (TF-IDF) ====

from sklearn.feature_extraction.text import TfidfVectorizer     # text vectorizer
from sklearn.metrics.pairwise import cosine_similarity          # similarity metric

# Vectorize each movie's "tags" text; English stop words are dropped.
tag_corpus = movies['tags']
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(tag_corpus)

# Pairwise movie-to-movie similarity; row/column i corresponds to row i of `movies`.
cosine_sim = cosine_similarity(tfidf_matrix)

print("Content model ready!")



Content model ready!


In [36]:
# ==== COLLABORATIVE SCORE FUNCTION ====

def predict_collab_score(movie_id, sample_users=150, random_state=42):
    """Estimate a movie's collaborative score as the mean predicted rating.

    The fitted ``best_model`` is asked for a prediction for ``movie_id`` for each
    user in a sample of the users present in ``ratings``; the predictions are averaged.

    Args:
        movie_id: Identifier of the movie to score (cast to ``int`` before predicting).
        sample_users: Maximum number of users to predict for. When ``ratings`` holds
            more distinct users than this, a sample without replacement is drawn.
        random_state: Seed for the user sample. With a fixed seed every call scores
            against the same users, so scores of different movies are comparable and
            reproducible. Pass ``None`` to draw a fresh random sample on each call.

    Returns:
        The mean of the collected predictions, or ``0.0`` when none were collected.
    """
    users = ratings['userId'].unique()                           # distinct users

    if len(users) > sample_users:                                # cap the work per call
        rng = np.random.default_rng(random_state)
        users = rng.choice(users, sample_users, replace=False)

    preds = []                                                   # predicted ratings

    for u in users:
        try:
            preds.append(best_model.predict(int(u), int(movie_id)).est)
        except Exception:
            # Best effort: skip users the model cannot score, but let
            # KeyboardInterrupt / SystemExit propagate.
            continue

    return np.mean(preds) if preds else 0.0                      # average collab score


In [38]:
# ==== HYBRID RECOMMENDER ====

def hybrid_recommend(movie_title, topk=10, n_candidates=50):
    """Hybrid = 70% content + 30% collaborative.

    The ``n_candidates`` movies most similar to the requested title (by
    ``cosine_sim``) are re-ranked by
    ``0.7 * content_similarity + 0.3 * (collab_score / 5)``.

    Args:
        movie_title: Title to look up; compared after lowercasing and stripping.
        topk: Number of recommendations to return.
        n_candidates: How many most-similar movies to consider for re-ranking.

    Returns:
        A DataFrame with the ``title`` and ``genres`` columns of the top ``topk``
        candidates, best first. The requested movie itself is never included.

    Raises:
        ValueError: If no row of ``movies`` has the normalized title.
    """
    movie_title = movie_title.lower().strip()                     # normalize title

    # Locate the movie by row POSITION: cosine_sim is addressed positionally, and
    # the frame's index labels are not guaranteed to equal positions.
    title_hits = np.flatnonzero((movies['title'] == movie_title).to_numpy())
    if title_hits.size == 0:
        raise ValueError("Movie not found in dataset!")           # handle missing
    idx = int(title_hits[0])                                      # first match wins

    sims = np.asarray(cosine_sim[idx])                            # similarity to every movie

    # Most similar first; the stable sort keeps ties in row order. The query is
    # removed explicitly rather than assumed to sort first, since another movie
    # can tie with it at similarity 1.0.
    order = np.argsort(-sims, kind='stable')
    candidates = order[order != idx][:n_candidates]

    movie_ids = movies['movieId'].to_numpy()

    scored = []                                                   # (row position, hybrid score)
    for cand_idx in candidates:
        collab_score = predict_collab_score(movie_ids[cand_idx])  # mean predicted rating
        # collab_score is divided by 5 to bring it to the same 0..1 range as the similarity
        final_score = 0.7*sims[cand_idx] + 0.3*(collab_score/5)
        scored.append((int(cand_idx), final_score))

    scored.sort(key=lambda pair: pair[1], reverse=True)           # best hybrid score first

    top_idx = [pos for pos, _ in scored[:topk]]                   # top k row positions

    return movies.iloc[top_idx][['title','genres']]               # return recommendations


In [39]:
# ==== ENABLE WIDGET SUPPORT ====

!pip install ipywidgets                                  # install widgets
from google.colab import output
output.enable_custom_widget_manager()                    # enable UI widgets





In [40]:
# ==== CSV DOWNLOAD HELPER ====

from google.colab import files                           # file download tool

def download_recommendations_as_csv(df, filename="recommendations.csv"):
    """Write ``df`` to a CSV file and hand that file to the Colab download helper.

    Args:
        df: Table of recommendations; written without its index column.
        filename: Path of the CSV to create and then download.
    """
    df.to_csv(path_or_buf=filename, index=False)
    files.download(filename)


In [41]:
# ==== MOVIE PICKER UI WITH DOWNLOAD ====

import ipywidgets as widgets                              # ui components
from IPython.display import display                       # display widgets

# Alphabetical list of every (lowercased) title for the picker.
movie_titles = sorted(movies['title'].tolist())

movie_dropdown = widgets.Dropdown(
    options=movie_titles,
    description='Pick Movie:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='400px'),
)

# Green button that triggers the recommendation run.
run_button = widgets.Button(
    description="Recommend Movies",
    button_style='success',
    layout=widgets.Layout(width='200px'),
)

# Blue button that exports the most recent recommendations.
download_button = widgets.Button(
    description="Download CSV",
    button_style='info',
    layout=widgets.Layout(width='200px'),
)

# Shared output pane, and the slot holding the latest recommendation table
# (None until the first successful run).
output_area = widgets.Output()
recommendations_df = None


def on_recommend_clicked(b):
    """Button callback: recommend for the selected movie and show the table.

    Clears the output pane, prints the current dropdown selection, computes the
    top-10 hybrid recommendations and stores them in the module-level
    ``recommendations_df`` so the download button can export them.

    Args:
        b: The clicked button; not used.
    """
    global recommendations_df

    output_area.clear_output()                            # drop the previous run's output

    with output_area:
        chosen_title = movie_dropdown.value
        print("\nSelected movie:", chosen_title)

        recs = hybrid_recommend(chosen_title, topk=10)
        recommendations_df = recs                         # remember for the CSV export
        display(recs)


def on_download_clicked(b):
    """Button callback: export the latest recommendations as a CSV download.

    If no recommendations have been stored yet, a reminder is printed into the
    output pane instead.

    Args:
        b: The clicked button; not used.
    """
    if recommendations_df is None:
        # Nothing to export yet.
        with output_area:
            print(" Run recommendations first!")
        return

    download_recommendations_as_csv(recommendations_df)


# Hook each button up to its handler (the two registrations are independent).
download_button.on_click(on_download_clicked)
run_button.on_click(on_recommend_clicked)

# Render the picker, both buttons and the shared output pane, in that order.
display(movie_dropdown, run_button, download_button, output_area)

Dropdown(description='Pick Movie:', layout=Layout(width='400px'), options=("'71 (2014)", "'burbs, the (1989)",…

Button(button_style='success', description='Recommend Movies', layout=Layout(width='200px'), style=ButtonStyle…

Button(button_style='info', description='Download CSV', layout=Layout(width='200px'), style=ButtonStyle())

Output()