In [3]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [4]:
# Load the two MovieLens tables: movie metadata and user ratings
movies = pd.read_csv('/content/movies.csv')
ratings = pd.read_csv('/content/ratings.csv')

# Quick look at the first five rows of each table
print(movies.head(), ratings.head(), sep='\n')


   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [5]:
# Column labels of the movies table
movies.columns

Index(['movieId', 'title', 'genres'], dtype='object')

In [7]:
# Column labels of the ratings table
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [8]:
# (rows, columns) of the ratings table
ratings.shape

(100836, 4)

In [9]:
# (rows, columns) of the movies table
movies.shape

(9742, 3)

In [10]:
# Summary statistics; describe() reports numeric columns only, which here is just movieId
movies.describe()

Unnamed: 0,movieId
count,9742.0
mean,42200.353623
std,52160.494854
min,1.0
25%,3248.25
50%,7300.0
75%,76232.0
max,193609.0


In [11]:
# Summary statistics of the numeric columns; the min/max of `rating` give the rating scale
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Create TF-IDF matrix for genres
# Genres are stored as one pipe-delimited string per movie
# (e.g. "Adventure|Animation|Children|Comedy|Fantasy"), so tokenise on "|"
# and keep every genre label as a single feature. The default word tokeniser
# treats punctuation as a separator and would split a hyphenated label such as
# "Sci-Fi" into two unrelated tokens. Stop-word filtering is dropped because it
# targets natural-language text, not categorical labels.
tfidf = TfidfVectorizer(token_pattern=r'[^|]+')
movies['genres'] = movies['genres'].fillna('')
tfidf_matrix = tfidf.fit_transform(movies['genres'])

# Compute similarity
# TF-IDF rows are L2-normalised by default, so the plain dot product computed
# by linear_kernel is the cosine similarity.
# NOTE: the result is a dense (n_movies x n_movies) array, so memory grows
# quadratically with the number of movies.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Function to recommend movies
def recommend_movies(movie_title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Exact value from ``movies['title']``. If several rows share the same
        title, the first matching row is used as the query.
    cosine_sim : 2-D array
        Pairwise similarity matrix whose row/column order matches the row
        order of ``movies``.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles ordered from most to least similar; ties keep the row order of
        ``movies``. The queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``movie_title`` does not occur in ``movies['title']``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Resolve the title to a row *position*: cosine_sim and .iloc are both
    # positional, so index labels must not be used here. Taking the first
    # match also keeps duplicated titles from yielding several rows.
    matches = np.flatnonzero((movies['title'] == movie_title).to_numpy())
    if matches.size == 0:
        raise KeyError(movie_title)
    idx = int(matches[0])

    # Get similarity scores, best first. A stable sort keeps tied movies in
    # their original row order.
    scores = np.asarray(cosine_sim[idx])
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query movie explicitly. Other movies can tie with it at the
    # maximum score, so it is not guaranteed to be the first entry.
    ranked = ranked[ranked != idx][:top_n]
    return movies['title'].iloc[ranked]

# Example: movies whose genres are closest to those of Toy Story
print(recommend_movies("Toy Story (1995)"))


1706                                          Antz (1998)
2355                                   Toy Story 2 (1999)
2809       Adventures of Rocky and Bullwinkle, The (2000)
3000                     Emperor's New Groove, The (2000)
3568                                Monsters, Inc. (2001)
6194                                     Wild, The (2006)
6486                               Shrek the Third (2007)
6948                       Tale of Despereaux, The (2008)
7760    Asterix and the Vikings (Astérix et les Viking...
8219                                         Turbo (2013)
Name: title, dtype: object


In [16]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate

# Build the Surprise dataset from the ratings table.
# The reader is told that ratings range from 0.5 to 5; the frame passed in
# carries the user id, item id and rating columns, in that order.
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(
    ratings.loc[:, ['userId', 'movieId', 'rating']],
    reader,
)



In [17]:
# Train an SVD model
# A fixed seed makes the factor initialisation, and therefore the fitted
# model and its predictions, repeatable across kernel restarts.
model = SVD(random_state=42)

# 5-fold sanity check. The metrics are kept instead of being discarded.
# cross_validate fits `model` on each fold in turn, which is harmless here
# because the model is re-fitted on all ratings just below.
cv_results = cross_validate(model, data, cv=5)

# Train on the full dataset
trainset = data.build_full_trainset()
model.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7bb8243b5750>

In [18]:
# Predict ratings
def predict_rating(user_id, movie_id):
    """Return the estimated rating of ``movie_id`` for ``user_id``.

    Delegates to the module-level ``model`` and returns the ``est`` field of
    the prediction it produces.
    """
    prediction = model.predict(user_id, movie_id)
    return prediction.est

# Example: estimated rating of movie 10 for user 1
print(predict_rating(1, 10))  # Predict for user 1 and movie 10


3.8717284501208793


In [19]:
# Predict ratings
# predict_rating is already defined in the first prediction cell above;
# re-declaring an identical copy here only shadowed it, so just call it.
print(predict_rating(4, 47))

2.861136959396724


In [21]:
# Predict ratings
# predict_rating is already defined in the first prediction cell above;
# re-declaring an identical copy here only shadowed it, so just call it.
print(predict_rating(1, 47))

4.94679984163785


In [22]:
# Predict ratings
# predict_rating is already defined in the first prediction cell above;
# re-declaring an identical copy here only shadowed it, so just call it.
print(predict_rating(1, 1))

4.444786694618074


In [23]:
import copy

from surprise.model_selection import cross_validate

# Perform 5-fold cross-validation
# cross_validate fits the estimator it is given on every fold. Passing `model`
# itself would overwrite the model trained on the full dataset with one fitted
# on only part of the ratings, and that is the object saved afterwards.
# Evaluating a deep copy keeps the same hyper-parameters and leaves `model`
# untouched.
results = cross_validate(copy.deepcopy(model), data, cv=5)

# Print results
print("Cross-validation results:")
print("RMSE:", results['test_rmse'])
print("MAE:", results['test_mae'])


Cross-validation results:
RMSE: [0.87606506 0.87238373 0.88545725 0.8750976  0.86132697]
MAE: [0.67041996 0.66874751 0.67939262 0.67339511 0.66513969]


In [25]:
import joblib

# Save the trained model
# Writes `model` to a file in the current working directory. joblib files are
# pickle-based, so only load them back from a trusted source.
joblib.dump(model, 'movie_recommender_model.joblib')


['movie_recommender_model.joblib']