In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/movielens-20m-dataset/rating.csv
/kaggle/input/movielens-20m-dataset/link.csv
/kaggle/input/movielens-20m-dataset/genome_tags.csv
/kaggle/input/movielens-20m-dataset/genome_scores.csv
/kaggle/input/movielens-20m-dataset/tag.csv
/kaggle/input/movielens-20m-dataset/movie.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

In [3]:

# Load movie and rating data
movies = pd.read_csv('/kaggle/input/movielens-20m-dataset/movie.csv')
ratings = pd.read_csv('/kaggle/input/movielens-20m-dataset/rating.csv')


In [4]:
# Handle missing data
movies['genres'] = movies['genres'].fillna('')

# Convert genres to numerical features using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(movies['genres'])

# Compute similarity between movies
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [5]:
def get_content_based_recommendations(movie_title, df, cosine_sim):
    if movie_title not in df['title'].values:
        return "Movie not found in the database."
    idx = df.index[df['title'] == movie_title].tolist()[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:6]
    movie_indices = [i[0] for i in sim_scores]
    return df['title'].iloc[movie_indices]

# Prepare data for collaborative filtering model
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2)

In [7]:
# Train SVD model
model = SVD()
model.fit(trainset)
train_predictions = model.test(trainset.build_testset())
test_predictions = model.test(testset)

print("Training MAE:", accuracy.mae(train_predictions))
print("Testing MAE:", accuracy.mae(test_predictions))

MAE:  0.5161
Training MAE: 0.5161495903277442
MAE:  0.5981
Testing MAE: 0.5980647832845137


In [8]:
def recommend_movies_for_user(user_id, model, df, n=5):
    watched_movies = df[df['userId'] == user_id]['movieId'].unique()
    all_movies = df['movieId'].unique()
    unwatched_movies = [m for m in all_movies if m not in watched_movies]
    predictions = [(m, model.predict(user_id, m).est) for m in unwatched_movies]
    recommendations = sorted(predictions, key=lambda x: x[1], reverse=True)[:n]
    movie_names = movies[movies['movieId'].isin([rec[0] for rec in recommendations])]['title']
    return movie_names.tolist()

# Test recommendations
movie_name = movies['title'].sample(1).values[0]
print(f"Movies similar to {movie_name}:", get_content_based_recommendations(movie_name, movies, cosine_sim))

user_id = ratings['userId'].sample(1).values[0]
print(f"Recommended movies for user {user_id}:", recommend_movies_for_user(user_id, model, ratings))


Movies similar to Torrente, el brazo tonto de la ley (1998): 749               Striptease (1996)
851                  Carpool (1996)
893         Some Like It Hot (1959)
933            Thin Man, The (1934)
1057    Fish Called Wanda, A (1988)
Name: title, dtype: object
Recommended movies for user 18540: ['Leaving Las Vegas (1995)', '2001: A Space Odyssey (1968)', 'Clockwork Orange, A (1971)', 'Shining, The (1980)', "Pan's Labyrinth (Laberinto del fauno, El) (2006)"]


# This is just a simple RECOMMENDATION SYSTEM project and it needs a lot of development but I can't develop it because I have exams at university and Kaggle is very slow (if you read my code to learn you should know that it is just a basic project)