# Recommendation System for Movies

This notebook contains the Data Cleaning for the Capstone Project regarding a Recommendation System for Movies. 
- This notebook serves to read and join the tables from different data set. 
- The reader is expected to run the codes and ty to understand the contents of the data files. 



## Loading Packages

In [7]:
import sys
# data analysis stack
import numpy as np
import pandas as pd
import os

# data visualization stack
import matplotlib.pyplot as plt
%matplotlib inline
# import seaborn as sns
# sns.set_style('whitegrid')

# miscellaneous
import warnings
warnings.filterwarnings("ignore")

# Loading Datasets 
First dataset is from: https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata/data

In [8]:
df_links = pd.read_csv(r'C:\Users\shabn\neuefische\ds-capstone-recommendation\data\ml-latest-small\links.csv')
df_movies = pd.read_csv(r'C:\Users\shabn\neuefische\ds-capstone-recommendation\data\ml-latest-small\movies.csv')
df_ratings = pd.read_csv(r'C:\Users\shabn\neuefische\ds-capstone-recommendation\data\ml-latest-small\ratings.csv')
df_tags = pd.read_csv(r'C:\Users\shabn\neuefische\ds-capstone-recommendation\data\ml-latest-small\tags.csv')




# Data Exploration

In [9]:
# Display information and first few rows for each table (links)
print("Links Table")
print(df_links.info())
print(df_links.head())


Links Table
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB
None
   movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  31357.0
4        5  113041  11862.0


In [10]:
# Display information and first few rows for each table (movies)
print("Movies Table")
print(df_movies.info())
print(df_movies.head())


Movies Table
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None
   movieId                               title   
0        1                    Toy Story (1995)  \
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [11]:
# Display information and first few rows for each table (ratings)
print("Ratings Table")
print(df_ratings.info())
print(df_ratings.head())


Ratings Table
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
None
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [12]:
# Display information and first few rows for each table (tags)
print("Tags Table")
print(df_tags.info())
print(df_tags.head())


Tags Table
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB
None
   userId  movieId              tag   timestamp
0       2    60756            funny  1445714994
1       2    60756  Highly quotable  1445714996
2       2    60756     will ferrell  1445714992
3       2    89774     Boxing story  1445715207
4       2    89774              MMA  1445715200


# Data cleaning 

In [13]:
# Convert timestamp to datetime in table rankings
if 'timestamp' in df_ratings.columns:
    df_ratings['timestamp'] = pd.to_datetime(df_ratings['timestamp'], unit='s')

print(df_ratings.head())


   userId  movieId  rating           timestamp
0       1        1     4.0 2000-07-30 18:45:03
1       1        3     4.0 2000-07-30 18:20:47
2       1        6     4.0 2000-07-30 18:37:04
3       1       47     5.0 2000-07-30 19:03:35
4       1       50     5.0 2000-07-30 18:48:51


In [14]:
# Convert timestamp to datetime in table tags
if 'timestamp' in df_tags.columns:
    df_tags['timestamp'] = pd.to_datetime(df_tags['timestamp'], unit='s')

print(df_tags.head())

   userId  movieId              tag           timestamp
0       2    60756            funny 2015-10-24 19:29:54
1       2    60756  Highly quotable 2015-10-24 19:29:56
2       2    60756     will ferrell 2015-10-24 19:29:52
3       2    89774     Boxing story 2015-10-24 19:33:27
4       2    89774              MMA 2015-10-24 19:33:20


# Decide on a Model type

## Preparation for User-Item Matrix for collaborative filtering models


In [31]:
user_item_matrix = df_ratings.pivot_table(index='userId', columns='movieId', values='rating')

In [32]:
# Install and Import Surprise
!pip install scikit-surprise



### Importing the respective packages

In [17]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy

In [18]:
# Prepare the data for Surprise
reader = Reader(rating_scale=(df_ratings['rating'].min(), df_ratings['rating'].max()))
data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)


In [19]:
# Train-test split
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [20]:
# Train the SVD model
model = SVD()
model.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x180bf6d7650>

In [21]:
# Make predictions and evaluate
predictions = model.test(testset)
rmse = accuracy.rmse(predictions)
print("RMSE: {rmse}")

RMSE: 0.8811
RMSE: {rmse}


#### User-Based Collaborative Filtering using Cosine Similarity

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

In [23]:
# Fill NaNs with zeros for similarity calculation
user_item_matrix_filled = user_item_matrix.fillna(0)

# Calculate cosine similarity
user_similarity = cosine_similarity(user_item_matrix_filled)


#### Content-Based Filtering Model

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
# Vectorize the genres column
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_movies['genres'])

# Calculate similarity
movie_similarity = cosine_similarity(tfidf_matrix)

## Set up the data for NMF for the baseline model

In [26]:
import pandas as pd
from sklearn.decomposition import NMF

In [27]:
# Fill missing vlaues with 0 (we can still decide on another imputation strategies)
user_item_matrix =df_ratings.pivot_table(index='userId', columns='movieId', values='rating')

# Starting again with my own ratings

In [7]:
# To get a list of movies in the dataset, loading the movies dataframe

import pandas as pd
from datetime import datetime

In [8]:
df_movies = pd.read_csv(r'C:\Users\shabn\neuefische\ds-capstone-recommendation\data\ml-latest-small\movies.csv')

In [9]:
movie_list = df_movies[['movieId', 'title']]
print(movie_list)

      movieId                                      title
0           1                           Toy Story (1995)
1           2                             Jumanji (1995)
2           3                    Grumpier Old Men (1995)
3           4                   Waiting to Exhale (1995)
4           5         Father of the Bride Part II (1995)
...       ...                                        ...
9737   193581  Black Butler: Book of the Atlantic (2017)
9738   193583               No Game No Life: Zero (2017)
9739   193585                               Flint (2017)
9740   193587        Bungo Stray Dogs: Dead Apple (2018)
9741   193609        Andrew Dice Clay: Dice Rules (1991)

[9742 rows x 2 columns]


In [11]:
!pip install openpyx1

ERROR: Could not find a version that satisfies the requirement openpyx1 (from versions: none)
ERROR: No matching distribution found for openpyx1


In [10]:
# Save to Excel for better formatting
df_movies[['movieId', 'title']].to_excel('all_movies.xlsx', index=False)

ModuleNotFoundError: No module named 'openpyxl'

In [9]:
# 
my_ratings = [
    {'userId': 9999, 'movieId': 2, 'rating': 4.8, 'timestamp': int(datetime.now().timestamp()) },
    {'userId': 9999, 'movieId': 1101, 'rating': 5, 'timestamp': int(datetime.now().timestamp()) },
    {'userId': 9999, 'movieId': 1370, 'rating': 4.5, 'timestamp': int(datetime.now().timestamp()) },
    {'userId': 9999, 'movieId': 1515, 'rating': 4.9, 'timestamp': int(datetime.now().timestamp()) },
    {'userId': 9999, 'movieId': 1580, 'rating': 5, 'timestamp': int(datetime.now().timestamp()) },
    {'userId': 9999, 'movieId': 1722, 'rating': 4.5, 'timestamp': int(datetime.now().timestamp()) },
    {'userId': 9999, 'movieId': 1831, 'rating': 4.7, 'timestamp': int(datetime.now().timestamp()) },
    {'userId': 9999, 'movieId': 1858, 'rating': 4.2, 'timestamp': int(datetime.now().timestamp()) },
    {'userId': 9999, 'movieId': 62374, 'rating': 4.5, 'timestamp': int(datetime.now().timestamp()) },
    {'userId': 9999, 'movieId': 67923, 'rating': 4.6, 'timestamp': int(datetime.now().timestamp()) }
]

# Neighbourhood-base collaborative filtering

In [None]:
# Dataset Loading and Preprocessing
from surprise import Dataset          # Allows loading and handling of datasets
from surprise import Reader           # Helps define the format of custom datasets

# Built-in Collaborative Filtering Algorithms
from surprise import KNNBasic         # Basic k-nearest neighbors algorithm
from surprise import KNNWithMeans     # KNN that takes mean ratings into account for predictions
from surprise import KNNWithZScore    # KNN that uses z-score normalization for ratings
from surprise import KNNBaseline      # KNN with a baseline approach for handling bias

# Matrix Factorization Algorithms
from surprise import SVD              # Singular Value Decomposition
from surprise import SVDpp            # SVD++ for implicit feedback
from surprise import NMF              # Non-Negative Matrix Factorization

# Baseline and Simpler Models
from surprise import BaselineOnly     # Predicts based on baseline estimates
from surprise import NormalPredictor   # Random ratings based on normal distribution
from surprise import CoClustering      # Co-clustering algorithm for recommendation

# Model Evaluation
from surprise import accuracy          # Measures accuracy (e.g., RMSE, MAE)
from surprise.model_selection import cross_validate  # Cross-validation for model evaluation
from surprise.model_selection import train_test_split # Splits data into train/test sets
from surprise.model_selection import GridSearchCV    # Hyperparameter tuning

# Similarity Computation Options
from surprise import similarities      # Utility to specify similarity measures (e.g., cosine, MSD)


