# Hybrid Recommendation: The Basic Personalization Systems

- SVD model + Collaborative Filtering
- build a simple hybrid recommender that brings together techniques we have implemented in the content based and collaborative filter based engines. 

In [95]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

from IPython.display import display, clear_output, HTML
display(HTML(data="""
<style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 65%; }
    div#maintoolbar-container { width: 99%; }
</style>
"""))
import warnings; warnings.simplefilter('ignore')

In [96]:
class hybrid_recomsys:
    
    def __init__(self, metadata_path, links_small_path, ratings_small_path):
        
        self.metadata = pd.read_csv(metadata_path)
        self.ratings_small = pd.read_csv(ratings_small_path)
        self.links_small = pd.read_csv(links_small_path)
        self.id_map = pd.read_csv(links_small_path)[['movieId', 'tmdbId']]
        
    def get_small_metadata(self):
        
        """Function to clean the data and get small dataset from metadata
        
        We need to use small dataset to avoide the expesive computational power"""
        
        self.metadata['vote_count'] = self.metadata[self.metadata['vote_count'].notnull()]['vote_count'].astype('int')
        self.metadata['vote_average'] = self.metadata[self.metadata['vote_average'].notnull()]['vote_average'].astype('int')
        self.metadata = self.metadata.drop([19730, 29503, 35587]) #some incomplete data
        self.metadata['id'] = self.metadata['id'].astype('int')
        self.metadata['year'] = pd.to_datetime(self.metadata['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
        
        self.links_small = self.links_small[self.links_small['tmdbId'].notnull()]['tmdbId'].astype('int')
        
        self.small_data = self.metadata[self.metadata['id'].isin(self.links_small)]
        #print(self.small_data.shape)
        
        self.small_data['tagline'] = self.small_data['tagline'].fillna('')
        self.small_data['description'] = self.small_data['overview'] + self.small_data['tagline']
        self.small_data['description'] = self.small_data['description'].fillna('')
    
    def get_cosine_sim(self):
        
        """Function to callculate cosine similarity based on the  movie description"""
        
        tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
        tfidf_matrix = tf.fit_transform(self.small_data['description'])
        self.cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
        #print(tfidf_matrix.shape)
    
    def get_SVD_model(self):
        
        """"Function to train the SVD model with small data
        
        The model predict rating from userId input, the accuracy indicated by RMSE
        
        """
        
        reader = Reader()
        data = Dataset.load_from_df(self.ratings_small[['userId', 'movieId', 'rating']], reader)

        # Split the dataset into training and testing sets
        trainset, testset = train_test_split(data, test_size=0.2)

        # Initialize the SVD model
        self.svd = SVD()

        # Fit the model to the training data
        self.svd.fit(trainset)

        # Perform cross-validation and evaluate the model
        results = cross_validate(self.svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

        # Access the results (RMSE and MAE for each fold)
        for fold_num in range(5):
            print(f"Fold {fold_num + 1}: RMSE = {results['test_rmse'][fold_num]}, MAE = {results['test_mae'][fold_num]}")

        # Optionally, you can calculate the mean RMSE and MAE across all folds
        mean_rmse = results['test_rmse'].mean()
        mean_mae = results['test_mae'].mean()
        print(f"Mean RMSE across folds: {mean_rmse}")
        print(f"Mean MAE across folds: {mean_mae}")
    
    def convert_int(self, x):
        
        """Function to convert x to int """
        try:
            return int(x)
        except:
            return np.nan
    
    def get_idmap(self):
        """Function to convert x to int """
        self.small_data = self.small_data.reset_index()
        self.indices = pd.Series(self.small_data.index, index=self.small_data['title'])
        
        self.id_map['tmdbId'] = self.id_map['tmdbId'].apply(self.convert_int)
        self.id_map.columns = ['movieId', 'id']
        self.id_map = self.id_map.merge(self.small_data[['title', 'id']], on='id').set_index('title')
        self.indices_map = self.id_map.set_index('id')
    
    def prep_hybrid(self):
        
        """Function to prepared the SVD model and cosin similarity matrix to make hybrid recommedation"""
        
        self.get_small_metadata()
        self.get_cosine_sim()
        self.get_SVD_model()
        self.get_idmap()
    
    def main(self, userId, title, display=10):
        
        
        """Function to make hybrid recommedation
        
        Args:
            userId(int): the user ID
            title(str): the movie that user interested
            
        Return:
            movies(object): the data table, which the recommeded movies
            
        """
        
        idx = self.indices[title]
        tmdbId = self.id_map.loc[title]['id']
        #print(idx)
        movie_id = self.id_map.loc[title]['movieId']

        sim_scores = list(enumerate(self.cosine_sim[int(idx)]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:26]
        movie_indices = [i[0] for i in sim_scores]

        movies = self.small_data.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']]
        movies['est'] = movies['id'].apply(lambda x: self.svd.predict(userId, self.indices_map.loc[x]['movieId']).est)
        movies = movies.sort_values('est', ascending=False)
        
        return movies.head(display)

## Preparing SVD model with Collaborative Filtering

In [97]:
metadata_path = r"C:\Users\MMIL\Panithan\Git_projects\TheMoviesDataset\archive\movies_metadata.csv"
ratings_small_path = r"C:\Users\MMIL\Panithan\Git_projects\TheMoviesDataset\archive\ratings_small.csv"
links_small_path = r"C:\Users\MMIL\Panithan\Git_projects\TheMoviesDataset\archive\links_small.csv"

A1 = hybrid_recomsys(metadata_path, links_small_path, ratings_small_path)
A1.prep_hybrid()

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8979  0.8928  0.9032  0.8961  0.8977  0.8976  0.0033  
MAE (testset)     0.6922  0.6861  0.6976  0.6914  0.6901  0.6915  0.0037  
Fit time          2.81    2.80    2.78    2.69    2.63    2.74    0.07    
Test time         0.32    0.07    0.08    0.07    0.07    0.12    0.10    
Fold 1: RMSE = 0.8979139938296814, MAE = 0.6921971751854891
Fold 2: RMSE = 0.8928495995937311, MAE = 0.6861072540223896
Fold 3: RMSE = 0.903155679302676, MAE = 0.6975651992364367
Fold 4: RMSE = 0.896086334250506, MAE = 0.6914407216604268
Fold 5: RMSE = 0.8977497888881752, MAE = 0.6901100104747441
Mean RMSE across folds: 0.8975510791729538
Mean MAE across folds: 0.6914840721158971


## The hybrid movie recommendersystems:

- **Input:** user ID and the Title of a Movie (The movies that user just finished watched)
- **Output:** Predicted high rating for a movies as a "Similar movies". 
- We sorted those ouput movies on the basis of expected ratings and recommend to the particular user.

In [98]:
A1.main(userId=1, title ='Iron Man')

Unnamed: 0,title,vote_count,vote_average,year,id,est
3274,Empire of the Sun,491.0,7.0,1987,10110,3.256275
6144,Batman Begins,7511.0,7.0,2005,272,3.073342
1772,Married to the Mob,67.0,5.0,1988,2321,3.037424
6055,Hostage,519.0,6.0,2005,2026,2.965814
5511,To End All Wars,42.0,6.0,2001,1783,2.963293
8758,Avengers: Age of Ultron,6908.0,7.0,2015,99861,2.871619
3009,Missing in Action,107.0,5.0,1984,15379,2.855302
5661,Scarface,88.0,7.0,1932,877,2.822287
1648,Return from Witch Mountain,38.0,5.0,1978,14822,2.810076
3842,Baran,19.0,7.0,2001,43774,2.795073


In [99]:
A1.main(userId= 500, title ='Iron Man')

Unnamed: 0,title,vote_count,vote_average,year,id,est
8285,Iron Man 3,8951.0,6.0,2013,68721,3.222865
3500,The Last Dragon,71.0,6.0,1985,13938,3.139356
355,Street Fighter,330.0,4.0,1994,11667,3.106803
3274,Empire of the Sun,491.0,7.0,1987,10110,3.088131
5511,To End All Wars,42.0,6.0,2001,1783,3.072415
3009,Missing in Action,107.0,5.0,1984,15379,3.0493
8758,Avengers: Age of Ultron,6908.0,7.0,2015,99861,3.033825
1735,Stage Fright,61.0,6.0,1950,1978,2.994628
6144,Batman Begins,7511.0,7.0,2005,272,2.991196
1648,Return from Witch Mountain,38.0,5.0,1978,14822,2.980047
