# Collaborative filtering :

<div>
    <img src="https://editor.analyticsvidhya.com/uploads/460031_9XZYM6B5Ly-ENYTkEtr9dA.png" alt="" style='width: 350px; height:400px;'>
    <p>Collaborative filtering is a recommendation technique that relies on collecting and analyzing
         user interactions, behaviors, or preferences to make predictions and suggest items or content
          that users might find relevant. It assumes that users who have agreed in the past tend to agree
           in the future and recommends items based on similarities between users or items, fostering
            personalized recommendations in various domains such as movies, music, or products.
             Collaborative filtering can be categorized into user-based and item-based methods.</p>
    <ul>
        <li>In theory, user-user and item-item are dual approaches</li>
        <li style="color: red;">In practice, item-item ourperforms user-user in many use cases</li>
        <li>Items are 'simpler' than users 
            <ul>
                <li>Items belong to a small set of 'genres', users have varied tastes</li>
                <li>Item similarity is more meaningful than User simnilarity</li>
            </ul>
        </li>
    </ul>
    <a href="https://www.kaggle.com/code/nadirnourine/multiple-movies-recommander-content-based">You can check my item-item approach in this notebook</a>
</div>

# Import main libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix


In [97]:
metadata= pd.read_csv('/kaggle/input/the-movies-dataset/movies_metadata.csv')
ratings = pd.read_csv('/kaggle/input/the-movies-dataset/ratings.csv')

  metadata= pd.read_csv('movies_metadata/movies_metadata.csv')


# Data cleaning 

In [98]:
metadata.head()
metadata['id'] = pd.to_numeric(metadata['id'], errors='coerce', downcast='integer')

# Drop rows with NaN (invalid entries)
metadata = metadata.dropna(subset=['id'])

ratings= ratings.iloc[:2000000,:]

ratings.head(len(ratings))


Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556
...,...,...,...,...
1999995,20808,69122,4.0,1258758055
1999996,20808,69640,3.0,1248641618
1999997,20808,69757,5.0,1258756389
1999998,20808,70286,4.5,1258756286


In [4]:
print('nb of users : ', len(ratings['userId'].unique()))
print('nb of movies : ', len(ratings['movieId'].unique()))

nb of users :  20808
nb of movies :  23281


In [92]:
ratings_array= np.array(ratings)
ratings_array.shape

(2000000, 6)

# Center ratings 
### Because some users are more clement in their ratings and other not 

In [93]:
# Assuming 'ratings' is your DataFrame with columns: 'user_id', 'movieId', 'rating'
# Extracting unique user and movie IDs
unique_user_ids = ratings['userId'].unique()
unique_movie_ids = ratings['movieId'].unique()
for idx, value in enumerate(unique_user_ids):
    user_indices = np.where(ratings['userId'] == value)
      
    user_mean_rating= np.mean(ratings_array[user_indices][:,2])
    ratings_array[user_indices, 2] = ratings_array[user_indices, 2] - user_mean_rating


In [99]:
ratings['centered_rating']= ratings_array[:,2]
ratings

Unnamed: 0,userId,movieId,rating,timestamp,centered_rating
0,1,110,1.0,1425941529,-3.277778
1,1,147,4.5,1425942435,0.222222
2,1,858,5.0,1425941523,0.722222
3,1,1221,5.0,1425941546,0.722222
4,1,1246,5.0,1425941556,0.722222
...,...,...,...,...,...
1999995,20808,69122,4.0,1258758055,0.821995
1999996,20808,69640,3.0,1248641618,-0.178005
1999997,20808,69757,5.0,1258756389,1.821995
1999998,20808,70286,4.5,1258756286,1.321995


# Create the sparse matrix 

In [100]:

# Creating a mapping of user and movie IDs to integer indices
user_id_to_index = {user_id: index for index, user_id in enumerate(unique_user_ids)}
movie_id_to_index = {movie_id: index for index, movie_id in enumerate(unique_movie_ids)}

# Mapping user and movie IDs to integer indices in the DataFrame
ratings['user_index'] = ratings['userId'].map(user_id_to_index)
ratings['movie_index'] = ratings['movieId'].map(movie_id_to_index)

# Creating a sparse matrix using CSR format
sparse_matrix = csr_matrix((ratings['centered_rating'], (ratings['user_index'], ratings['movie_index'])))

In [101]:
sparse_matrix.shape


(20808, 23281)

# Cosine similarity

In [102]:
similarity=cosine_similarity(sparse_matrix,sparse_matrix)

In [128]:
similarity.shape

(20808, 20808)

# Main 

In [104]:
def get_user_bestMovies(user_id):
   
    user_indices = np.array(np.where(ratings['userId'] == user_id)).ravel()
    best_movies_ = np.where(ratings.iloc[user_indices]['rating']>3)+user_indices[0]
    best_movies_ids=(ratings.iloc[best_movies_.ravel()]['movieId'])

    best_movies = []
    for value in best_movies_ids:
        best_movies_indices = np.where(metadata['id'] == value)
        titles = metadata.iloc[best_movies_indices]['title'].values
        if len(titles) > 0:
            best_movies.append(titles[0])

    return best_movies


In [12]:
def get_simillar_user(user_id):
    indices_descending = np.argsort(similarity[user_id])[::-1]
    return indices_descending

# Get similar users

In [114]:
simillar_users=get_simillar_user(0)
print(simillar_users)

[    0  7932 10407 ...   466 10473 19205]


# Get most similar user best movies

In [123]:
get_user_bestMovies(simillar_users[1])

['Twelve Monkeys',
 'Duel',
 'Backdraft',
 'Shriek If You Know What I Did Last Friday the Thirteenth']