In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

In [8]:
# Load the movie, rating and user tables from CSV files in the working directory.
# The previews below show an extra "Unnamed: 0" column in each table, which
# suggests the files were written with their index included.
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
users = pd.read_csv('users.csv')

In [9]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation Children's Comedy
1,2,Jumanji (1995),Adventure Children's Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama
4,5,Father of the Bride Part II (1995),Comedy


In [10]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [11]:
users.head()

Unnamed: 0,userId,gender,age,occupation,zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


# Item-based collaborative filtering
**Item-based collaborative filtering** is a recommendation technique that measures how similar items are to each other based on the ratings users have given them.

**Step 1**: Find the most similar (the nearest) movies to the movie for which you want to predict the rating.

**Step 2**: Calculate the similarity-weighted average of the ratings the user has given to those most similar movies.

In [12]:
# Reshape the long ratings table into a movieId x userId matrix of ratings.
# (movie, user) pairs that have no rating appear as NaN in the preview below.
movie_user_df = ratings.pivot(index='movieId',columns='userId',values='rating')
movie_user_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,4.0,,4.0,5.0,5.0,...,,4.0,,,4.0,,,,,3.0
2,,,,,,,,,,5.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,1.0,,,,,
4,,,,,,,,3.0,,,...,,,,,2.0,2.0,,,,
5,,,,,,,,,,,...,,,,,1.0,,,,,


In [13]:
movie_user_df.shape #3706 movies and 6040 users 

(3706, 6040)

In [14]:
# Replace missing ratings with 0. From here on 0 is the "not rated" marker:
# the prediction function below tests `== 0` to find the movies a user has
# not rated, so a genuine rating of 0 could not be told apart from "missing".
# Reassigning (instead of inplace=True) keeps the cell a plain expression of
# its input and avoids mutating a frame an earlier cell already displayed.
movie_user_df = movie_user_df.fillna(0)
movie_user_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,5.0,5.0,...,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,3.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [15]:
def predict_rating_for_movies_that_user_not_watched(df, user_id, number_neighbors):
    """Predict ratings for every movie the given user has not rated.

    For each row (movie) whose entry in the user's column is 0, the nearest
    rows under cosine distance are looked up and the prediction is the
    similarity-weighted average of the user's non-zero ratings among them:

        sum(similarity_i * rating_i) / sum(similarity_i)

    Parameters
    ----------
    df : pandas.DataFrame
        Movie x user rating matrix; 0 is treated as "not rated".
    user_id : label
        Column label of the user to predict for.
    number_neighbors : int
        Number of nearest rows requested from the KNN search. The movie
        itself counts towards this number, so ``number_neighbors - 1`` other
        movies contribute to each prediction. The value is capped at the
        number of rows in ``df``. (This argument used to be overwritten with
        10 inside the function; it is now honoured.)

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which the user's 0 entries are replaced by the
        predicted rating rounded to one decimal (0 when no neighbour carries
        a usable rating). ``df`` itself is not modified.

    Raises
    ------
    ValueError
        If ``user_id`` is not one of ``df.columns``.
    """
    # Resolve the user's column first so an unknown user fails before the
    # (comparatively expensive) neighbour search runs.
    user_index = df.columns.tolist().index(user_id)

    ratings_matrix = df.values
    user_ratings = ratings_matrix[:, user_index]

    # kneighbors() rejects n_neighbors larger than the number of fitted rows.
    n_neighbors = min(number_neighbors, ratings_matrix.shape[0])

    knn = NearestNeighbors(metric='cosine', algorithm='brute')
    knn.fit(ratings_matrix)
    distances, indices = knn.kneighbors(ratings_matrix, n_neighbors=n_neighbors)

    # Work on a float copy of the user's column; rated entries stay untouched.
    predictions = np.array(user_ratings, dtype=float)

    for row in np.flatnonzero(user_ratings == 0):
        neighbor_rows = indices[row]
        similarity = 1 - distances[row]

        # A movie is usually returned as its own nearest neighbour; drop it.
        # If it is not in the result, drop the farthest neighbour instead so
        # that exactly n_neighbors - 1 candidates remain either way.
        self_position = np.flatnonzero(neighbor_rows == row)
        drop_at = self_position[0] if self_position.size else n_neighbors - 1
        neighbor_rows = np.delete(neighbor_rows, drop_at)
        similarity = np.delete(similarity, drop_at)

        # Only neighbours the user actually rated take part in the average.
        neighbor_ratings = user_ratings[neighbor_rows]
        rated = neighbor_ratings != 0
        weight_total = similarity[rated].sum()

        if weight_total > 0:
            predicted_r = np.dot(similarity[rated], neighbor_ratings[rated]) / weight_total
        else:
            # No rated neighbour, or all rated neighbours have zero similarity.
            predicted_r = 0.0

        predictions[row] = round(predicted_r, 1)

    df1 = df.copy()
    df1.iloc[:, user_index] = predictions
    return df1

In [16]:
def recommend_movies_by_userId(df, user, num_recommended_movies, number_neighbors=10):
    """Print the movies a user has rated and the top predicted unrated movies.

    Parameters
    ----------
    df : pandas.DataFrame
        Movie x user rating matrix; 0 is treated as "not rated".
    user : label
        Column label of the user to recommend for.
    num_recommended_movies : int
        How many of the highest-predicted movies to print.
    number_neighbors : int, optional
        Neighbour count forwarded to
        ``predict_rating_for_movies_that_user_not_watched`` (default 10).

    Returns
    -------
    None
        The function only prints; movies are identified by their index label.
    """
    df1 = predict_rating_for_movies_that_user_not_watched(df, user, number_neighbors)

    user_ratings = df[user]

    print('The list of the Movies {} Has Watched \n'.format(user))
    list_of_movies = user_ratings[user_ratings > 0].index.tolist()
    print(list_of_movies)
    print('\n')

    # (movie label, predicted rating) for every movie the user has not rated,
    # taken in one pass instead of a list.index() lookup per movie.
    unrated_mask = (user_ratings == 0).to_numpy()
    recommended_movies = list(df1.loc[unrated_mask, user].items())

    # sorted() is stable, so movies with equal predictions keep index order.
    sorted_recommended_movies = sorted(recommended_movies, key=lambda pair: pair[1], reverse=True)

    print('The list of the Recommended Movies \n')
    top_movies = sorted_recommended_movies[:num_recommended_movies]
    for rank, (movie_id, predicted_rating) in enumerate(top_movies, start=1):
        print('{}: {} - predicted rating:{}'.format(rank, movie_id, predicted_rating))

In [17]:
recommend_movies_by_userId(movie_user_df, 500, 10) # 500: user_id, 10: number of recommended movies

The list of the Movies 500 Has Watched 

[17, 34, 36, 50, 126, 262, 296, 318, 337, 364, 497, 527, 531, 546, 551, 588, 594, 595, 608, 661, 720, 745, 899, 914, 918, 919, 953, 954, 971, 1022, 1028, 1029, 1030, 1031, 1032, 1035, 1073, 1081, 1088, 1097, 1148, 1172, 1196, 1197, 1213, 1220, 1223, 1278, 1282, 1380, 1381, 1449, 1480, 1654, 1704, 1951, 2012, 2014, 2017, 2046, 2052, 2078, 2081, 2083, 2084, 2087, 2088, 2099, 2108, 2162, 2243, 2324, 2357, 2396, 2463, 2565, 2571, 2599, 2657, 2692, 2700, 2709, 2746, 2762, 2804, 2857, 2858, 2863, 2946, 2997, 3396, 3408, 3429, 3461, 3504, 3671, 3712, 3751, 3795, 3897, 3948]


The list of the Recommended Movies 

1: 2 - predicted rating:5.0
2: 21 - predicted rating:5.0
3: 28 - predicted rating:5.0
4: 164 - predicted rating:5.0
5: 181 - predicted rating:5.0
6: 198 - predicted rating:5.0
7: 222 - predicted rating:5.0
8: 261 - predicted rating:5.0
9: 302 - predicted rating:5.0
10: 314 - predicted rating:5.0


In [18]:
import pickle

# Persist the zero-filled movie x user matrix. The `with` block guarantees
# the file is flushed and closed even if pickling fails.
with open('moviesId_userId.pkl', 'wb') as pkl_file:
    pickle.dump(movie_user_df, pkl_file)

In [19]:
# Read the matrix back from disk; `with` closes the file handle afterwards.
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that were produced by a trusted source such as this notebook.
with open('moviesId_userId.pkl', 'rb') as pkl_file:
    movie_user = pickle.load(pkl_file)

In [20]:
movie_user

userId,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,5.0,5.0,...,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,3.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3949,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3950,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
import movieposters as mp
# Look up the poster for a single title; the returned value is shown in the next cell.
link = mp.get_poster(title='Braveheart (1995)')

In [27]:
link

'https://m.media-amazon.com/images/M/MV5BMzkzMmU0YTYtOWM3My00YzBmLWI0YzctOGYyNTkwMWE5MTJkXkEyXkFqcGdeQXVyNzkwMjQ5NzM@._V1_QL75_UY562_CR1,0,380,562_.jpg'

In [24]:
def retrieve_movie_name_genres(movie_id, movies_df):
    """Return ``(title, genres)`` for the first row whose movieId equals movie_id.

    ``movies_df`` must provide the columns 'movieId', 'title' and 'genres'.
    An IndexError is raised when no row matches ``movie_id``.
    """
    matching_rows = movies_df[movies_df['movieId'] == movie_id]
    # .values[0] picks the first match and raises IndexError on an empty selection.
    return tuple(matching_rows[field].values[0] for field in ('title', 'genres'))

In [25]:
retrieve_movie_name_genres(1, movies)

('Toy Story (1995)', "Animation Children's Comedy")

In [29]:
# Placeholder image stored for titles whose poster lookup fails.
DEFAULT_POSTER_URL = "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRoWcWg0E8pSjBNi0TtiZsqu8uD2PAr_K11DA&usqp=CAU"

# Look up one poster URL per title and collect them in row order.
poster_urls = []
for title in movies['title']:
    try:
        poster_urls.append(mp.get_poster(title=title))
    except Exception:
        # Best effort: any lookup failure falls back to the placeholder.
        # Catching Exception (not a bare `except:`) keeps KeyboardInterrupt
        # working, so this long-running loop can still be stopped.
        poster_urls.append(DEFAULT_POSTER_URL)

# Assign the whole column at once instead of writing cell by cell with .loc.
movies['poster_id'] = poster_urls

In [30]:
movies.head()

Unnamed: 0,movieId,title,genres,poster_id
0,1,Toy Story (1995),Animation Children's Comedy,https://m.media-amazon.com/images/M/MV5BMDU2ZW...
1,2,Jumanji (1995),Adventure Children's Fantasy,https://m.media-amazon.com/images/M/MV5BZTk2Zm...
2,3,Grumpier Old Men (1995),Comedy Romance,https://m.media-amazon.com/images/M/MV5BMjQxM2...
3,4,Waiting to Exhale (1995),Comedy Drama,https://m.media-amazon.com/images/M/MV5BYzcyMD...
4,5,Father of the Bride Part II (1995),Comedy,https://m.media-amazon.com/images/M/MV5BOTEyNz...


In [31]:
movies.to_csv('movies_with_posters.csv',index=False)

In [32]:
def retrieve_movie_name_genres(movie_id, movies_df):
    """Return ``(title, genres, poster_id)`` for the first row matching movie_id.

    ``movies_df`` must provide the columns 'movieId', 'title', 'genres' and
    'poster_id'. An IndexError is raised when no row matches ``movie_id``.

    Note: an earlier cell defines a function with the same name that returns
    only ``(title, genres)``; once this cell runs, this definition shadows it.
    """
    selected = movies_df.loc[movies_df['movieId'] == movie_id]

    def first_value(column):
        # First matching entry; IndexError if the selection is empty.
        return selected[column].values[0]

    movie_name = first_value('title')
    movie_genres = first_value('genres')
    posters = first_value('poster_id')
    return movie_name, movie_genres, posters

In [33]:
retrieve_movie_name_genres(1, movies)

('Toy Story (1995)',
 "Animation Children's Comedy",
 'https://m.media-amazon.com/images/M/MV5BMDU2ZWJlMjktMTRhMy00ZTA5LWEzNDgtYmNmZTEwZTViZWJkXkEyXkFqcGdeQXVyNDQ2OTk4MzI@._V1_QL75_UX380_CR0,2,380,562_.jpg')

In [34]:
movies["poster_id"].value_counts()

https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRoWcWg0E8pSjBNi0TtiZsqu8uD2PAr_K11DA&usqp=CAU                                                        3109
https://m.media-amazon.com/images/M/MV5BZWU5NWQ2MWMtYmI4Ni00Yjk5LTk3ODktMDVlZGViNGUwN2M5XkEyXkFqcGdeQXVyNTAyODkwOQ@@._V1_QL75_UX380_CR0,15,380,562_.jpg       3
https://m.media-amazon.com/images/M/MV5BYmVhNmFmOGYtZjgwNi00ZGQ0LThiMmQtOGZjMDUzNzJhMGIzXkEyXkFqcGdeQXVyMTMxODk2OTU@._V1_QL75_UY562_CR1,0,380,562_.jpg        2
https://m.media-amazon.com/images/M/MV5BMDU2ZWJlMjktMTRhMy00ZTA5LWEzNDgtYmNmZTEwZTViZWJkXkEyXkFqcGdeQXVyNDQ2OTk4MzI@._V1_QL75_UX380_CR0,2,380,562_.jpg        1
https://m.media-amazon.com/images/M/MV5BYjM0N2ViMzUtMTc1OS00YmEzLWE2NWYtNjU5NTY4NjRlOTI0XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_QL75_UX380_CR0,2,380,562_.jpg        1
                                                                                                                                                           ... 
https://m.media-amazon.com/images/M/MV5B