In [43]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_absolute_error, mean_squared_error,mean_absolute_percentage_error
from scipy.spatial.distance import cosine

In [44]:
views = pd.read_csv('movies_views.csv' ,low_memory=False)
ratings = pd.read_csv('movies_ratings.csv',low_memory=False)
movies = pd.read_csv('movies_movie.csv',low_memory=False)

movies.drop(['rating','portrait','landscape'],inplace=True, axis=1)
ratings.drop('rorb_id',inplace=True, axis=1)
ratings.rename(columns = {'ron':'m_id'}, inplace = True)

In [45]:
movies.head()

Unnamed: 0,m_id,m_name,release_year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [46]:
ratings.head()

Unnamed: 0,rating,rby,m_id
0,8,1.0,1.0
1,8,1.0,3.0
2,8,1.0,6.0
3,10,1.0,42.0
4,10,1.0,45.0


In [47]:
views.head()

Unnamed: 0,m_id,sum,count
0,1,1686,215
1,2,755,110
2,3,339,52
3,4,33,7
4,5,301,49


In [48]:

ratings.rename(columns = {'rating':'centered_rating'}, inplace = True)
ratings.head()

Unnamed: 0,centered_rating,rby,m_id
0,8,1.0,1.0
1,8,1.0,3.0
2,8,1.0,6.0
3,10,1.0,42.0
4,10,1.0,45.0


In [49]:
msc = pd.merge(movies,views,on='m_id')
msc['average_rating'] = msc['sum'] / msc['count']
msc

Unnamed: 0,m_id,m_name,release_year,sum,count,average_rating
0,1,Toy Story,1995,1686,215,7.841860
1,2,Jumanji,1995,755,110,6.863636
2,3,Grumpier Old Men,1995,339,52,6.519231
3,4,Waiting to Exhale,1995,33,7,4.714286
4,5,Father of the Bride Part II,1995,301,49,6.142857
...,...,...,...,...,...,...
8757,8758,Black Butler: Book of the Atlantic,2017,8,1,8.000000
8758,8759,No Game No Life: Zero,2017,7,1,7.000000
8759,8760,Flint,2017,7,1,7.000000
8760,8761,Bungo Stray Dogs: Dead Apple,2018,7,1,7.000000


In [50]:
popularity_threshold = 25
rating_popular_movie= msc.query('count >= @popularity_threshold')
rating_popular_movie

Unnamed: 0,m_id,m_name,release_year,sum,count,average_rating
0,1,Toy Story,1995,1686,215,7.841860
1,2,Jumanji,1995,755,110,6.863636
2,3,Grumpier Old Men,1995,339,52,6.519231
4,5,Father of the Bride Part II,1995,301,49,6.142857
5,6,Heat,1995,805,102,7.892157
...,...,...,...,...,...,...
8228,8229,"Big Short, The",2015,206,26,7.923077
8285,8286,Zootopia,2016,249,32,7.781250
8439,8440,Arrival,2016,207,26,7.961538
8477,8478,Rogue One: A Star Wars Story,2016,212,27,7.851852


In [51]:
df = pd.merge(rating_popular_movie,ratings,on='m_id')
df.drop('release_year', inplace=True, axis=1)
df

Unnamed: 0,m_id,m_name,sum,count,average_rating,centered_rating,rby
0,1,Toy Story,1686,215,7.84186,8,1.0
1,1,Toy Story,1686,215,7.84186,8,5.0
2,1,Toy Story,1686,215,7.84186,9,7.0
3,1,Toy Story,1686,215,7.84186,5,15.0
4,1,Toy Story,1686,215,7.84186,9,17.0
...,...,...,...,...,...,...,...
56306,8506,Logan,214,25,8.56000,8,567.0
56307,8506,Logan,214,25,8.56000,10,586.0
56308,8506,Logan,214,25,8.56000,10,596.0
56309,8506,Logan,214,25,8.56000,7,599.0


In [52]:
user_movie_matrix = df.pivot(index='rby', columns='m_id', values='centered_rating')
user_movie_train_matrix=None
user_movie_matrix

m_id,1,2,3,5,6,7,10,11,16,17,...,7792,7796,7967,7987,8074,8229,8286,8440,8478,8506
rby,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,8.0,,8.0,,8.0,,,,,,...,,,,,,,,,,
2.0,,,,,,,,,,,...,,,,,,,,,,
3.0,,,,,,,,,,,...,,,,,,,,,,
4.0,,,,,,,,,,,...,,,,,,,,,,
5.0,8.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
608.0,5.0,4.0,4.0,,,,8.0,,9.0,,...,,,,,,,,,,
609.0,6.0,,,,,,8.0,,,,...,,,,,,,,,,
610.0,10.0,,,,10.0,,,,9.0,,...,6.0,,7.0,7.0,9.0,8.0,8.0,10.0,8.0,10.0
611.0,,,,,,,,,,,...,,,,,,,,,,


In [53]:
def compute_cosine_similarity(user1, user2):
    common_items = user_movie_matrix.loc[user1].dropna().index.intersection(user_movie_train_matrix.loc[user2].dropna().index)
    if len(common_items) == 0:
        return 0
    else:
        user1_ratings = user_movie_train_matrix.loc[user1][common_items]
        user2_ratings = user_movie_train_matrix.loc[user2][common_items]
        return 1 - cosine(user1_ratings, user2_ratings)

In [54]:
def find_k_nearest_neighbors(user, k):
    similarities = []
    for other_user in user_movie_train_matrix.index:
        if other_user != user:
            similarity = compute_cosine_similarity(user, other_user)
            similarities.append((other_user, similarity))
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:k]

In [55]:
def user_based_recommendation(user, k, rm):
    if user not in user_movie_matrix.index:
        return "No recommendations available for this user"
    user_ratings = user_movie_matrix.loc[user].dropna()
    if len(user_ratings) == 0:
        return "This user has not rated any movies"
    global user_movie_train_matrix
    user_movie_train_matrix = user_movie_matrix.copy()
    train_rated_movies = user_movie_train_matrix.loc[user].dropna()
    num = int(0.2 * len(train_rated_movies))
    np.random.seed(42)
    movies_to_remove = np.random.choice(train_rated_movies.index, size=num, replace=False)
    
    user_movie_train_matrix.loc[user, movies_to_remove] = np.nan
    
    test_id = movies_to_remove.tolist()
    user_df = df[(df['rby'] == user) & (df['m_id'].isin(test_id))]
    actual_r = user_df[["m_id", "centered_rating"]].reset_index(drop=True)
    
    neighbors = find_k_nearest_neighbors(user, k)
    neighbor_ratings = []
    neighbor_similarity_sum =[]
    for neighbor in neighbors:
        neighbor_user_id = neighbor[0]
        neighbor_similarity = neighbor[1]
        neighbor_ratings.append(user_movie_train_matrix.loc[neighbor_user_id].fillna(0) * neighbor_similarity)
        neighbor_similarity_sum.append(user_movie_train_matrix.loc[neighbor_user_id].notnull() * abs(neighbor_similarity))
        
    neighbor_ratings_df = pd.concat(neighbor_ratings, axis=1).sum(axis=1)

    neighbor_sum_df = pd.concat(neighbor_similarity_sum, axis=1).sum(axis=1)

    
    zz = pd.DataFrame({'m_id': neighbor_ratings_df.index, 'rating_s': neighbor_ratings_df.values})
    xx = pd.DataFrame({'m_id': neighbor_sum_df.index, 'similarity_sum': neighbor_sum_df.values})
    xz = pd.merge(zz,xx,on='m_id')
    
    xz['rating'] = round(xz['rating_s']/xz['similarity_sum'])
    xz.drop(['similarity_sum','rating_s'],inplace=True, axis=1)

    xz=xz.dropna()
    
    estimated_r=xz.copy()

    merged_ratings = pd.merge(actual_r, estimated_r, on='m_id',  how='inner')
    
    if merged_ratings.shape[0]>0:
        mae = mean_absolute_error(merged_ratings['centered_rating'], merged_ratings['rating'])
        mape = mean_absolute_percentage_error(merged_ratings['centered_rating'], merged_ratings['rating'])
    else:
        mae="not available"
        mape="not available"

    user_ratings = user_movie_train_matrix.loc[user].dropna()
    
    unrated_movies = user_movie_matrix.columns.difference(user_ratings.index).tolist()

    if len(unrated_movies) == 0:
        return "This user has rated all movies"
    if neighbor_ratings_df[unrated_movies].empty:
        return "There are no unrated movies left to recommend"
    recommended_movie = xz[xz['m_id'].isin(unrated_movies)].sort_values('rating',ascending=False)['m_id'].tolist()
    
    recommended_movie=recommended_movie[:rm]
    tp=len(list(set(recommended_movie).intersection(test_id)))

    print("no. of nearest neighbour :",len(neighbors))
    print("No. of recommended movies :",len(recommended_movie))
    print("TP :",tp)
    print("no. of Movies watched in test :",len(test_id))
    print("Precison :",tp/len(recommended_movie))
    print("recall :",tp/len(test_id))
    print("MAE :",mae)
    print("MAPE :",mape)
    recommended_movie=recommended_movie[:5]
    movie_id_to_name = dict(zip(movies['m_id'], df['m_name']))
    movie_names = [movie_id_to_name[m_id] for m_id in recommended_movie]
    print("User-Neighborhood-Based Recommendation:")
    return movie_names

In [56]:
print(user_based_recommendation(123, 15,60))

no. of nearest neighbour : 15
No. of recommended movies : 60
TP : 1
no. of Movies watched in test : 8
Precison : 0.016666666666666666
recall : 0.125
MAE : 1.1428571428571428
MAPE : 0.14200680272108843
User-Neighborhood-Based Recommendation:
['Nell ', 'Apollo 13 ', 'Casino ', 'Twelve Monkeys (a.k.a. 12 Monkeys) ', 'Heat ']
