In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:
df = movies.merge(ratings, on='movieId')
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   movieId    100836 non-null  int64  
 1   title      100836 non-null  object 
 2   genres     100836 non-null  object 
 3   userId     100836 non-null  int64  
 4   rating     100836 non-null  float64
 5   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 4.6+ MB


In [11]:
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
def recommend (active_uid, df):
    selected_df = df[['userId', 'movieId', 'rating']]
    active_u_rec = selected_df[selected_df['userId'] == active_uid]
    #print(active_u_rec)
    active_seen_movies = active_u_rec.movieId.tolist()
    users_rec = selected_df[selected_df['movieId'].isin(active_seen_movies)]
    users_rec = users_rec[users_rec['userId']!=active_uid]
    users_rec_group = users_rec.groupby(['userId'])
    sorted_user_group = sorted(users_rec_group, reverse=True, key=lambda x: len(x[1]))
    sorted_user_group = sorted_user_group[:100]
    
    user_similarity_scores = {}
    for u_id, group in sorted_user_group:
        active_user = active_u_rec[active_u_rec['movieId'].isin(group['movieId'])]
        active_user_ratings = active_user['rating'].tolist()
        group_ratings = group['rating'].tolist()
        sim_score = cosine_similarity([active_user_ratings], [group_ratings])
        user_similarity_scores[u_id] = sim_score[0][0]
        
    similarity_df = pd.DataFrame.from_dict(user_similarity_scores, orient = 'index', columns =['similarity'])
    similarity_df = similarity_df.sort_values(['similarity'], ascending = False)
    top_similar_users = similarity_df.iloc[:50]
    #top_similar_users['userId'] = top_similar_users.index
    top_similar_users=top_similar_users.reset_index()
    top_similar_users=top_similar_users.rename(columns={'index': 'userId'})

        
     #print(top_similar_users)
    top_user_ratings = top_similar_users.merge(ratings, on='userId')
    top_user_ratings['weighted_ratings'] = top_user_ratings['rating'] * top_user_ratings['similarity']
    group_ratings = top_user_ratings.groupby(['movieId']).sum()[['similarity','weighted_ratings']]
    group_ratings['avg_weighted_rating'] = group_ratings['weighted_ratings']/group_ratings['similarity']
    group_ratings = group_ratings.sort_values(['avg_weighted_rating'], ascending = False)
    recommendations = group_ratings[~group_ratings.index.isin(active_u_rec['movieId'])]
    #print(recommendations)
    top_20 = recommendations.iloc[:20]
    #print(top_20)
    top_20  = movies[movies['movieId'].isin(top_20.index)][['movieId','title']]
    #print(recommendations.loc[top])
    return top_20 

In [19]:
recommend(10, df)

{(68,): 0.9186784074034311, (414,): 0.9026784230454101, (177,): 0.9272416430255292, (298,): 0.7832428757640624, (448,): 0.8686309379730608, (232,): 0.9129538695011875, (249,): 0.8984112177317187, (525,): 0.9005917724428727, (111,): 0.8970509695827853, (509,): 0.9222345202505512, (21,): 0.9194473260719448, (105,): 0.8908063967282546, (483,): 0.8989981527242457, (610,): 0.89623138938113, (599,): 0.869859522560398, (305,): 0.8916219623453572, (563,): 0.9260266584022188, (489,): 0.8835025345286477, (560,): 0.8722917660430625, (62,): 0.8714938938676565, (318,): 0.908091193596638, (381,): 0.9016791933509338, (438,): 0.9132819294927604, (480,): 0.8736598873883366, (200,): 0.9309214333623124, (307,): 0.8540523651555301, (339,): 0.8767884216950443, (380,): 0.8966459030113201, (474,): 0.9179589913235384, (517,): 0.8457553872624107, (274,): 0.899661061760722, (590,): 0.9278207124142659, (292,): 0.9392047535885324, (534,): 0.9108026002368791, (606,): 0.902706183762946, (357,): 0.9230989809290605, 

In [None]:
[(user_id,user_id_df ), ]

In [42]:
def recommend (active_uid, df):
    selected_df = df[['userId', 'movieId','title', 'rating']]
    active_user_df = selected_df[selected_df['userId'] == active_uid]
    seen_movies = active_user_df['movieId'].tolist()
    #print(seen_movies)
    other_users = selected_df[(selected_df['movieId'].isin(seen_movies)) & (selected_df['userId'] != active_uid)]
    #print(other_users)
    users_grp = other_users.groupby('userId')
    users_grp = sorted(users_grp, reverse = True, key = lambda x: len(x[1]))
    #print(users_grp)
    sim_dict = {}
    for other_user_id, other_user_df in users_grp[:100]:
        other_user_rating = other_user_df['rating'].tolist()
        active_user_rating = active_user_df[active_user_df['movieId'].isin(other_user_df['movieId'])]['rating'].tolist()
        #print(active_user_rating)
        sim_score = cosine_similarity([other_user_rating], [active_user_rating])
        #print(sim_score)
        sim_dict[other_user_id] = sim_score[0][0]
    #print(sim_dict)
    sim_df = pd.DataFrame()
    sim_df['userId'] = sim_dict.keys()
    sim_df['Similarity'] = sim_dict.values()
    #print(sim_df)
    sim_df = sim_df.sort_values('Similarity', ascending = False)
    top_sim_df = sim_df.head(20)
    #print(top_sim_df)
    
    sim_users_df = pd.merge(selected_df, top_sim_df, how = 'inner', on = 'userId')
    #print(sim_users_df)
    filtered_df = sim_users_df[~(sim_users_df['movieId'].isin(seen_movies))]
    
    filtered_df['weighted_rating'] = filtered_df['Similarity'] * filtered_df['rating']
    #print(filtered_df)
    filtered_grp = filtered_df.groupby('movieId')[['userId']].count()
    
    filtered_grp  = filtered_grp[filtered_grp['userId'] >= 10]
    #print(filtered_grp)
    filtered_df = filtered_df[filtered_df['movieId'].isin(filtered_grp.index)]
    #print(filtered_df)
    filtered_grp = filtered_df.groupby('movieId')[['weighted_rating', 'Similarity']].sum()
    #print(filtered_grp)
    filtered_grp['Predicted_rating'] = filtered_grp['weighted_rating'] / filtered_grp['Similarity']
    #print(filtered_grp)
    sorted_df = filtered_grp.sort_values(by=['Predicted_rating'], ascending = False)
    #print(sorted_df)
    sorted_df = sorted_df[sorted_df['Predicted_rating'] >= 3.8]
    #print(sorted_df)
    recommended_movies = sorted_df.head(20).index
    print(recommended_movies)
    recommendations =movies[movies['movieId'].isin(recommended_movies)]['title']
    print(recommendations.tolist())

In [43]:
recommend(10, df)

Index([  318,   364,  1210,  4886,  4896,  1196, 54001, 60069, 59315,   260,
        1270,     1,  8368,  4963, 40815,  1721,  3793,  6365,  5989, 69122],
      dtype='int64', name='movieId')
['Toy Story (1995)', 'Star Wars: Episode IV - A New Hope (1977)', 'Shawshank Redemption, The (1994)', 'Lion King, The (1994)', 'Star Wars: Episode V - The Empire Strikes Back (1980)', 'Star Wars: Episode VI - Return of the Jedi (1983)', 'Back to the Future (1985)', 'Titanic (1997)', 'X-Men (2000)', 'Monsters, Inc. (2001)', "Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)", "Ocean's Eleven (2001)", 'Catch Me If You Can (2002)', 'Matrix Reloaded, The (2003)', 'Harry Potter and the Prisoner of Azkaban (2004)', 'Harry Potter and the Goblet of Fire (2005)', 'Harry Potter and the Order of the Phoenix (2007)', 'Iron Man (2008)', 'WALL·E (2008)', 'Hangover, The (2009)']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['weighted_rating'] = filtered_df['Similarity'] * filtered_df['rating']


In [None]:
cosine_similarity?

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
movies = pd.read_csv('movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
ratings = pd.read_csv('ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
dataset = movies.merge(ratings,on='movieId')[['movieId','userId','title','rating']]

In [5]:
movie_grp = dataset.groupby('movieId')[['userId']].count()

In [6]:
movie_grp

Unnamed: 0_level_0,userId
movieId,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49
...,...
193581,1
193583,1
193585,1
193587,1


In [7]:
movie_grp = movie_grp.sort_values(['userId'], ascending = False)
movie_grp.rename(columns = {'userId': 'Number of Users'}, inplace = True)
selected_movies = movie_grp[movie_grp['Number of Users'] >=10]
selected_movies = selected_movies.index
selected_movies

Index([  356,   318,   296,   593,  2571,   260,   480,   110,   589,   527,
       ...
       45668,    89,  8014, 55768, 40826,  2361,  4228, 55721,  1017, 55232],
      dtype='int64', name='movieId', length=2269)

In [8]:
dataset = dataset[dataset['movieId'].isin(selected_movies)]
dataset

Unnamed: 0,movieId,userId,title,rating
0,1,1,Toy Story (1995),4.0
1,1,5,Toy Story (1995),4.0
2,1,7,Toy Story (1995),4.5
3,1,15,Toy Story (1995),2.5
4,1,17,Toy Story (1995),4.5
...,...,...,...,...
100792,187593,331,Deadpool 2 (2018),4.0
100793,187593,338,Deadpool 2 (2018),1.0
100794,187593,380,Deadpool 2 (2018),3.0
100795,187593,514,Deadpool 2 (2018),3.5


In [9]:
selected_movies

Index([  356,   318,   296,   593,  2571,   260,   480,   110,   589,   527,
       ...
       45668,    89,  8014, 55768, 40826,  2361,  4228, 55721,  1017, 55232],
      dtype='int64', name='movieId', length=2269)

In [12]:
def recommend (active_uid,df):
    selected_df = df[['userId','movieId','rating']]
    active_u_rec = selected_df[selected_df['userId']==active_uid]
    #print(active_u_rec)
    active_seen_movies = active_u_rec['movieId'].tolist()
    other_users = selected_df[selected_df['movieId'].isin(active_seen_movies)]
    other_users = other_users[other_users['userId']!=active_uid]
    #print(other_users)
    other_user_group = other_users.groupby('userId')
    sorted_user_group = sorted(other_user_group,reverse=True,key=lambda x: len(x[1]))
    sorted_user_group = sorted_user_group[:100]
    #print(sorted_user_group)
    user_similarity_scores = {}
    for uid, group in sorted_user_group:    
        active_user = active_u_rec[active_u_rec['movieId'].isin(group['movieId'])]
        #print(active_user)
        #print(group)
        active_user_rating = active_user['rating'].tolist()
        group_rating = group['rating'].tolist()
        sim_score = cosine_similarity([active_user_rating],[group_rating])
        user_similarity_scores[uid] = sim_score[0][0]
    similarity_df = pd.DataFrame.from_dict(user_similarity_scores,orient='index',columns=['similarity'])
    similarity_df = similarity_df.sort_values(['similarity'],ascending=False)
    top_similar_users = similarity_df.iloc[:50]
    top_similar_users = top_similar_users.reset_index()
    top_similar_users = top_similar_users.rename(columns={'index':'userId'})

    top_user_ratings = top_similar_users.merge(ratings,on='userId')
    top_user_ratings['Weighted Rating'] = top_user_ratings['rating']*top_user_ratings['similarity']
    group_ratings = top_user_ratings.groupby('movieId').sum()[['similarity','Weighted Rating']]
    group_ratings['Avg Weighted Rating'] = group_ratings['Weighted Rating']/group_ratings['similarity']
    group_ratings = group_ratings.sort_values('Avg Weighted Rating',ascending=False)
    recommendations = group_ratings[~group_ratings.index.isin(active_u_rec['movieId'])]
    top_20 = recommendations.iloc[:20]
    top_20 = movies[movies['movieId'].isin(top_20.index)][['movieId','title']]
    #print(group_ratings)
    #print(top_user_ratings)
    return top_20
    

In [13]:
recommend(200, dataset)

Unnamed: 0,movieId,title
83,94,Beautiful Girls (1996)
251,290,Once Were Warriors (1994)
281,322,Swimming with Sharks (1995)
2131,2836,Outside Providence (1999)
2690,3606,On the Town (1949)
2947,3951,Two Family House (2000)
3068,4117,Hope and Glory (1987)
3394,4617,Let It Ride (1989)
3685,5075,Waydowntown (2000)
3807,5328,Rain (2001)
