# ****Import Python Library

Following are the Packages Import 
(1) pandas # Pandas Library to load the CSV, TSV Files

(2) numpy # Numerical Operation on Data

(3) sklearn.model_selection # import the train_test_split which will split the data set into the Train & Test Dataset

(4) sklearn.metrics # Import the mean_squared_error (MSE) to evaluate the model and pairwise_distances to calculate the distance between the two vecors

(5) sklearn.metrics.pairwise # Import the cosine similarity

(6) scipy.spatial.distance # Import the Cosine and corelation between the vectors


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,KFold
from sklearn.metrics import mean_squared_error,pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine, correlation
import warnings
warnings.filterwarnings('ignore')

# ***Load the Data.

I have copied the data into the Local Drive, before load into the variable via pandas

In [2]:
# Reading ratings file
ratings = pd.read_csv('../data/ratings.dat', sep='::', encoding='latin-1', names=['userId','movieId','rating','timestamp'])

# Reading movies file
movies = pd.read_csv('../data/movies.dat', sep='::', encoding='latin-1', names=['movieId','title','genres'])

##### Size of the Rating Datasets Loaded (Printing)

In [3]:
print ("Size of the ratings dataset: {}".format(ratings.shape))
ratings.head(5)

Size of the ratings dataset: (1000209, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


##### Size of the Movies Datasets Loaded (Printing)

In [4]:
print ("Size of the movies dataset: {}".format(movies.shape))
movies.head(5)

Size of the movies dataset: (3883, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


### Merge the datasets of movies and rtatings dataframe

In [5]:
df_movies_ratings=pd.merge(movies, ratings,how="inner",on="movieId")
print ("Size of the movies dataset: {}".format(df_movies_ratings.shape))
df_movies_ratings.head(5)

Size of the movies dataset: (1000209, 6)


Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Animation|Children's|Comedy,1,5,978824268
1,1,Toy Story (1995),Animation|Children's|Comedy,6,4,978237008
2,1,Toy Story (1995),Animation|Children's|Comedy,8,4,978233496
3,1,Toy Story (1995),Animation|Children's|Comedy,9,5,978225952
4,1,Toy Story (1995),Animation|Children's|Comedy,10,5,978226474


#  ***SYSTEM - I

## SCHEME - I (High Rated)

Propose the User Top 10 Movies (High Rated) against the input Genres 


The Top 10 High Rated Movies are based on the "Rating". So once the "genres" has been input, we will search for all the movies against the respective "genres". We will then calculate average all the rating (mentioned by all the users) against the movie id, then sort it. We will show the Top 10 Movies which have the highest average rating.

### Input the Genres (From the User)

In [6]:
input_genres = 'Comedy'

### Extract Movies and Average the Rating 

Extract all the Movies, UserIds, Ratings against the Input Genres

Then stored the Average the "Rating" against each movideId (& Title) 

In [134]:
df_movies_ratings_genres = df_movies_ratings[df_movies_ratings['genres'] == 'Comedy']
df_movies_ratings_genres_grouped_by_rating = df_movies_ratings_genres.groupby(['movieId','title']).mean().reset_index()
df_movies_ratings_genres_grouped_Top_10 = df_movies_ratings_genres_grouped_by_rating.sort_values(['rating'],ascending=False).head(10)
df_movies_ratings_genres_grouped_Top_10[['movieId','title','rating']]

Unnamed: 0,movieId,title,rating
204,1830,Follow the Bitch (1998),5.0
384,3233,Smashing Time (1967),5.0
354,3022,"General, The (1927)",4.368932
140,1136,Monty Python and the Holy Grail (1974),4.33521
348,2937,"Palm Beach Story, The (1942)",4.288462
122,905,It Happened One Night (1934),4.280749
81,598,Window to Paris (1994),4.25
129,1002,Ed's Next Move (1996),4.25
127,951,His Girl Friday (1940),4.24937
420,3462,Modern Times (1936),4.236066


### Show the Top 10 Movies (SYSTEM - I & SCHEME - I )

In [135]:
df_movies_ratings_genres_grouped_Top_10['title']

204                   Follow the Bitch (1998)
384                      Smashing Time (1967)
354                       General, The (1927)
140    Monty Python and the Holy Grail (1974)
348              Palm Beach Story, The (1942)
122              It Happened One Night (1934)
81                     Window to Paris (1994)
129                     Ed's Next Move (1996)
127                    His Girl Friday (1940)
420                       Modern Times (1936)
Name: title, dtype: object

## SCHEME - II (Trending)

Propose the User Top 10 Movies (Trending) against the input Genres 


The Top 10 Trending Movie is derived based on the number of users rated the movie. If large number of users have rated the movie, will treat the movie is a trending

### Input the Genres (From the User)

In [136]:
input_genres = 'Comedy'

### Extract Movies and Average the Rating 

Extract all the Movies, UserIds, Ratings against the Input Genres

Then stored the Average the "Rating" against each movideId (& Title) 

In [130]:
df_movies_ratings_genres = df_movies_ratings[df_movies_ratings['genres'] == 'Comedy']
df_movies_ratings_genres_grouped_by_User_Id_Count = df_movies_ratings_genres.groupby(['movieId','title']).sum().reset_index()
df_movies_ratings_genres_grouped_Top_10_user_Id_Count = df_movies_ratings_genres_grouped_by_User_Id_Count.sort_values(['userId'],ascending=False).head(10)
df_movies_ratings_genres_grouped_Top_10_user_Id_Count


Unnamed: 0,movieId,title,userId,rating,timestamp
350,2997,Being John Malkovich (1999),6812507,9245,2176167564950
140,1136,Monty Python and the Holy Grail (1974),4926716,6932,1554749970467
324,2791,Airplane! (1980),4834332,6874,1683064258937
345,2918,Ferris Bueller's Day Off (1986),4579092,6065,1431218141186
159,1394,Raising Arizona (1987),4466021,5766,1393007524779
305,2599,Election (1999),4386151,5982,1479535681520
308,2683,Austin Powers: The Spy Who Shagged Me (1999),4368093,4859,1394669169002
132,1079,"Fish Called Wanda, A (1988)",4201738,5266,1287726379158
29,223,Clerks (1994),4173569,5573,1372721807925
312,2706,American Pie (1999),4140443,5153,1351613377020


### Show the Top 10 Movies (SYSTEM - I & SCHEME - II )

In [132]:
df_movies_ratings_genres_grouped_Top_10_user_Id_Count['title']

350                     Being John Malkovich (1999)
140          Monty Python and the Holy Grail (1974)
324                                Airplane! (1980)
345                 Ferris Bueller's Day Off (1986)
159                          Raising Arizona (1987)
305                                 Election (1999)
308    Austin Powers: The Spy Who Shagged Me (1999)
132                     Fish Called Wanda, A (1988)
29                                    Clerks (1994)
312                             American Pie (1999)
Name: title, dtype: object

# ***SPLITING OF DATA

We will use the Sklearn's train_test_split to split the data

We will use 75% Training Set and 25% Test Set

In [116]:
X = ratings.copy()
y = ratings['userId']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 125247)

##### Print the Split Data sets

In [117]:
X_train.shape

(750156, 4)

In [118]:
X_train

Unnamed: 0,userId,movieId,rating,timestamp
497036,3054,1284,4,970157677
834793,5017,1291,3,962582897
203497,1248,2561,1,975625474
733746,4386,1625,4,965241005
108525,711,2324,5,975621555
...,...,...,...,...
764102,4543,2598,3,965112213
705613,4227,2694,4,965315451
367689,2146,2948,2,974623277
107529,709,2716,5,975540039


In [119]:
y_train

497036    3054
834793    5017
203497    1248
733746    4386
108525     711
          ... 
764102    4543
705613    4227
367689    2146
107529     709
55589      368
Name: userId, Length: 750156, dtype: int64

In [120]:
y_train.shape

(750156,)

In [121]:
print ("Training - Features - Data Shape: {}".format(X_train.shape))
print ("Training - Target -  Data Shape: {}".format(y_train.shape[0]))
print ("---------------------------------------------")
print ("Test - Feature - Data Shape: {}".format(X_test.shape))
print ("Test - Feature - Data Shape: {}".format(y_test.shape[0]))

Training - Features - Data Shape: (750156, 4)
Training - Target -  Data Shape: 750156
---------------------------------------------
Test - Feature - Data Shape: (250053, 4)
Test - Feature - Data Shape: 250053


##### Defining the Function Rot Mean Squared Error (RMSE)

In [122]:
#Function that computes the root mean squared error (or RMSE)
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [123]:
#Function to compute the RMSE score obtained on the testing set by a model
def score(cf_model):
    id_pairs = zip(X_test['userId'], X_test['movieId']) #Construct a list of user-movie tuples from the testing dataset
    y_pred = np.array([cf_model(user, movie) for (user, movie) in id_pairs]) #Predict the rating for every user-movie tuple
    y_true = np.array(X_test['rating']) #Extract the actual ratings given by the users in the test data
    return rmse(y_true, y_pred) #Return the final RMSE score

#  SYSTEM - II

SYSTEM - II, has basically 2 methods based on the Collaborative Filtering

(1) User-based Collaborative Filtering
 
    Find Users similar to a particular User and then recommend products that those users have liked to the first user

(2) Item-based Colloborative Filtering

In [167]:
X_train.head(4)

Unnamed: 0,userId,movieId,rating,timestamp
497036,3054,1284,4,970157677
834793,5017,1291,3,962582897
203497,1248,2561,1,975625474
733746,4386,1625,4,965241005


In [124]:
r_matrix = X_train.pivot_table(values='rating', index='userId', columns='movieId')
r_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


### Mean

As part of this, we will take the userId and movieId and output the mean rating for the movie by all the users who have rated it



In [125]:
#User Based Collaborative Filter using Mean Ratings
def cf_user_mean(user_id, movie_id):
    if movie_id in r_matrix:  #Check if movie_id exists in r_matrix
        mean_rating = r_matrix[movie_id].mean() #Compute the mean of all the ratings given to the movie
    else:
        mean_rating = 3.0 #Default to a rating of 3.0 in the absence of any information
    return mean_rating

In [126]:
score(cf_user_mean)

0.9795862297937492

### Weighted Mean

Instead of assigning the same weight to all users, would be more intuitive to give more preference to those users whose rating are similar to the user in question than the other users whose ratings are not


![image.png](attachment:image.png)


In the above formular, r(u,m) is the rating given by the user u to the movie m.


In [128]:
r_matrix = r_matrix.fillna(0)
r_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [129]:
cosine_sim = cosine_similarity(r_matrix, r_matrix)

#Convert into pandas dataframe 
cosine_sim = pd.DataFrame(cosine_sim, index=r_matrix.index, columns=r_matrix.index)


In [130]:
r_matrix.shape

(6040, 3669)

In [131]:
cosine_sim.shape

(6040, 6040)

In [20]:
def train_matrix(X_train_input):
    r_matrix_output = X_train_input.pivot_table(values='rating', index='userId', columns='movieId')
    r_matrix_output = r_matrix_output.fillna(0)
    return r_matrix_output

In [21]:
def weighted_mean(r_matrix_input):
    #Compute the cosine similarity matrix using the dummy ratings matrix
    cosine_sim_output = cosine_similarity(r_matrix_input, r_matrix_input)
    
    #Convert into pandas dataframe 
    cosine_sim_output = pd.DataFrame(cosine_sim_output, index=r_matrix_input.index, columns=r_matrix_input.index)
    
    return cosine_sim_output


### #User Based Collaborative Filter using Weighted Mean Ratings

In [30]:
r_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
670,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,1.0,4.0,0.0,0.0,0.0
671,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0
672,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
673,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,5.0,3.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
if movieId in r_matrix:
    print ("Ok")

NameError: name 'movieId' is not defined

In [37]:
cosine_sim.shape

(6040, 6040)

In [36]:
r_matrix.shape

(5372, 3693)

In [132]:
#User Based Collaborative Filter using Weighted Mean Ratings
def cf_user_wmean(user_id, movie_id):
    if movie_id in r_matrix: #Check if movie_id exists in r_matrix
        sim_scores = cosine_sim[user_id] #Get the similarity scores for the user in question with every other user
        m_ratings = r_matrix[movie_id] #Get the user ratings for the movie in question
        idx = m_ratings[m_ratings.isnull()].index #Extract the indices containing NaN in the m_ratings series
        m_ratings = m_ratings.dropna() #Drop the NaN values from the m_ratings Series
        sim_scores = sim_scores.drop(idx) #Drop the corresponding cosine scores from the sim_scores series
        wmean_rating = np.dot(sim_scores, m_ratings)/ sim_scores.sum() #Compute the final weighted mean
    else:
        wmean_rating = 3.0 #Default to a rating of 3.0 in the absence of any information
    return wmean_rating

In [None]:
#Gender and Occupation Based Collaborative Filter using Mean Ratings
def cf_gen_occ(user_id, movie_id):
     #Check if movie_id exists in gen_occ_mean
    if movie_id in gen_occ_mean.index:
         #Identify the user
        user = users.loc[user_id]



In [28]:
def cf_user_wmean(r_matrix_input, cosine_sim_input,user_):
    #Check if movie_id exists in r_matrix
    if movieId in r_matrix_input:
        sim_scores = cosine_sim_input[userId] #Get the similarity scores for the user in question with every other user
        m_ratings = r_matrix_input[movieId] #Get the user ratings for the movie in question
        idx = m_ratings[r_matrix_input.isnull()].index #Extract the indices containing NaN in the m_ratings series
        m_ratings = m_ratings.dropna() #Drop the NaN values from the m_ratings Series
        sim_scores = sim_scores.drop(idx)  #Drop the corresponding cosine scores from the sim_scores series
        wmean_rating = np.dot(sim_scores, m_ratings)/ sim_scores.sum() #Compute the final weighted mean
    else:
        wmean_rating = 3.0 #Default to a rating of 3.0 in the absence of any information
    return wmean_rating

In [133]:
score(cf_user_wmean)

3.2183000784380065

In [38]:
r_matrix = train_matrix(X.iloc[train_idx])

In [39]:
r_matrix.shape

(5372, 3693)

In [40]:
consne_sim = weighted_mean(r_matrix)
cosine_sim

In [41]:
consne_sim.shape

(5372, 5372)

In [42]:
#User Based Collaborative Filter using Weighted Mean Ratings
def cf_user_wmean(user_id, movie_id):
    if movie_id in r_matrix: #Check if movie_id exists in r_matrix
        sim_scores = cosine_sim[user_id] #Get the similarity scores for the user in question with every other user
        m_ratings = r_matrix[movie_id] #Get the user ratings for the movie in question
        idx = m_ratings[m_ratings.isnull()].index #Extract the indices containing NaN in the m_ratings series
        m_ratings = m_ratings.dropna() #Drop the NaN values from the m_ratings Series
        sim_scores = sim_scores.drop(idx) #Drop the corresponding cosine scores from the sim_scores series
        wmean_rating = np.dot(sim_scores, m_ratings)/ sim_scores.sum() #Compute the final weighted mean
    else:
        wmean_rating = 3.0 #Default to a rating of 3.0 in the absence of any information
    return wmean_rating

In [44]:
X_train.shape

(900188, 4)

In [45]:
X_test.shape

(100021, 4)

In [134]:
score(cf_user_wmean)

3.2183000784380065

In [55]:
#User Based Collaborative Filter using Weighted Mean Ratings
def cf_user_wmean(user_id, movie_id):
    if movie_id in r_matrix: #Check if movie_id exists in r_matrix
        sim_scores = cosine_sim[user_id] #Get the similarity scores for the user in question with every other user
        m_ratings = r_matrix[movie_id] #Get the user ratings for the movie in question
        idx = m_ratings[m_ratings.isnull()].index #Extract the indices containing NaN in the m_ratings series
        m_ratings = m_ratings.dropna() #Drop the NaN values from the m_ratings Series
        sim_scores = sim_scores.drop(idx) #Drop the corresponding cosine scores from the sim_scores series
        wmean_rating = np.dot(sim_scores, m_ratings)/ sim_scores.sum() #Compute the final weighted mean
    else:
        wmean_rating = 3.0 #Default to a rating of 3.0 in the absence of any information
    return wmean_rating

In [56]:
cosine_sim.shape

(6040, 6040)

In [47]:
#Function to compute the RMSE score obtained on the testing set by a model
def score(cf_model):
    id_pairs = zip(X_test['userId'], X_test['movieId']) #Construct a list of user-movie tuples from the testing dataset
    y_pred = np.array([cf_model(user, movie) for (user, movie) in id_pairs]) #Predict the rating for every user-movie tuple
    y_true = np.array(X_test['rating']) #Extract the actual ratings given by the users in the test data
    return rmse(y_true, y_pred) #Return the final RMSE score

In [100]:
user_id = 1
movie_id = 1193
sim_scores = cosine_sim[user_id]

KeyError: 1

In [65]:
m_ratings = r_matrix[movie_id]

In [67]:
idx = m_ratings[m_ratings.isnull()].index

In [None]:
m_ratings = r_matrix[movie_id]

In [68]:
m_ratings = m_ratings.dropna()

In [69]:
sim_scores = sim_scores.drop(idx)

In [71]:
sim_scores.shape

(6040,)

In [72]:
m_ratings.shape

(5372,)

In [70]:
np.dot(sim_scores, m_ratings)/ sim_scores.sum()

ValueError: shapes (6040,) and (5372,) not aligned: 6040 (dim 0) != 5372 (dim 0)

In [59]:
cf_user_wmean(192,1544)

ValueError: shapes (6040,) and (5372,) not aligned: 6040 (dim 0) != 5372 (dim 0)

In [58]:
user = 192
movie = 1544

192 1544
192 1475
192 2278
192 1476
192 1479
192 2424
192 3082
192 2353
192 2427
192 1625
192 1552
192 1626
192 3159
192 1556
192 2289
192 1488
192 886
192 3160
192 1702
192 1704
192 1635
192 1562
192 2294
192 1639
192 2297
192 1495
192 1497
192 1499
192 1711
192 3317
192 2443
192 1641
192 3246
192 3247
192 3174
192 100
192 1717
192 3249
192 3176
192 1645
192 1573
192 3178
192 1647
192 104
192 105
192 3250
192 2450
192 1721
192 3253
192 1722
192 3256
192 1653
192 1580
192 110
192 1654
192 112
192 1729
192 1584
192 987
192 1801
192 2605
192 1730
192 3408
192 1805
192 1732
192 1735
192 1590
192 2539
192 2393
192 1591
192 2394
192 2395
192 1739
192 2396
192 125
192 1597
192 1599
192 994
192 996
192 1810
192 2614
192 2541
192 2542
192 2616
192 2617
192 1744
192 1673
192 204
192 1748
192 132
192 1676
192 208
192 2622
192 3354
192 2625
192 1825
192 1752
192 2628
192 1753
192 1680
192 3358
192 1754
192 1682
192 144
192 2702
192 3360
192 2706
192 1831
192 2707
192 3438
192 2490
192 1834
192 19

195 1207
195 2948
195 1136
195 1064
195 531
195 2877
195 2879
195 608
195 392
195 70
195 468
195 76
195 3822
195 2010
195 2011
195 3751
195 2012
195 1210
195 3752
195 2950
195 3753
195 1212
195 1213
195 2880
195 1214
195 3683
195 1215
195 1216
195 610
195 2019
195 1217
195 611
195 3686
195 1218
195 2959
195 541
195 1148
195 542
195 546
195 80
195 475
195 85
195 3902
195 2020
195 2021
195 3835
195 1221
195 3763
195 1222
195 3764
195 1223
195 2890
195 2026
195 1224
195 2027
195 1225
195 2028
195 1080
195 2966
195 1081
195 1228
195 3697
195 2968
195 3698
195 551
195 3699
195 552
195 480
195 627
195 1089
195 628
195 558
195 92
195 93
195 488
195 3910
195 3911
195 1300
195 3915
195 2103
195 1301
195 3916
195 2105
195 1230
195 3918
195 2971
195 2034
195 1305
195 1232
195 1306
195 1233
195 1234
195 1161
195 2974
195 1090
195 2976
195 1237
195 1091
195 1092
195 632
195 635
195 562
195 1169
195 1097
195 709
195 1099
195 565
195 495
195 2111
195 2113
195 2115
195 1240
195 1242
195 1243
195 3785


202 2391
202 3267
202 2392
202 1663
202 2539
202 1591
202 1665
202 2395
202 1593
202 2469
202 2396
202 2398
202 1597
202 996
202 999
202 1810
202 2541
202 3271
202 2542
202 3418
202 2616
202 2470
202 3273
202 2617
202 2471
202 3274
202 2474
202 1672
202 1746
202 1673
202 1747
202 1674
202 2478
202 1676
202 208
202 209
202 3421
202 2622
202 1821
202 1752
202 2628
202 1680
202 1754
202 1682
202 2558
202 2485
202 216
202 145
202 1689
202 3500
202 2700
202 2701
202 2702
202 3360
202 3361
202 2706
202 2633
202 2560
202 3436
202 2634
202 1760
202 3438
202 2490
202 2637
202 2491
202 1835
202 2638
202 1690
202 2567
202 150
202 3298
202 1694
202 224
202 151
202 225
202 153
202 157
202 3511
202 3512
202 2710
202 3513
202 3515
202 2640
202 2714
202 2641
202 1912
202 2642
202 2716
202 2643
202 2717
202 2644
202 2571
202 2718
202 2645
202 2572
202 300
202 3448
202 2719
202 1917
202 1918
202 1772
202 2648
202 303
202 231
202 160
202 1777
202 161
202 235
202 163
202 237
202 165
202 166
202 3450
202 3

209 2722
209 2599
209 3615
209 2671
209 2826
209 2828
209 2694
209 356
209 2858
209 1127
210 648
210 1320
210 2126
210 2058
210 653
210 6
210 1408
210 733
210 809
210 736
210 1431
210 902
210 2167
210 1438
210 839
210 694
210 3113
210 1518
210 1379
210 1527
210 780
210 1388
210 1391
210 2412
210 1610
210 1616
210 2273
210 1544
210 1479
210 3082
210 2353
210 2427
210 1552
210 1556
210 1488
210 3165
210 1562
210 1499
210 3178
210 1722
210 3257
210 1580
210 1586
210 1588
210 1801
210 1591
210 1667
210 1599
210 2540
210 2617
210 3274
210 1816
210 204
210 208
210 1754
210 1687
210 145
210 2701
210 1831
210 1833
210 3438
210 2490
210 1690
210 2568
210 153
210 1917
210 1918
210 160
210 165
210 2720
210 2722
210 2723
210 316
210 170
210 1792
210 1867
210 181
210 10
210 1876
210 2826
210 21
210 349
210 420
210 423
210 353
210 288
210 1100
210 502
210 1037
210 504
210 507
210 434
210 42
210 292
210 1047
210 1049
210 376
210 377
210 380
210 2002
210 2006
210 465
210 393
210 2881
210 546
210 1221


218 2013
218 3753
218 3827
218 3755
218 2959
218 3901
218 3911
218 3916
218 3852
218 1171
219 648
219 3863
219 3791
219 3005
219 3101
219 1513
219 785
219 3301
219 2433
219 3253
219 3408
219 2396
219 2706
219 223
219 2712
219 2718
219 2657
219 266
219 3481
219 2752
219 3555
219 2683
219 2688
219 2762
219 2763
219 2858
219 2791
219 2792
219 2795
219 3675
219 3753
219 3755
219 2028
219 480
219 3911
219 3785
220 2987
220 648
220 2126
220 3798
220 2997
220 1258
220 1259
220 1186
220 1
220 800
220 2064
220 1265
220 1197
220 1198
220 1199
220 3882
220 1270
220 1272
220 2076
220 1288
220 2302
220 1500
220 1291
220 1295
220 1296
220 3114
220 2171
220 2174
220 1374
220 2321
220 2324
220 1394
220 1617
220 1625
220 2359
220 1484
220 1711
220 1644
220 101
220 3176
220 3178
220 3253
220 1722
220 110
220 2384
220 1589
220 3408
220 1732
220 2467
220 2396
220 125
220 2542
220 1674
220 2628
220 1754
220 3505
220 3361
220 160
220 235
220 1923
220 3457
220 2583
220 318
220 3534
220 2599
220 3618
220 3623

229 1721
229 3326
229 3255
229 1580
229 110
229 3186
229 1729
229 1584
229 3189
229 1589
229 2607
229 1805
229 1735
229 2541
229 1816
229 1672
229 1673
229 2549
229 1747
229 1676
229 2622
229 3425
229 2628
229 1680
229 3285
229 1682
229 3360
229 1831
229 2707
229 1833
229 3438
229 1909
229 150
229 3298
229 2569
229 3510
229 3513
229 3440
229 2712
229 1840
229 2571
229 2718
229 1916
229 3448
229 1918
229 1848
229 161
229 3452
229 1921
229 3453
229 2657
229 1784
229 318
229 172
229 175
229 3536
229 2590
229 1862
229 2596
229 1799
229 329
229 185
229 3617
229 2672
229 2746
229 2676
229 1876
229 260
229 11
229 265
229 196
229 3554
229 3481
229 3555
229 1882
229 1957
229 1884
229 3489
229 2688
229 24
229 25
229 26
229 3563
229 2908
229 1961
229 2839
229 2766
229 1968
229 1897
229 34
229 3717
229 2771
229 3578
229 1037
229 367
229 296
229 3723
229 2858
229 3594
229 3598
229 524
229 1059
229 527
229 455
229 62
229 2002
229 3744
229 3745
229 1060
229 3675
229 608
229 2012
229 1210
229 3825
229

238 3194
238 3196
238 3198
238 1594
238 2398
238 3271
238 3418
238 1747
238 1674
238 3420
238 3350
238 2621
238 3424
238 2628
238 1680
238 1682
238 1759
238 3504
238 3362
238 1834
238 3296
238 1693
238 150
238 1694
238 151
238 2712
238 3370
238 3371
238 300
238 3379
238 161
238 3526
238 2725
238 2728
238 2729
238 3386
238 1788
238 247
238 175
238 2801
238 3461
238 2732
238 1933
238 1860
238 1935
238 3467
238 2738
238 1936
238 3468
238 3469
238 1938
238 3543
238 3471
238 1941
238 3546
238 1944
238 1945
238 1946
238 1873
238 1949
238 337
238 265
238 16
238 17
238 199
238 2750
238 1012
238 1952
238 1954
238 2686
238 271
238 272
238 25
238 279
238 3700
238 2904
238 2908
238 2764
238 1962
238 3498
238 1968
238 30
238 32
238 34
238 36
238 3713
238 3714
238 2912
238 1103
238 1104
238 3649
238 508
238 41
238 296
238 3723
238 3724
238 3654
238 2852
238 1041
238 3586
238 515
238 446
238 58
238 3801
238 3730
238 3733
238 3734
238 2932
238 1120
238 3735
238 1124
238 3668
238 2866
238 1054
238 529


245 356
245 288
245 1101
245 3643
245 2915
245 1103
245 2916
245 1031
245 2918
245 500
245 1971
245 3576
245 1973
245 1036
245 3578
245 1974
245 1037
245 504
245 432
245 2779
245 434
245 508
245 435
245 362
245 42
245 292
245 44
245 367
245 368
245 296
245 3723
245 2921
245 2922
245 2851
245 3654
245 2780
245 2929
245 1982
245 512
245 2858
245 440
245 2788
245 1049
245 370
245 2789
245 517
245 377
245 379
245 3802
245 3804
245 3732
245 3733
245 3807
245 3735
245 3809
245 3590
245 3591
245 2863
245 2791
245 1125
245 2792
245 520
245 2866
245 1127
245 2867
245 2794
245 1128
245 1129
245 2796
245 1994
245 3599
245 2797
245 380
245 1997
245 527
245 454
245 382
245 383
245 529
245 457
245 69
245 3812
245 2000
245 3740
245 3814
245 1200
245 2003
245 1201
245 3671
245 2942
245 1203
245 1130
245 2006
245 1204
245 2944
245 2871
245 2872
245 1206
245 2009
245 1207
245 2947
245 1208
245 2948
245 1136
245 2949
245 2877
245 532
245 533
245 608
245 464
245 70
245 466
245 76
245 2010
245 2012
245 121

260 1198
260 592
260 1275
260 748
260 2153
260 1429
260 688
260 3107
260 1431
260 1291
260 2167
260 836
260 839
260 694
260 3113
260 2170
260 1370
260 1378
260 849
260 2322
260 2253
260 1527
260 1385
260 1387
260 788
260 3208
260 2406
260 2334
260 1608
260 2193
260 1391
260 2194
260 1610
260 2273
260 1544
260 940
260 1479
260 2427
260 1552
260 886
260 1704
260 1562
260 1499
260 1573
260 3257
260 2528
260 1580
260 110
260 1586
260 1587
260 1801
260 3268
260 1597
260 2616
260 2549
260 204
260 1676
260 208
260 2628
260 1681
260 1687
260 3438
260 2490
260 227
260 2640
260 3444
260 2571
260 1917
260 1918
260 160
260 165
260 2723
260 1858
260 316
260 172
260 173
260 2808
260 2735
260 1792
260 1867
260 181
260 2815
260 405
260 1876
260 260
260 15
260 3623
260 2826
260 21
260 349
260 3704
260 3635
260 2763
260 420
260 353
260 2699
260 288
260 1100
260 1101
260 2916
260 502
260 1036
260 438
260 44
260 1047
260 1049
260 517
260 377
260 379
260 3667
260 1129
260 380
260 3740
260 2002
260 3744
260

267 2124
267 1250
267 720
267 1183
267 2997
267 1187
267 1188
267 585
267 587
267 3948
267 805
267 590
267 597
267 1271
267 1348
267 1429
267 759
267 3100
267 2302
267 1500
267 3107
267 2239
267 832
267 838
267 1513
267 1370
267 1517
267 2324
267 1527
267 780
267 786
267 2336
267 1610
267 1611
267 1617
267 1479
267 2424
267 3155
267 2355
267 1485
267 3301
267 1704
267 1633
267 1569
267 3316
267 1643
267 2447
267 3250
267 3253
267 3255
267 3256
267 3257
267 1580
267 110
267 3260
267 2396
267 1597
267 1810
267 3418
267 1673
267 1747
267 1678
267 3425
267 3357
267 1682
267 213
267 141
267 3502
267 1848
267 232
267 1777
267 307
267 165
267 1784
267 318
267 246
267 249
267 2671
267 10
267 337
267 265
267 17
267 3623
267 3555
267 21
267 344
267 345
267 272
267 2762
267 2763
267 2692
267 1968
267 281
267 356
267 34
267 357
267 39
267 508
267 367
267 296
267 2858
267 50
267 52
267 3730
267 2791
267 1997
267 527
267 454
267 60
267 2943
267 531
267 608
267 539
267 3751
267 1210
267 3753
267 1213

273 2167
273 2094
273 3114
273 1370
273 1517
273 2174
273 1372
273 1374
273 1377
273 919
273 1380
273 780
273 1387
273 785
273 2334
273 2335
273 1608
273 2268
273 1393
273 1396
273 2413
273 3147
273 2424
273 3156
273 1552
273 2355
273 1485
273 3301
273 3160
273 1704
273 3173
273 3247
273 3175
273 3177
273 1646
273 3178
273 2378
273 3255
273 3256
273 3257
273 1580
273 110
273 3185
273 2394
273 2396
273 126
273 1597
273 2541
273 2616
273 2470
273 3273
273 1748
273 2478
273 1676
273 2622
273 2628
273 1753
273 1754
273 141
273 216
273 145
273 3363
273 2707
273 3438
273 3365
273 3439
273 1835
273 153
273 2710
273 3440
273 3444
273 2716
273 2717
273 2571
273 1777
273 163
273 3452
273 3527
273 3386
273 3535
273 329
273 256
273 2671
273 1947
273 405
273 260
273 196
273 1010
273 2683
273 1882
273 1019
273 3489
273 344
273 1020
273 1022
273 2762
273 2763
273 2694
273 1028
273 353
273 32
273 356
273 34
273 1101
273 2916
273 1036
273 3578
273 364
273 44
273 367
273 47
273 2858
273 50
273 377
273 3

284 474
284 2020
284 1220
284 3908
284 2023
284 3836
284 3763
284 1222
284 3837
284 1224
284 3839
284 3767
284 2028
284 1080
284 3697
284 2968
284 3698
284 3699
284 552
284 553
284 480
284 93
284 1302
284 3917
284 2105
284 2033
284 1304
284 1231
284 3919
284 2107
284 2034
284 1234
284 3777
284 1090
284 3923
284 3927
284 2115
284 1240
284 2116
284 1242
284 2119
284 2046
284 711
284 2985
284 2048
284 1246
284 2986
285 2988
285 1179
285 1185
285 1259
285 1186
285 581
285 1189
285 2130
285 3007
285 3948
285 3949
285 2064
285 1263
285 1191
285 2067
285 1192
285 1193
285 3019
285 2076
285 1276
285 1278
285 3897
285 3101
285 902
285 2238
285 903
285 904
285 1299
285 3112
285 2243
285 1527
285 858
285 3204
285 3135
285 1466
285 1395
285 3071
285 2417
285 3147
285 2346
285 1617
285 2348
285 3308
285 1639
285 1645
285 1649
285 1580
285 111
285 1729
285 3330
285 3408
285 1732
285 2391
285 3198
285 3342
285 2542
285 206
285 3421
285 3424
285 3281
285 1682
285 3360
285 3362
285 2561
285 2639
285 32

297 111
297 112
297 1734
297 2395
297 2396
297 2542
297 3418
297 3361
297 3435
297 1914
297 2571
297 232
297 1784
297 318
297 2804
297 2599
297 1002
297 260
297 3624
297 1012
297 3481
297 1954
297 1883
297 21
297 3703
297 1020
297 2692
297 2772
297 1036
297 3578
297 296
297 2858
297 514
297 2791
297 527
297 1060
297 2947
297 1208
297 2948
297 1136
297 2949
297 608
297 1213
297 1214
297 474
297 1221
297 1225
297 2028
297 1304
297 1235
297 562
297 497
297 1240
297 1242
297 2985
297 1247
298 1249
298 648
298 3793
298 661
298 1350
298 1282
298 2160
298 2321
298 1544
298 3159
298 1806
298 2399
298 3512
298 3515
298 3457
298 3538
298 266
298 3624
298 3481
298 2858
298 527
298 1136
298 3751
298 611
298 2966
298 551
298 480
299 648
299 1250
299 2997
299 1259
299 1189
299 1
299 3007
299 3948
299 2064
299 1266
299 1193
299 1276
299 1282
299 3897
299 1284
299 1358
299 3114
299 912
299 1446
299 913
299 919
299 920
299 852
299 1387
299 928
299 858
299 785
299 3060
299 3068
299 1537
299 942
299 2355

302 414
302 3489
302 1958
302 415
302 20
302 21
302 344
302 23
302 419
302 24
302 274
302 25
302 348
302 275
302 349
302 3702
302 3703
302 3704
302 2902
302 3705
302 3633
302 2906
302 2761
302 2762
302 3638
302 2836
302 2763
302 2692
302 420
302 1027
302 421
302 2694
302 1965
302 1892
302 1028
302 423
302 350
302 1894
302 3499
302 1968
302 1895
302 353
302 2699
302 32
302 355
302 429
302 356
302 34
302 357
302 36
302 39
302 3712
302 1100
302 2840
302 3717
302 1103
302 3645
302 2916
302 2770
302 3646
302 2918
302 2772
302 500
302 1034
302 1035
302 1036
302 3578
302 1974
302 504
302 432
302 507
302 434
302 361
302 435
302 364
302 42
302 292
302 293
302 44
302 367
302 368
302 47
302 296
302 511
302 2857
302 2858
302 513
302 440
302 1047
302 514
302 441
302 1985
302 515
302 442
302 1049
302 516
302 370
302 1987
302 517
302 50
302 374
302 52
302 376
302 377
302 379
302 1120
302 3809
302 2861
302 2863
302 3593
302 2791
302 1125
302 2792
302 1126
302 3668
302 2793
302 1127
302 3669
302 2794
3

308 17
308 194
308 2822
308 1010
308 3552
308 2750
308 1013
308 1952
308 1953
308 1016
308 410
308 2683
308 1954
308 1883
308 1885
308 342
308 21
308 417
308 272
308 1889
308 419
308 25
308 348
308 349
308 277
308 29
308 3701
308 3703
308 3704
308 3633
308 1020
308 3635
308 2761
308 1022
308 2762
308 3638
308 1961
308 3639
308 2764
308 1962
308 1025
308 2692
308 420
308 1028
308 1029
308 423
308 1968
308 353
308 2699
308 281
308 32
308 355
308 429
308 356
308 357
308 36
308 2912
308 1101
308 2915
308 2916
308 1031
308 1032
308 2918
308 1033
308 1035
308 1036
308 3578
308 1037
308 432
308 1976
308 1977
308 508
308 364
308 42
308 293
308 45
308 367
308 47
308 296
308 3723
308 3654
308 1982
308 2858
308 440
308 1047
308 514
308 441
308 515
308 442
308 445
308 50
308 52
308 448
308 449
308 379
308 3730
308 2932
308 3809
308 3663
308 3664
308 3591
308 3665
308 2863
308 2791
308 2792
308 3668
308 1127
308 1992
308 450
308 2796
308 1994
308 524
308 2797
308 1059
308 380
308 527
308 454
308 45

319 1556
319 1483
319 3088
319 2359
319 2289
319 953
319 1488
319 3301
319 2504
319 2432
319 1703
319 2433
319 1704
319 2434
319 1633
319 2290
319 2291
319 1635
319 3168
319 2439
319 2294
319 3097
319 3098
319 2369
319 3099
319 2297
319 1569
319 891
319 1499
319 969
319 898
319 899
319 1711
319 3317
319 3244
319 2442
319 1713
319 1641
319 3246
319 3173
319 1642
319 3247
319 1643
319 100
319 3175
319 2446
319 1644
319 3249
319 3176
319 1645
319 2375
319 1719
319 1646
319 1573
319 1647
319 104
319 105
319 3250
319 2521
319 1721
319 3326
319 3253
319 3255
319 3182
319 3256
319 2527
319 3257
319 1653
319 1580
319 110
319 3258
319 1727
319 111
319 3186
319 2385
319 113
319 2386
319 1584
319 3189
319 119
319 986
319 2600
319 1801
319 3260
319 3334
319 2605
319 3408
319 1804
319 3263
319 2391
319 1663
319 2539
319 3269
319 2395
319 3198
319 2469
319 2396
319 1667
319 1594
319 1597
319 3412
319 2611
319 2612
319 2541
319 3272
319 2470
319 3347
319 3274
319 2546
319 1673
319 203
319 1747
319 16

KeyboardInterrupt: 

In [54]:
y_pred = np.array([cf_user_wmean(user, movie) for (user, movie) in id_pairs])

ValueError: shapes (6040,) and (5372,) not aligned: 6040 (dim 0) != 5372 (dim 0)

In [75]:
r_matrix.shape

(5372, 3693)

In [76]:
r_matrix = train_matrix(X.iloc[train_idx])

In [77]:
r_matrix.shape

(5372, 3693)

In [86]:
consne_sim.shape

(5372, 5372)

In [87]:
r_matrix.shape

(5372, 3693)

In [88]:
consine_sim.shape

NameError: name 'consine_sim' is not defined

In [83]:
score(cf_user_wmean)

ValueError: shapes (6040,) and (5372,) not aligned: 6040 (dim 0) != 5372 (dim 0)

In [81]:
X_test.shape

(100021, 4)

In [89]:
cosine_sim.shape

(6040, 6040)

In [91]:
r_matrix.shape

(5372, 3693)

In [92]:
cosine_sim.shape

(5372, 5372)

In [93]:
score(cf_user_wmean)

KeyError: 1

In [90]:
kf = KFold(n_splits=10)
fold_ = 1 
for (train_idx,test_idx) in kf.split(X):
    X_train = X.iloc[train_idx]
    X_test = X.iloc[test_idx]
    
    r_matrix = train_matrix(X.iloc[train_idx])
    cosine_sim = weighted_mean(r_matrix)
    #score_fold = score(cf_user_wmean)
    score_fold = score(cf_user_mean)
    print ("Fold ...: {}  Score: {}".format(fold_,score_fold))
    fold_ = fold_ + 1

KeyError: 1

### Based on the Input Genres, search the Movie and Sort based on the Rating and Pulled the Top 10 Movies

In [None]:
df_movies_ratings_genres = df_movies_ratings[df_movies_ratings['genres'] == 'Comedy']
df_movies_ratings_genres.sort_values(by="rating",ascending=False)[1:10]

In [None]:
### Top 10 Highly Rated Movies Based on the 

In [None]:
df_movies_ratings_genres = df_movies_ratings[df_movies_ratings['genres'] == 'Comedy']
df_movies_ratings_genres.sort_values(by="rating",ascending=False)[1:10]

In [21]:
df_movies_ratings['genres'].value_counts()

Comedy                       116883
Drama                        111423
Comedy|Romance                42712
Comedy|Drama                  42245
Drama|Romance                 29170
                              ...  
Drama|Romance|Western            29
Children's|Fantasy               27
Comedy|Film-Noir|Thriller         5
Film-Noir|Horror                  2
Fantasy                           1
Name: genres, Length: 301, dtype: int64

In [172]:
ratings_matrix_items = df_movies_ratings.pivot_table(index=['movieId'],columns=['userId'],values='rating').reset_index(drop=True)
ratings_matrix_items.fillna( 0, inplace = True )
ratings_matrix_items.shape

(3706, 6040)

In [173]:
ratings_matrix_items

userId,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
0,5.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,5.0,5.0,...,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,3.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3704,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [175]:
movie_similarity = 1 - pairwise_distances( ratings_matrix_items.to_numpy(), metric="cosine" )
np.fill_diagonal( movie_similarity, 0 ) #Filling diagonals with 0s for future use when sorting is done
ratings_matrix_items = pd.DataFrame( movie_similarity )
ratings_matrix_items

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3696,3697,3698,3699,3700,3701,3702,3703,3704,3705
0,0.000000,0.390349,0.267943,0.178789,0.256569,0.347373,0.301490,0.125709,0.106620,0.377459,...,0.099502,0.020966,0.084105,0.081826,0.045949,0.309676,0.186633,0.093479,0.042829,0.182691
1,0.390349,0.000000,0.240946,0.155457,0.249970,0.244827,0.262772,0.196521,0.158469,0.386200,...,0.061819,0.015209,0.075310,0.095573,0.074271,0.213650,0.140781,0.087013,0.026063,0.122185
2,0.267943,0.240946,0.000000,0.192788,0.308290,0.187020,0.292230,0.092122,0.128378,0.245601,...,0.038492,0.065507,0.049512,0.087377,0.050985,0.190575,0.104837,0.062258,0.010073,0.097786
3,0.178789,0.155457,0.192788,0.000000,0.271990,0.125170,0.220024,0.049554,0.060334,0.133707,...,0.055486,0.053300,0.002227,0.025278,0.025204,0.118902,0.096318,0.022588,0.024769,0.095154
4,0.256569,0.249970,0.308290,0.271990,0.000000,0.148114,0.305107,0.095512,0.138392,0.237681,...,0.026632,0.083898,0.046399,0.047542,0.016156,0.174554,0.092403,0.051633,0.010750,0.112835
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3701,0.309676,0.213650,0.190575,0.118902,0.174554,0.236447,0.191689,0.090387,0.092347,0.237227,...,0.183859,0.053539,0.109062,0.210272,0.078341,0.000000,0.329339,0.168234,0.122279,0.363838
3702,0.186633,0.140781,0.104837,0.096318,0.092403,0.201419,0.117660,0.080523,0.099554,0.136374,...,0.244371,0.098568,0.070933,0.160150,0.107063,0.329339,0.000000,0.302649,0.199337,0.347805
3703,0.093479,0.087013,0.062258,0.022588,0.051633,0.115331,0.059262,0.084976,0.004956,0.097170,...,0.126068,0.211891,0.057350,0.124186,0.095905,0.168234,0.302649,0.000000,0.202809,0.234638
3704,0.042829,0.026063,0.010073,0.024769,0.010750,0.029136,0.036102,0.072141,0.000000,0.018359,...,0.170983,0.132019,0.086057,0.104873,0.015847,0.122279,0.199337,0.202809,0.000000,0.192972


In [176]:
def item_similarity(movieName): 
    """
    recomendates similar movies
   :param data: name of the movie 
   """
    try:
        #user_inp=input('Enter the reference movie title based on which recommendations are to be made: ')
        user_inp=movieName
        inp=df_movies[df_movies['title']==user_inp].index.tolist()
        inp=inp[0]

        df_movies['similarity'] = ratings_matrix_items.iloc[inp]
        df_movies.columns = ['movie_id', 'title', 'release_date','similarity']
    except:
        print("Sorry, the movie is not in the database!")

In [296]:
def recommendedMoviesAsperItemSimilarity(user_id):
    """
     Recommending movie which user hasn't watched as per Item Similarity
    :param user_id: user_id to whom movie needs to be recommended
    :return: movieIds to user 
    """
    user_movie= df_movies_ratings[(df_movies_ratings.userId==user_id) & df_movies_ratings.rating.isin([5,4.5])][['title']]
    user_movie=user_movie.iloc[0,0]
    item_similarity(user_movie)
    print ("Coming...")
    
    sorted_movies_as_per_userChoice=df_movies.sort_values( ["similarity"], ascending = False )
    
    sorted_movies_as_per_userChoice=sorted_movies_as_per_userChoice[sorted_movies_as_per_userChoice['similarity'] >=0.45]['movie_id']
    
    recommended_movies=list()
    
    df_recommended_item=pd.DataFrame()
    user2Movies= df_ratings[df_ratings['userId']== user_id]['movieId']
    
    for movieId in sorted_movies_as_per_userChoice:
        if movieId not in user2Movies:
            print ("Coming 2..")
            df_new= df_ratings[(df_ratings.movieId==movieId)]
            df_recommended_item=pd.concat([df_recommended_item,df_new])
            print (df_recommended_item.shape)
        print ("Coming 3..")
        #return (df_recommended_item)
    best10=df_recommended_item.sort_values(["rating"], ascending = False )[1:10] 
    return best10['movieId']

In [181]:
def movieIdToTitle(listMovieIDs):
    """
     Converting movieId to titles
    :param user_id: List of movies
    :return: movie titles
    """
    movie_titles= list()
    for id in listMovieIDs:
        movie_titles.append(df_movies[df_movies['movie_id']==id]['title'])
    return movie_titles

In [273]:
user_movie= df_movies_ratings[(df_movies_ratings.userId==user_id) & df_movies_ratings.rating.isin([5,4.5])][['title']]
#user_movie=user_movie.iloc[0,0]
#item_similarity(user_movie)

In [276]:
user_movie=user_movie.iloc[0,0]

In [277]:
item_similarity(user_movie)

In [278]:
sorted_movies_as_per_userChoice=df_movies.sort_values( ["similarity"], ascending = False )

In [280]:
sorted_movies_as_per_userChoice=sorted_movies_as_per_userChoice[sorted_movies_as_per_userChoice['similarity'] >=0.45]['movie_id']

In [282]:
recommended_movies=list()

In [281]:
sorted_movies_as_per_userChoice

2898    2967
1173    1190
574      578
2162    2231
1178    1196
        ... 
573      577
627      632
963      975
1613    1659
1186    1204
Name: movie_id, Length: 87, dtype: int64

In [283]:
df_recommended_item=pd.DataFrame()

In [284]:
user2Movies= df_ratings[df_ratings['userId']== user_id]['movieId']

In [289]:
for movieId in sorted_movies_as_per_userChoice:
    print ("Ok 1:{}",format(movieId))
    if movieId not in user2Movies:
        print ("Ok 2:{}",format(movieId))
        df_new= df_ratings[(df_ratings.movieId==movieId)]
        df_recommended_item=pd.concat([df_recommended_item,df_new])

Ok 1:{} 2967
Ok 1:{} 1190
Ok 2:{} 1190
Ok 1:{} 578
Ok 2:{} 578
Ok 1:{} 2231
Ok 2:{} 2231
Ok 1:{} 1196
Ok 2:{} 1196
Ok 1:{} 34
Ok 2:{} 34
Ok 1:{} 1122
Ok 2:{} 1122
Ok 1:{} 1477
Ok 2:{} 1477
Ok 1:{} 350
Ok 2:{} 350
Ok 1:{} 2443
Ok 2:{} 2443
Ok 1:{} 358
Ok 2:{} 358
Ok 1:{} 1123
Ok 2:{} 1123
Ok 1:{} 256
Ok 2:{} 256
Ok 1:{} 2272
Ok 2:{} 2272
Ok 1:{} 1124
Ok 2:{} 1124
Ok 1:{} 585
Ok 2:{} 585
Ok 1:{} 1806
Ok 2:{} 1806
Ok 1:{} 312
Ok 2:{} 312
Ok 1:{} 447
Ok 2:{} 447
Ok 1:{} 1136
Ok 2:{} 1136
Ok 1:{} 2661
Ok 2:{} 2661
Ok 1:{} 2580
Ok 2:{} 2580
Ok 1:{} 1917
Ok 2:{} 1917
Ok 1:{} 1038
Ok 2:{} 1038
Ok 1:{} 579
Ok 2:{} 579
Ok 1:{} 3098
Ok 1:{} 2779
Ok 2:{} 2779
Ok 1:{} 583
Ok 2:{} 583
Ok 1:{} 470
Ok 2:{} 470
Ok 1:{} 2626
Ok 2:{} 2626
Ok 1:{} 1075
Ok 2:{} 1075
Ok 1:{} 290
Ok 2:{} 290
Ok 1:{} 2197
Ok 2:{} 2197
Ok 1:{} 2720
Ok 2:{} 2720
Ok 1:{} 1422
Ok 2:{} 1422
Ok 1:{} 39
Ok 2:{} 39
Ok 1:{} 1543
Ok 2:{} 1543
Ok 1:{} 1664
Ok 2:{} 1664
Ok 1:{} 597
Ok 2:{} 597
Ok 1:{} 869
Ok 2:{} 869
Ok 1:{} 2062
Ok 2:{}

In [279]:
sorted_movies_as_per_userChoice

Unnamed: 0,movie_id,title,release_date,similarity
2898,2967,"Bad Seed, The (1956)",Drama|Thriller,0.633104
1173,1190,Tie Me Up! Tie Me Down! (1990),Drama,0.610826
574,578,"Hour of the Pig, The (1993)",Drama|Mystery,0.605849
2162,2231,Rounders (1998),Crime|Drama,0.579382
1178,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War,0.570125
...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,
3879,3949,Requiem for a Dream (2000),Drama,
3880,3950,Tigerland (2000),Drama,
3881,3951,Two Family House (2000),Drama,


In [271]:
df_recommended_item

In [297]:
df_recommended_item.sort_values(["rating"], ascending = False )[1:10]

KeyError: 'rating'

In [294]:
user_id

23

In [298]:
df_recommended_item = recommendedMoviesAsperItemSimilarity(user_id)

Coming...
Coming 3..
Coming 2..
(132, 4)
Coming 3..
Coming 2..
(134, 4)
Coming 3..
Coming 2..
(479, 4)
Coming 3..
Coming 2..
(3469, 4)
Coming 3..
Coming 2..
(5220, 4)
Coming 3..
Coming 2..
(5220, 4)
Coming 3..
Coming 2..
(5247, 4)
Coming 3..
Coming 2..
(5762, 4)
Coming 3..
Coming 2..
(5888, 4)
Coming 3..
Coming 2..
(6029, 4)
Coming 3..
Coming 2..
(6062, 4)
Coming 3..
Coming 2..
(6280, 4)
Coming 3..
Coming 2..
(6392, 4)
Coming 3..
Coming 2..
(6797, 4)
Coming 3..
Coming 2..
(7215, 4)
Coming 3..
Coming 2..
(7383, 4)
Coming 3..
Coming 2..
(7495, 4)
Coming 3..
Coming 2..
(7526, 4)
Coming 3..
Coming 2..
(9125, 4)
Coming 3..
Coming 2..
(9219, 4)
Coming 3..
Coming 2..
(9942, 4)
Coming 3..
Coming 2..
(11052, 4)
Coming 3..
Coming 2..
(11063, 4)
Coming 3..
Coming 2..
(11064, 4)
Coming 3..
Coming 3..
Coming 2..
(11409, 4)
Coming 3..
Coming 2..
(11437, 4)
Coming 3..
Coming 2..
(11475, 4)
Coming 3..
Coming 2..
(11510, 4)
Coming 3..
Coming 2..
(11510, 4)
Coming 3..
Coming 2..
(11678, 4)
Coming 3..
Co

In [257]:
df_recommended_item.sort_values(["rating"], ascending = False )[1:10] 

Unnamed: 0,userId,movieId,rating,timestamp
915565,5535,1217,5,959579261
925147,5592,1217,5,959268781
932612,5630,1217,5,980553265
933670,5636,1217,5,959052049
936370,5649,1217,5,958865612
944923,5702,1217,5,958578730
951202,5747,1217,5,958356864
956390,5770,1217,5,958172236
962784,5805,1217,5,958107586


In [300]:
user_id=23
df_recommended_item = print("Recommended movies,:\n",movieIdToTitle(recommendedMoviesAsperItemSimilarity(user_id)))

Coming...
Coming 3..
Coming 2..
(132, 4)
Coming 3..
Coming 2..
(134, 4)
Coming 3..
Coming 2..
(479, 4)
Coming 3..
Coming 2..
(3469, 4)
Coming 3..
Coming 2..
(5220, 4)
Coming 3..
Coming 2..
(5220, 4)
Coming 3..
Coming 2..
(5247, 4)
Coming 3..
Coming 2..
(5762, 4)
Coming 3..
Coming 2..
(5888, 4)
Coming 3..
Coming 2..
(6029, 4)
Coming 3..
Coming 2..
(6062, 4)
Coming 3..
Coming 2..
(6280, 4)
Coming 3..
Coming 2..
(6392, 4)
Coming 3..
Coming 2..
(6797, 4)
Coming 3..
Coming 2..
(7215, 4)
Coming 3..
Coming 2..
(7383, 4)
Coming 3..
Coming 2..
(7495, 4)
Coming 3..
Coming 2..
(7526, 4)
Coming 3..
Coming 2..
(9125, 4)
Coming 3..
Coming 2..
(9219, 4)
Coming 3..
Coming 2..
(9942, 4)
Coming 3..
Coming 2..
(11052, 4)
Coming 3..
Coming 2..
(11063, 4)
Coming 3..
Coming 2..
(11064, 4)
Coming 3..
Coming 3..
Coming 2..
(11409, 4)
Coming 3..
Coming 2..
(11437, 4)
Coming 3..
Coming 2..
(11475, 4)
Coming 3..
Coming 2..
(11510, 4)
Coming 3..
Coming 2..
(11510, 4)
Coming 3..
Coming 2..
(11678, 4)
Coming 3..
Co

In [248]:
df_recommended_item

In [198]:
user_movie= df_movies_ratings[(df_movies_ratings.userId==user_id) & df_movies_ratings.rating.isin([5,4.5])][['title']]

In [199]:
user_movie=user_movie.iloc[0,0]

In [200]:
sorted_movies_as_per_userChoice=df_movies.sort_values( ["similarity"], ascending = False )

In [201]:
sorted_movies_as_per_userChoice=sorted_movies_as_per_userChoice[sorted_movies_as_per_userChoice['similarity'] >=0.45]['movie_id']


In [204]:
recommended_movies=list()

In [203]:
sorted_movies_as_per_userChoice

2898    2967
1173    1190
574      578
2162    2231
1178    1196
        ... 
573      577
627      632
963      975
1613    1659
1186    1204
Name: movie_id, Length: 87, dtype: int64

In [205]:
df_recommended_item=pd.DataFrame()

In [206]:
user2Movies= df_ratings[df_ratings['userId']== user_id]['movieId']

In [208]:
for movieId in sorted_movies_as_per_userChoice:
    if movieId not in user2Movies:
        df_new= df_ratings[(df_ratings.movieId==movieId)]
        df_recommended_item=pd.concat([df_recommended_item,df_new])

In [209]:
df_recommended_item

Unnamed: 0,userId,movieId,rating,timestamp
7418,53,1190,5,977986365
18499,146,1190,3,979854470
19752,149,1190,5,981498843
24757,175,1190,4,977116246
30178,203,1190,2,976929239
...,...,...,...,...
994795,6007,1204,4,956790809
996336,6016,1204,5,956778313
997847,6026,1204,4,956726822
998221,6032,1204,5,956718542


In [211]:
best10=df_recommended_item.sort_values(["rating"], ascending = False )[1:10] 