## Similarity Matrices Calculations & Inference

In [None]:
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('nbagg')

import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})

import seaborn as sns
sns.set_style('whitegrid')
import os
from scipy import sparse
from scipy.sparse import csr_matrix

from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import random

In [None]:
from datetime import datetime
# Timestamp taken at notebook start; the last cell subtracts it to report the total run time.
globalstart = datetime.now()

## 1.  Loading train and test data into dataframes

In [None]:
# Directory layout used throughout the notebook (all paths are relative to the working directory).
raw_data_path = "./data/raw"
processed_data_path = "./data/processed"
models_path = "./models"

# Individual files inside those directories.
movie_titles_csv_path = "".join([raw_data_path, "/movie_titles.csv"])
master_data_csv_path = "/".join([processed_data_path, "data.csv"])

In [None]:
# Read the pre-sliced train/test ratings; only the train frame gets its 'date' column parsed.
start = datetime.now()

train_csv_path = processed_data_path + "/" + "train_sliced.csv"
test_csv_path = processed_data_path + "/" + "test_sliced.csv"

train_df = pd.read_csv(train_csv_path, parse_dates=['date'])
test_df = pd.read_csv(test_csv_path)

elapsed = datetime.now() - start
print("Time Taken:", elapsed)

Time Taken: 0:00:00.542682


In [None]:
# Peek at the first two rows of the training frame.
train_df.head(2)

Unnamed: 0,movieId,userId,rating,date
0,16242,2248080,3,1999-12-30
1,11064,2248080,3,1999-12-30


In [None]:
# Lengths of the userId, movieId and rating arrays (the triplets later fed to csr_matrix).
train_df.userId.values.shape, train_df.movieId.values.shape, train_df.rating.values.shape

((1150922,), (1150922,), (1150922,))

## 2. Creating sparse matrices from Train and Test Data Frames

### 2.1. Understanding csr_matrix

In [None]:
# Toy triplets: entry k places data[k] at position (row[k], col[k]).
row = np.array([0, 0, 1, 2, 2, 2])
col = np.array([0, 2, 2, 0, 1, 2])
data = np.array([1, 2, 3, 4, 5, 6])

# Build the CSR matrix from the triplets and show it densely.
demo_matrix = sparse.csr_matrix((data, (row, col)))
demo_matrix.toarray()

array([[1, 0, 2],
       [0, 0, 3],
       [4, 5, 6]])

In [None]:
# Show the three triplet arrays used to build the demo matrix.
row, col, data

(array([0, 0, 1, 2, 2, 2]),
 array([0, 2, 2, 0, 1, 2]),
 array([1, 2, 3, 4, 5, 6]))

### 2.2. Creating sparse matrix from train data frame

In [None]:
# Rows are user ids, columns are movie ids, and the stored values are the ratings.

start = datetime.now()
train_matrix_path = models_path + "/" + 'train_sparse_matrix.npz'

if not os.path.isfile(train_matrix_path):
    print("We are creating sparse_matrix from the dataframe..")

    # Build the matrix from (rating, (userId, movieId)) triplets, then cache it on disk.
    train_ratings = train_df.rating.values
    train_user_idx = train_df.userId.values
    train_movie_idx = train_df.movieId.values
    train_sparse_matrix = sparse.csr_matrix((train_ratings, (train_user_idx, train_movie_idx)))

    print('Done. It\'s shape is : (user, movie) : ', train_sparse_matrix.shape)
    print('Saving it into disk for furthur usagemodels..')

    sparse.save_npz(train_matrix_path, train_sparse_matrix)
    print('Done..\n')
else:
    # A cached copy exists: load it instead of rebuilding.
    print("It is present in your pwd, getting it from disk....")

    train_sparse_matrix = sparse.load_npz(train_matrix_path)
    print("DONE..")


print("Time taken :", datetime.now() - start)

It is present in your pwd, getting it from disk....
DONE..
Time taken : 0:00:00.059168


### 2.3. The Sparsity of Train Sparse Matrix

In [None]:
# (shape, number of non-zero entries) of the train matrix.
train_sparse_matrix.shape, train_sparse_matrix.count_nonzero()

((2647889, 17765), 1150922)

In [None]:
# Sparsity = share of (user, movie) cells that hold no rating, as a percentage.
us, mv = train_sparse_matrix.shape
elem = train_sparse_matrix.count_nonzero()

filled_fraction = elem / (us * mv)
sparsity_percent = (1 - filled_fraction) * 100
print("Sparsity Of Train matrix : {} % ".format(sparsity_percent))

Sparsity Of Train matrix : 99.99755329897192 % 


### 2.4 Creating sparse matrix from test data frame

In [None]:

from datetime import datetime
import time

# Same layout as the train matrix: users as rows, movies as columns, ratings as values.
start = datetime.now()
test_matrix_path = models_path + "/" + 'test_sparse_matrix.npz'

if not os.path.isfile(test_matrix_path):
    print("We are creating sparse_matrix from the dataframe..")

    # Build the matrix from (rating, (userId, movieId)) triplets, then cache it on disk.
    test_ratings = test_df.rating.values
    test_user_idx = test_df.userId.values
    test_movie_idx = test_df.movieId.values
    test_sparse_matrix = sparse.csr_matrix((test_ratings, (test_user_idx, test_movie_idx)))

    print('Done. It\'s shape is : (user, movie) : ', test_sparse_matrix.shape)
    print('Saving it into disk for furthur usage..')

    sparse.save_npz(test_matrix_path, test_sparse_matrix)
    print('Done..\n')
else:
    # A cached copy exists: load it instead of rebuilding.
    print("It is present in your pwd, getting it from disk....")

    test_sparse_matrix = sparse.load_npz(test_matrix_path)
    print("DONE..")


print("Time taken :", datetime.now() - start)

It is present in your pwd, getting it from disk....
DONE..
Time taken : 0:00:00.033293


### 2.5. The Sparsity of Test data Matrix

In [None]:
# (shape, number of non-zero entries) of the test matrix.
test_sparse_matrix.shape, test_sparse_matrix.count_nonzero()

((2647889, 17765), 287731)

In [None]:
# Sparsity = share of (user, movie) cells that hold no rating, as a percentage.
us, mv = test_sparse_matrix.shape
elem = test_sparse_matrix.count_nonzero()

filled_fraction = elem / (us * mv)
sparsity_percent = (1 - filled_fraction) * 100
print("Sparsity Of Test matrix : {} % ".format(sparsity_percent))

Sparsity Of Test matrix : 99.99938832368005 % 


## 3. Computing User-User Cosine Similarity Matrix

### 3.1. Trying with all dimensions (17k dimensions per user)

Calculating the user-user similarity matrix is __not very easy__ (_unless one has huge computing power and lots of time_) because the number of users is large.

    * The system could crash, or the program could stop with a **Memory Error**
    * Also, significant time would be required. Even for just 100 users, it took significant time

In [None]:
# (shape, number of non-zero entries) of the matrix the similarity computation will run on.
train_sparse_matrix.shape, train_sparse_matrix.count_nonzero()

((2647889, 17765), 1150922)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity


def compute_user_similarity(sparse_matrix, compute_for_few=False, top = 100, verbose=False, verb_for_n_rows = 20,
                            draw_time_taken=True):
    """Build a sparse user-user cosine-similarity matrix keeping only the best matches per user.

    Parameters
    ----------
    sparse_matrix : scipy sparse matrix
        One row per user; cosine similarity is computed between rows.
    compute_for_few : bool
        When True, only the first ``top`` users that have at least one non-zero
        entry are processed; otherwise every such user is processed.
    top : int
        Number of highest-similarity users stored per processed user. The same
        value is the user limit when ``compute_for_few`` is True.
    verbose : bool
        Print progress messages.
    verb_for_n_rows : int
        With ``verbose``, print a progress line every this many processed users.
    draw_time_taken : bool
        Plot per-user and cumulative computation time with matplotlib.

    Returns
    -------
    tuple
        ``(similarity, time_taken)`` where ``similarity`` is a csr_matrix of shape
        ``(no_of_users, no_of_users)`` and ``time_taken`` lists the seconds spent
        on each processed user.
    """
    no_of_users, _ = sparse_matrix.shape
    print('no_of_users:', no_of_users)

    # Only users (rows) holding at least one non-zero entry are processed.
    row_ind, _ = sparse_matrix.nonzero()
    row_ind = np.unique(row_ind)  # sorted, de-duplicated row indices
    users_to_process = row_ind[:top] if compute_for_few else row_ind

    time_taken = list()

    # Triplet lists from which the sparse similarity matrix is assembled.
    rows, cols, data = list(), list(), list()
    if verbose: print("Computing top",top,"similarities for each user..")

    start = datetime.now()

    for processed, row in enumerate(users_to_process, start=1):
        prev = datetime.now()

        # Similarity of this user against every user (one value per matrix row).
        sim = cosine_similarity(sparse_matrix.getrow(row), sparse_matrix).ravel()
        # Keep only the indices/values of the highest similarities.
        top_sim_ind = sim.argsort()[-top:]
        top_sim_val = sim[top_sim_ind]

        # With fewer than `top` users the slice above is shorter than `top`, so the
        # row index is repeated once per value actually kept; this keeps the three
        # triplet lists the same length.
        rows.extend([row] * len(top_sim_ind))
        cols.extend(top_sim_ind)
        data.extend(top_sim_val)
        time_taken.append((datetime.now() - prev).total_seconds())

        if verbose and processed % verb_for_n_rows == 0:
            print("computing done for {} users [  time elapsed : {}  ]"
                  .format(processed, datetime.now()-start))

    if verbose: print('Creating Sparse matrix from the computed similarities')

    if draw_time_taken:
        plt.plot(time_taken, label = 'time taken for each user')
        plt.plot(np.cumsum(time_taken), label='Total time')
        plt.legend(loc='best')
        plt.xlabel('User')
        plt.ylabel('Time (seconds)')
        plt.show()

    return sparse.csr_matrix((data, (rows, cols)), shape=(no_of_users, no_of_users)), time_taken

In [None]:
# Computing User to User Cosine Similarity Matrix (cached on disk as u_u_sim_sparse.npz)
start = datetime.now()
u_u_sim_path = models_path + "/" + "u_u_sim_sparse.npz"

if not os.path.isfile(u_u_sim_path):
    print("It seems you don't have that file. Computing user_user similarity...")

    # compute_for_few=True limits the run to the first `top` users of the train matrix.
    u_u_sim_sparse, u_u_time_taken = compute_user_similarity(train_sparse_matrix, compute_for_few=True, top = 100,
                                                             verbose=True, verb_for_n_rows=25)
    print("Done..")

    print("Saving it to disk without the need of re-computing it again.. ")
    sparse.save_npz(u_u_sim_path, u_u_sim_sparse)
    print("Done..")
else:
    print("It is there, We will get it.")
    u_u_sim_sparse = sparse.load_npz(u_u_sim_path)
    print("Done ...")

# Report the shape on both paths (it was previously printed only after loading from disk).
print("It's a ", u_u_sim_sparse.shape," dimensional matrix")

print("Time taken :",datetime.now()-start)

It is there, We will get it.
Done ...
It's a  (2647889, 2647889)  dimensional matrix
Time taken : 0:00:00.024050


In [None]:
# Shape of the train matrix (rows are users, columns are movies).
train_sparse_matrix.shape

(2647889, 17765)

* We have  **405,041 users** in our training set, and computing similarities between them (each a **17K-dimensional vector**) is time consuming.


- From the above plot, it took roughly __8.88 sec__ to compute the similar users for __one user__
    
    
- We have __405,041 users__ with us in training set.


- $405041 \times 8.88 = 3596764.08 \text{ sec} = 59946.068 \text{ min} = 999.101133333 \text{ hours}
= 41.629213889 \text{ days}\ldots$

    - Even if we run on high-performance cores in parallel (a typical system nowadays), it will still take almost __10 and 1/2__ days. Instead, we will try to reduce the dimensions using SVD, so that __it might__ speed up the process...

In [None]:
# Initialize the algorithm with some parameters:
# 500 components, the randomized solver, and a fixed random_state for repeatable runs.
netflix_svd = TruncatedSVD(n_components=500, algorithm='randomized', random_state=15)
# fit_transform fits the SVD on the train matrix and returns its reduced representation.
trunc_svd = netflix_svd.fit_transform(train_sparse_matrix)

In [None]:
# Cumulative fraction of variance explained by the first k latent factors.
expl_var = np.cumsum(netflix_svd.explained_variance_ratio_)

# Factor counts to report.
# NOTE(review): `ind` was never defined in this notebook; these checkpoints are an
# assumption chosen to span the 20-400 factor range discussed in the text below.
ind = [1, 2, 4, 8, 20, 60, 100, 200, 300, 400, 500]

for i in ind:
    # Stop once a checkpoint exceeds the number of fitted components.
    if i > len(expl_var):
        break
    print("({}, {})".format(i, np.round(expl_var[i-1], 2)))

 

---------

-  By just taking __(20 to 30)__ latent factors, the explained variance that we could get is __20 %__. 

- To take it to __60%__, we have to take  __almost 400 latent factors__. That is not a fair trade-off.



- It basically is the __gain of variance explained__, if we ___add one additional latent factor to it.___


- By adding latent factors one by one, the __gain in explained variance__ with each addition decreases. (Obviously, because they are sorted that way).
- ___LHS Graph___:
    - __x__ --- ( No of latent factors ),
    - __y__ --- ( The variance explained by taking x latent factors)



- __More decrease in the line (RHS graph)__:
    - We are getting more explained variance than before.
- __Less decrease in that line (RHS graph)__  :
    - We are not benefiting from adding further latent factors. This is what is shown in the plots.


- ___RHS Graph___:
    - __x__ --- ( No of latent factors ),
    - __y__ --- ( Gain in Expl_Var by taking one additional latent factor) 

In [None]:
# Project the original user-movie matrix into the 500-dimensional latent space
# by multiplying it with the transposed SVD components.
start = datetime.now()
svd_basis = netflix_svd.components_.T
trunc_matrix = train_sparse_matrix @ svd_basis
print(datetime.now() - start)

In [None]:
# Inspect the container type and shape of the projected matrix.
type(trunc_matrix), trunc_matrix.shape

* Let's convert this to truncated sparse matrix and store it for future purposes

In [None]:
# Converting trunc_matrix into a sparse matrix (trunc_sparse_matrix) and saving it for future use

start = datetime.now()
# Single path for both the existence check and the save/load calls.
trunc_sparse_path = models_path + "/" + 'trunc_sparse_matrix.npz'

if not os.path.isfile(trunc_sparse_path):

    print("It seems you don't have that file. Computing...")

    # Wrap the projected matrix in CSR format.
    trunc_sparse_matrix = sparse.csr_matrix(trunc_matrix)
    print("Done..")

    # Announce the save before performing it, so the log reflects the real order of work.
    print("Saving it to disk without the need of re-computing it again.. ")
    sparse.save_npz(trunc_sparse_path, trunc_sparse_matrix)

else:
    print("It is there, We will get it.")
    trunc_sparse_matrix = sparse.load_npz(trunc_sparse_path)
    print("Done..")

print("Time taken:", datetime.now()-start)

### 3.3: Computing User-User Similarity matrix (Cosine Similarity) for truncated sparse matrix for train data

In [None]:

# User-user cosine similarity on the truncated matrix (cached as trunc_u_u_sim_matrix.npz).
start = datetime.now()
trunc_u_u_sim_path = models_path + "/" + "trunc_u_u_sim_matrix.npz"

if not os.path.isfile(trunc_u_u_sim_path):
    print("It seems you don't have that file. Computing user_user similarity...")

    # Process every user (compute_for_few=False) and keep the 50 highest similarities each;
    # progress is reported every 2500 users.
    trunc_u_u_sim_matrix, trunc_u_u_time_taken = compute_user_similarity(
        trunc_sparse_matrix, compute_for_few=False, top=50, verbose=True,
        verb_for_n_rows=2500)

    print("-"*50)
    print("Done..")
    print("Saving it to disk without the need of re-computing it again.. ")
    sparse.save_npz(trunc_u_u_sim_path, trunc_u_u_sim_matrix)
    print("Done..")
else:
    print("It is there, We will get it.")
    trunc_u_u_sim_matrix = sparse.load_npz(trunc_u_u_sim_path)
    print("Done ...")

print("It's a ", trunc_u_u_sim_matrix.shape," dimensional matrix")

print("Total time:", datetime.now() - start)

# It's a  (2647889, 2647889)  dimensional matrix
# 0:00:07.772109

### 3.4 User User Cosine Similarity Inference

In [None]:
# Step 1: load the cached cosine-similarity matrix computed on the truncated sparse matrix

trunc_u_u_sim_file = models_path + "/" + "trunc_u_u_sim_matrix.npz"
trunc_u_u_sim_matrix = sparse.load_npz(trunc_u_u_sim_file)

print("Loaded ...")
print("It's a ", trunc_u_u_sim_matrix.shape, " dimensional matrix")

### 3.4.1 Analysing unique user ids present in the trunc_u_u_sim_matrix

In [None]:
# Distinct user ids that appear in the training frame.
train_df_users = train_df.userId.unique() 
train_df_users.shape

In [None]:
# Distinct column indices that hold a non-zero similarity value in trunc_u_u_sim_matrix.
sim_matrix_user_ids = np.unique(trunc_u_u_sim_matrix.nonzero()[1]) # unique user ids, kept for the membership checks below
sim_matrix_user_ids.shape

In [None]:
# User ids present both in the training frame and among the similarity-matrix columns.
user_ids_intersect = np.intersect1d(train_df_users, sim_matrix_user_ids)
user_ids_intersect.shape 

In [None]:
# Sanity check: user_ids_intersect is already a subset of sim_matrix_user_ids,
# so intersecting again returns the same ids.
temp = np.intersect1d(user_ids_intersect, sim_matrix_user_ids)
temp.shape

In [None]:
# Spot-check that one specific user id occurs among the similarity-matrix columns.
if 2248080 in sim_matrix_user_ids:
    print('Its there!!')

### 3.4.2 User User Cosine Similarity Inference

In [None]:
# Finding similar users - number of unique users

# There are far more users than movies, so any user-level similarity lookup takes much more computing time.
user_ids = np.unique(trunc_u_u_sim_matrix.nonzero()[1]) 
user_ids.shape, user_ids[0:10]

In [None]:
# Decide whether the chosen user id occurs among the columns of trunc_u_u_sim_matrix.

# Alternative id to try: input_user_id = 83
input_user_id = 2248080

# Membership test against the unique ids collected from the similarity matrix.
user_is_present = input_user_id in sim_matrix_user_ids
print('Its there!!' if user_is_present else 'Its not there!!')

In [None]:
# Display the membership flag computed above.
user_is_present

In [None]:
# Finding similar users for the given user id

start = datetime.now()
similar_users = dict()

if not user_is_present:
    print('User is not present!!')
else:
    # Rank all users by similarity (highest first) and drop the first entry of the ranking.
    user_sim_row = trunc_u_u_sim_matrix[input_user_id].toarray().ravel()
    ranked_users = user_sim_row.argsort()[::-1]
    sim_users = ranked_users[1:]

    # Keep the five best matches.
    similar_users[input_user_id] = sim_users[:5]

    print(datetime.now() - start)

    print("Users similar to {} are: ".format(input_user_id), similar_users[input_user_id])

In [None]:
# Finding the movies the given user liked most recently


if user_is_present:
    rating_options = [5]   # ratings that count as "liked"
    num_of_movies = 2      # number of movies that we want to find

    # Rows of this user whose rating is in rating_options; the last rows of the frame are taken.
    is_input_user = train_df['userId'] == input_user_id
    is_liked = train_df['rating'].isin(rating_options)
    liked_rows = train_df[is_input_user & is_liked]

    user_last_liked_movies = list(liked_rows.tail(num_of_movies).movieId)

    print(user_last_liked_movies)
else:
    print('User is not present!!')

### 3.5: Computing Movie-Movie Similarity matrix

In [None]:
# Movie-movie cosine similarity, cached on disk as m_m_sim_sparse.npz.
start = datetime.now()
m_m_sim_path = models_path + "/" + "m_m_sim_sparse.npz"

if os.path.isfile(m_m_sim_path):
    print("It is there, We will get it.")
    m_m_sim_sparse = sparse.load_npz(m_m_sim_path)
    print("Done ...")
else:
    print("It seems you don't have that file. Computing movie_movie similarity...")
    start = datetime.now()
    # Transposing puts movies on the rows, so similarities are computed between movies.
    m_m_sim_sparse = cosine_similarity(X=train_sparse_matrix.T, dense_output=False)
    print("Done.. Computed movie movie cosine similarity matrix")

    print("Saving it to disk without the need of re-computing it again.. ")
    sparse.save_npz(m_m_sim_path, m_m_sim_sparse)
    print("Done..")

print("It's a ", m_m_sim_sparse.shape, " dimensional matrix")

print(datetime.now() - start)


# It seems you don't have that file. Computing movie_movie similarity...
# Done.. Computed movie movie cosine similarity matrix
# Saving it to disk without the need of re-computing it again.. 
# Done..
# It's a  (17765, 17765)  dimensional matrix
# 0:00:01.315013

In [None]:
# Shape of the movie-movie similarity matrix.
m_m_sim_sparse.shape

### 3.6 Movie Movie Cosine Similarity Inference

### 3.6.1 Analysing unique movie ids present in the m_m_sim_sparse

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Distinct movie ids that appear in the training frame.
train_df_movies = train_df.movieId.unique()  
train_df_movies.shape

In [None]:
# Distinct column indices that hold a non-zero similarity value in m_m_sim_sparse.
sim_matrix_movie_ids = np.unique(m_m_sim_sparse.nonzero()[1]) # unique movie ids, kept for the membership check below
sim_matrix_movie_ids.shape

In [None]:
# Movie ids present both in the training frame and among the similarity-matrix columns.
movie_ids_intersect = np.intersect1d(train_df_movies, sim_matrix_movie_ids)
movie_ids_intersect.shape

In [None]:
# Decide whether the chosen movie id occurs among the columns of m_m_sim_sparse.
# It is expected to be present, since no SVD truncation was applied for movie ids.

input_movie_id = 4670

# Membership test against the unique movie ids collected from the similarity matrix.
movie_is_present = input_movie_id in sim_matrix_movie_ids
print('Its there!!' if movie_is_present else 'Its not there!!')

In [None]:

# Top similar movies for the single movie id chosen above.
start = datetime.now()

similar_movies = dict()

if not movie_is_present:
    print('Movie is not there!!')
else:
    # Rank all movies by similarity (highest first) and drop the first entry of the ranking.
    movie_sim_row = m_m_sim_sparse[input_movie_id].toarray().ravel()
    sim_movies = movie_sim_row.argsort()[::-1][1:]

    # Keep the 100 best matches.
    similar_movies[input_movie_id] = sim_movies[:100]

    print(similar_movies[input_movie_id])

print("Time taken: ",datetime.now() - start)

In [None]:
# Recommend movies for a user id, starting from the movies that user liked.
# Steps:
# 1. Take the user's last liked movies (user_last_liked_movies, built in an earlier cell)
# 2. For each of them, collect the 10 most similar movies and recommend those

start = datetime.now()

recommended_movies_user_dict = dict()
recommended_movies_user_list = []

for movie in user_last_liked_movies:
    # Rank all movies by similarity to this one (highest first), dropping the first entry.
    ranked_movies = m_m_sim_sparse[movie].toarray().ravel().argsort()[::-1]
    sim_movies = ranked_movies[1:]
    top_ten = sim_movies[:10]

    recommended_movies_user_dict[movie] = top_ten
    recommended_movies_user_list.append(top_ten)

print(datetime.now() - start)


# Flatten the per-movie recommendations into a single list of movie ids.
t = list(recommended_movies_user_dict.values())
flat_list_movie_reco = []
for sublist in t:
    for item in sublist:
        flat_list_movie_reco.append(item)
flat_list_movie_reco

In [None]:
# Attach human-readable titles; the movie details are stored in movie_titles.csv

movie_title_columns = ['movie_id', 'year_of_release', 'title']
movie_titles = pd.read_csv(
    movie_titles_csv_path,
    names=movie_title_columns,
    index_col="movie_id",
    encoding="ISO-8859-1",
)
movie_titles.head()

In [None]:
# Movie Recommendation: print "<id> : <title> (<year>)" for every recommended movie id.

movie_reco_titles = []

for mv_id in flat_list_movie_reco:
    # An id missing from movie_titles would raise KeyError on .loc; report it and move on.
    if mv_id not in movie_titles.index:
        print('{} : [title not found]'.format(mv_id))
        continue

    # Look the row up once and read the columns by name rather than by position.
    movie_row = movie_titles.loc[mv_id]
    movie_title = movie_row['title']
    release_year = movie_row['year_of_release']

    # A missing release year is read as NaN, and int(NaN) raises ValueError.
    movie_year = 'unknown' if pd.isna(release_year) else int(release_year)

    movie_reco_titles.append(movie_title)
    print('{} : {} ({})'.format(mv_id, movie_title, movie_year))

In [None]:

# Top-100 similar movies for every movie id that has a non-zero similarity entry.
start = datetime.now()
similar_movies = dict()

# All unique movie ids (column indices with a non-zero value) to iterate over.
movie_ids = np.unique(m_m_sim_sparse.nonzero()[1])
for movie in movie_ids:
    # Rank all movies by similarity (highest first) and drop the first entry of the ranking.
    ranked_movies = m_m_sim_sparse[movie].toarray().ravel().argsort()[::-1]
    sim_movies = ranked_movies[1:]
    similar_movies[movie] = sim_movies[:100]
print(datetime.now() - start)


similar_movies[input_movie_id]

In [None]:
# Wall-clock time elapsed since `globalstart` was recorded.
print("Total Time taken :",datetime.now()-globalstart) 