In [14]:
import pandas as pd
import numpy as np 
from numpy.linalg import svd

In [38]:
store = pd.read_csv('listcoffee.csv',index_col=0).drop_duplicates()
rating = pd.read_csv('listrating.csv',index_col=0).drop_duplicates(['User_ID','Store_ID'])

In [39]:
store.head()

Unnamed: 0,Store,type,Store_ID
0,NAP's Coffee & Roasters,speedbar,0
178,sangob,speedbar,1
235,SongSarn,slowbar,2
287,LIFE Roasters,"hybrid,bakery",3
542,Nap x Warin,speedbar,4


In [40]:
rating

Unnamed: 0,User_ID,Store_ID,Rating
0,33,0,5
1,34,0,3
2,35,0,5
3,36,0,5
4,37,0,5
...,...,...,...
3348,2266,32,5
3349,2267,32,5
3350,2268,32,5
3351,1831,32,5


In [49]:
mat = rating.pivot(index='User_ID',columns='Store_ID',values='Rating').fillna(0)

In [50]:
u, sigma, v = svd(mat.values)

In [51]:
def cosine_similarity(a, b):
    '''
    This function will calculate the cosine similarity between two vectors
    '''
    return np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))

def get_similarities(mat, id_):
    '''
    This function will use the cosine similarity function to generate a similarity 
    dictionary assocaited to an id the user passes in. The similarity dictionary will
    have the ids as the keys and the similarity in comparison to the user input id as
    the values.
    
    params:
        mat (List -> List) : A 2-D array assocaited to either the user / item matrix
                             after SVD
        id_ (Integer) : The id of the user / item you want to find similarities for.
                        The id must be in the range of the input matrix shape.
    
    returns:
        This function will return the similarity dictionary ordered by the values in 
        descending order.
        
    example:
        mat = np.asarray([
            [2,3,4],
            [6,5,3],
            [5,3,2]
        ])
        id_ = 2
        get_similarities(mat, id_)
    '''
    # create similarity hashmap, keys are ids and values are similarities
    sim_dct = {} 
    for col in range(0, mat.shape[1]):
        sim = cosine_similarity(mat[:,id_], mat[:,col])
        sim_dct[col] = sim
    
    # sort dictionary based on similarities 
    sim_dct = {k: v for k, v in sorted(sim_dct.items(), key=lambda item: item[1], reverse = True)}
    return sim_dct

def recommend(mat, id_, n_recs):
    '''
    This function will get the top n recommendations assocaited to an id.
    
    params:
        mat (List -> List) : A 2-D array assocaited to either the user / item matrix
                             after SVD
        id_ (Integer) : The id of the user / item you want to find similarities for.
                        The id must be in the range of the input matrix shape.
        n_recs (Integer) : The number of recommendations you want.
        
    returns:
        This function will return a list of ids most similar to the input id you passed.
    '''
    sim_dct = get_similarities(mat, id_)
    similar_ids = list(sim_dct.keys())[1:n_recs+1]
    return similar_ids

In [54]:
id_ = 33
n_recs = 10
print(recommend(u, id_, n_recs))


[6, 4, 11, 7, 105, 0, 9, 143, 928, 932]


In [64]:
data.head()

Unnamed: 0,User_ID,Store_ID,Rating
0,33,0,5
1,34,0,3
2,35,0,5
3,36,0,5
4,37,0,5


In [61]:
data = rating
movie_data = store

In [63]:
movie_data.head()

Unnamed: 0,Store,type,Store_ID
0,NAP's Coffee & Roasters,speedbar,0
178,sangob,speedbar,1
235,SongSarn,slowbar,2
287,LIFE Roasters,"hybrid,bakery",3
542,Nap x Warin,speedbar,4


In [65]:
#Importing Libraries
import numpy as np
import pandas as pd


#Reading dataset (MovieLens 1M movie ratings dataset: downloaded from https://grouplens.org/datasets/movielens/1m/)

# data = pd.io.parsers.read_csv('data/ratings.dat', 
#     names=['user_id', 'movie_id', 'rating', 'time'],
#     engine='python', delimiter='::')
# movie_data = pd.io.parsers.read_csv('data/movies.dat',
#     names=['movie_id', 'title', 'genre'],
#     engine='python', delimiter='::')


#Creating the rating matrix (rows as movies, columns as users)
ratings_mat = np.ndarray(
    shape=(np.max(data.Store_ID.values), np.max(data.User_ID.values)),
    dtype=np.uint8)

print(ratings_mat)
ratings_mat[data.Store_ID.values-1, data.User_ID.values-1] = data.Rating.values
print(ratings_mat)

#Normalizing the matrix(subtract mean off)
normalised_mat = ratings_mat - np.asarray([(np.mean(mat.values, 1))]).T

#Computing the Singular Value Decomposition (SVD)
A = normalised_mat.T / np.sqrt(ratings_mat.shape[0] - 1)
U, S, V = np.linalg.svd(A)

#Function to calculate the cosine similarity (sorting by most similar and returning the top N)
def top_cosine_similarity(data, movie_id, top_n=10):
    index = movie_id - 1 # Movie id starts from 1 in the dataset
    movie_row = data[index, :]
    magnitude = np.sqrt(np.einsum('ij, ij -> i', data, data))
    similarity = np.dot(movie_row, data.T) / (magnitude[index] * magnitude)
    sort_indexes = np.argsort(-similarity)
    return sort_indexes[:top_n]

# Function to print top N similar movies
def print_similar_movies(movie_data, movie_id, top_indexes):
    print('Recommendations for {0}: \n'.format(
    movie_data[movie_data.movie_id == movie_id].title.values[0]))
    for id in top_indexes + 1:
        print(movie_data[movie_data.movie_id == id].title.values[0])

#k-principal components to represent movies, movie_id to find recommendations, top_n print n results        
k = 50
movie_id = 10 # (getting an id from movies.dat)
top_n = 10
sliced = V.T[:, :k] # representative data
indexes = top_cosine_similarity(sliced, movie_id, top_n)

#Printing the top N similar movies
print_similar_movies(movie_data, movie_id, indexes)

[[224 246 203 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
[[224 246 203 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   5   5   5]]


ValueError: operands could not be broadcast together with shapes (32,2269) (2237,1) 