In [None]:
from scipy.sparse import csr_matrix

def create_X(df):
    """
    Generates a sparse user-by-business ratings matrix from a ratings dataframe.

    Args:
        df: pandas dataframe containing 3 columns (userId, businessId, rating)

    Returns:
        A 5-tuple, in this order:
        X: scipy CSR matrix of shape (n_unique_users, n_unique_businesses)
            whose entries are the ratings
        user_mapper: dict that maps user id's to user indices (rows of X)
        business_mapper: dict that maps business id's to business indices (columns of X)
        user_inv_mapper: dict that maps user indices to user id's
        business_inv_mapper: dict that maps business indices to business id's

    Note:
        Indices follow the sorted order of the unique ids. When the same
        (userId, businessId) pair occurs more than once, scipy adds the
        duplicate entries together while building the matrix.
    """
    # One pass per id column: the sorted unique ids plus, for every row of df,
    # the position of its id inside that sorted array (i.e. its matrix index).
    user_ids, user_index = np.unique(df["userId"].to_numpy(), return_inverse=True)
    business_ids, item_index = np.unique(df["businessId"].to_numpy(), return_inverse=True)

    # Sizes come from the same arrays as the indices, so they can never disagree.
    M = len(user_ids)
    N = len(business_ids)

    user_mapper = dict(zip(user_ids, range(M)))
    business_mapper = dict(zip(business_ids, range(N)))

    user_inv_mapper = dict(zip(range(M), user_ids))
    business_inv_mapper = dict(zip(range(N), business_ids))

    X = csr_matrix((df["rating"].to_numpy(), (user_index, item_index)), shape=(M, N))

    return X, user_mapper, business_mapper, user_inv_mapper, business_inv_mapper

# Build the utility matrix and the id <-> index lookup tables from the `ratings` dataframe.
X, user_mapper, business_mapper, user_inv_mapper, business_inv_mapper = create_X(ratings)
# create_X builds X with one row per unique user and one column per unique business.
X.shape

In [None]:
# Total number of cells in the user x business matrix.
n_total = X.shape[0] * X.shape[1]
# Number of stored entries, i.e. cells that actually hold a rating.
n_ratings = X.nnz
# NOTE(review): this ratio is the share of cells that are filled (the matrix
# density); the share of empty cells would be 1 minus this value.
sparsity = n_ratings / n_total
print(f"Matrix sparsity: {round(sparsity*100,2)}%")

In [None]:
# X is a CSR matrix, so consecutive indptr entries delimit one row each: their
# differences are the number of stored ratings per row, i.e. per user.
n_ratings_per_user = np.diff(X.indptr)
# Bare expression: Jupyter only displays it when it is the last line of a cell.
len(n_ratings_per_user)

print(f"Most active user rated {n_ratings_per_user.max()} businesses.")
print(f"Least active user rated {n_ratings_per_user.min()} businesses.")

# After conversion to CSC, indptr delimits columns instead, which gives the
# number of stored ratings per column, i.e. per business.
n_ratings_per_business = np.diff(X.tocsc().indptr)
# Bare expression: Jupyter only displays it when it is the last line of a cell.
len(n_ratings_per_business)

print(f"Most rated business has {n_ratings_per_business.max()} ratings.")
print(f"Least rated business has {n_ratings_per_business.min()} ratings.")

# Side-by-side density plots of how many ratings each user gave (left) and
# how many ratings each business received (right).
plt.figure(figsize=(16,4))
plt.subplot(1,2,1)
# `fill=True` is the supported spelling of the deprecated `shade=True`.
sns.kdeplot(n_ratings_per_user, fill=True)
# Counts cannot be negative, so clip the smoothed curve's left tail at 0.
plt.xlim(0)
plt.title("Number of Ratings Per User", fontsize=14)
plt.xlabel("number of ratings per user")
plt.ylabel("density")
plt.subplot(1,2,2)
sns.kdeplot(n_ratings_per_business, fill=True)
plt.xlim(0)
plt.title("Number of Ratings Per Business", fontsize=14)
plt.xlabel("number of ratings per business")
plt.ylabel("density")
plt.show()

In [None]:
from sklearn.neighbors import NearestNeighbors

def find_similar_businesses(business_id, X, business_mapper, business_inv_mapper, k, metric='cosine'):
    """
    Finds k-nearest neighbours for a given business id.

    Args:
        business_id: id of the business of interest
        X: user-item utility matrix (one column per business); it is
            transposed internally so that every business becomes one sample
        business_mapper: dict that maps business id's to business indices
        business_inv_mapper: dict that maps business indices to business id's
        k: number of similar businesses to retrieve
        metric: distance metric for kNN calculations

    Returns:
        list of similar business ID's, nearest first. The business of interest
        is never part of the list. The list is shorter than k when X holds
        fewer than k+1 businesses.

    Raises:
        KeyError: if business_id is not a key of business_mapper
    """
    X = X.T

    business_ind = business_mapper[business_id]
    business_vec = X[business_ind]
    if isinstance(business_vec, np.ndarray):
        # A row taken from a dense array is 1-D; the kNN query must be 2-D.
        business_vec = business_vec.reshape(1, -1)
    # Ask for k+1 because the business of interest is normally among its own
    # neighbours, but never for more neighbours than there are businesses.
    n_neighbors = min(k + 1, X.shape[0])
    kNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", metric=metric)
    kNN.fit(X)
    neighbour = kNN.kneighbors(business_vec, return_distance=False)
    # Remove the query by index, not by position: when other businesses are at
    # the same distance (e.g. identical rating vectors) it need not come first.
    neighbour_inds = [int(n) for n in neighbour.ravel() if n != business_ind]
    return [business_inv_mapper[n] for n in neighbour_inds[:k]]

In [None]:
# Ten nearest neighbours of the business with id 1, using the default cosine metric.
similar_businesses = find_similar_businesses(1, X, business_mapper, business_inv_mapper, k=10)
similar_businesses

In [None]:
# Lookup table businessId -> title, built from the `businesses` dataframe,
# so that the returned ids can be printed as readable names.
business_titles = dict(zip(businesses['businessId'], businesses['title']))

business_id = 1

# Ten nearest neighbours of the chosen business under cosine distance.
similar_businesses = find_similar_businesses(business_id, X, business_mapper, business_inv_mapper, metric='cosine', k=10)
business_title = business_titles[business_id]

print(f"Because you liked {business_title}:")
for i in similar_businesses:
    print(business_titles[i])

In [None]:
business_id = 1

# Ten nearest neighbours of the chosen business, this time under euclidean distance.
similar_businesses = find_similar_businesses(business_id, X, business_mapper, business_inv_mapper, metric='euclidean', k=10)
# Relies on the `business_titles` lookup already existing in the notebook namespace.
business_title = business_titles[business_id]

print(f"Because you liked {business_title}:")
for i in similar_businesses:
    print(business_titles[i])


In [None]:
n_businesses = businesses['businessId'].nunique()
print(f"There are {n_businesses} unique businesses in our dataset.")

# Split every 'categories' string once; each row becomes a list of category names.
# Assumes every 'categories' value is a string - TODO confirm there are no missing values.
category_lists = businesses['categories'].apply(lambda x: x.split(', '))
categories = set(cat for sublist in category_lists.tolist() for cat in sublist)

# One 0/1 indicator column per category. Membership is tested against the split
# list rather than the raw string, so a category whose name is contained in
# another category's name is not matched by accident. Iterating in sorted order
# keeps the column order the same from run to run.
for c in sorted(categories):
    businesses[c] = category_lists.apply(lambda cats: 1 if c in cats else 0)

# Keep everything except the identifying/text columns as the feature matrix.
business_categories = businesses.drop(columns=['businessId', 'title', 'categories'])

business_categories.head()

from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows of business_categories; entry
# [i, j] compares the business in row i with the business in row j.
cosine_sim = cosine_similarity(business_categories, business_categories)
print(f"Dimensions of our categories cosine similarity matrix: {cosine_sim.shape}")


In [None]:
from fuzzywuzzy import process

def business_finder(title):
    """
    Returns the business title that best matches a free-text query.

    Args:
        title: (possibly misspelled or partial) title to search for

    Returns:
        the entry of businesses['title'] that the fuzzy matcher ranks highest
    """
    candidates = businesses['title'].tolist()
    # extractOne yields the best candidate together with its score;
    # only the candidate itself (first element) is of interest here.
    best = process.extractOne(title, candidates)
    return best[0]

# Let's test this out with the query 'americano'.
title = business_finder('americano')
title


In [None]:
# Map every title to the row *position* of its business. The value is used to
# index cosine_sim and with .iloc, both of which are positional, so positions
# stay correct even when the dataframe index is not 0..n-1.
business_idx = dict(zip(businesses['title'], range(len(businesses))))
idx = business_idx[title]
print(f"Business index for {title}: {idx}")
Using this handy business_idx dictionary, we can look up the index that represents our matched business in the similarity matrix. Let's get the top 10 most similar businesses to it.
n_recommendations = 10
# Pair every business position with its similarity to the query business and
# leave the query itself out explicitly: businesses with identical categories
# tie with it at the top, so it is not guaranteed to be the first entry.
sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
# Most similar first; sorted() is stable, so ties keep their original row order.
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
sim_scores = sim_scores[:n_recommendations]

similar_businesses = [i[0] for i in sim_scores]

print(f"Because you liked {title}:")
print(businesses['title'].iloc[similar_businesses])


In [None]:
def get_content_based_recommendations(title_string, n_recommendations=10):
    """
    Prints the businesses whose categories are most similar to a given business.

    Args:
        title_string: free-text title; it is resolved to the closest known
            title with business_finder
        n_recommendations: number of similar businesses to print

    Returns:
        None; the matched title and the recommended titles are printed.
    """
    title = business_finder(title_string)
    idx = business_idx[title]
    # Leave the query business out explicitly: businesses with identical
    # categories tie with it at the top, so it need not be the first entry.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    # Most similar first; sorted() is stable, so ties keep their row order.
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    similar_businesses = [i[0] for i in sim_scores[:n_recommendations]]
    print(f"Because you liked {title}:")
    print(businesses['title'].iloc[similar_businesses])


In [None]:
from sklearn.decomposition import TruncatedSVD

# X is user x business, so factorise its transpose: every business then gets
# one row of latent features. The fixed seed makes the randomized solver
# return the same factors on every run.
svd = TruncatedSVD(n_components=20, n_iter=10, random_state=42)
Q = svd.fit_transform(X.T)
Q.shape
# Now, let's use the reduced dimensions to find similar businesses:
business_id = 1
# find_similar_businesses transposes its matrix argument so that businesses
# become rows; pass Q.T (latent features x businesses) to end up with Q again.
similar_businesses = find_similar_businesses(business_id, Q.T, business_mapper, business_inv_mapper, metric='cosine', k=10)
business_title = business_titles[business_id]

print(f"Because you liked {business_title}:")
for i in similar_businesses:
    print(business_titles[i])


$$X_{mn} \approx P_{mk} \times Q_{nk}^T = \hat{X}$$