In [None]:
#library imports
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from scipy import sparse


In [None]:
# Jovian Commit Essentials
# Please retain and execute this cell without modifying the contents for `jovian.commit` to work
!pip install jovian --upgrade -q
import jovian

[?25l[K     |████▊                           | 10 kB 23.6 MB/s eta 0:00:01[K     |█████████▌                      | 20 kB 5.2 MB/s eta 0:00:01[K     |██████████████▎                 | 30 kB 7.3 MB/s eta 0:00:01[K     |███████████████████             | 40 kB 3.5 MB/s eta 0:00:01[K     |███████████████████████▉        | 51 kB 3.6 MB/s eta 0:00:01[K     |████████████████████████████▋   | 61 kB 4.2 MB/s eta 0:00:01[K     |████████████████████████████████| 68 kB 3.0 MB/s 
[?25h  Building wheel for uuid (setup.py) ... [?25l[?25hdone


**Import google drive**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Dataset

In [None]:
travel_ratings_df = pd.read_csv("/content/drive/MyDrive/GCN_TripRecommendation/datasets/yelp_training_set_review.csv")
travel_ratings_df.shape
print(travel_ratings_df.head())

   Unnamed: 0  business_blank              business_categories business_city  \
0           0           False  Breakfast & Brunch; Restaurants       Phoenix   
1           1           False      Italian; Pizza; Restaurants       Phoenix   
2           2           False      Middle Eastern; Restaurants         Tempe   
3           3           False    Active Life; Dog Parks; Parks    Scottsdale   
4           4           False                Tires; Automotive          Mesa   

                     business_full_address             business_id  \
0        6106 S 32nd St\nPhoenix, AZ 85042  9yKzy9PApeiPPOUJEtnvkg   
1  4848 E Chandler Blvd\nPhoenix, AZ 85044  ZRJwVLyzEJq1VAihDhYiow   
2     1513 E  Apache Blvd\nTempe, AZ 85281  6oRAC4uyJCsJl1X0WZpVSA   
3   5401 N Hayden Rd\nScottsdale, AZ 85250  _1QQZuf4zZOyFCvXc0o6Vg   
4        1357 S Power Road\nMesa, AZ 85206  6ozycU1RpktNG2-1BroVtw   

   business_latitude  business_longitude       business_name  \
0          33.390792         -112.

Getting rid of the ratings that are not present & make the datset to contain only the items that are rated >=4

In [None]:
travel_ratings = travel_ratings_df.loc[travel_ratings_df.stars >= 4].reset_index()[['user_id','business_id','stars']]
print(travel_ratings.shape)
travel_ratings.head()

(156071, 3)


Unnamed: 0,user_id,business_id,stars
0,rLtl8ZkDX5vH5nAx9C3q5Q,9yKzy9PApeiPPOUJEtnvkg,5
1,0a2KyEL0d3Yb1V6aivbIuQ,ZRJwVLyzEJq1VAihDhYiow,5
2,0hT2KtfLiobPvh6cDC8JQg,6oRAC4uyJCsJl1X0WZpVSA,4
3,uZetl9T0NcROGOyFfughhg,_1QQZuf4zZOyFCvXc0o6Vg,5
4,vYmM4KTsC8ZfQBg-j5MWkw,6ozycU1RpktNG2-1BroVtw,5


Create train set, validation set,test set

Train-Valid Split

In [None]:
train_df, valid_df = train_test_split(travel_ratings, test_size=0.1)
valid_df, test_df  = train_test_split(travel_ratings, test_size=0.5)



In [None]:
#resetting indices to avoid indexing errors in the future
train_df = train_df.reset_index()[['user_id', 'business_id', 'stars']]
valid_df = valid_df.reset_index()[['user_id', 'business_id', 'stars']]
test_df  = test_df.reset_index()[['user_id', 'business_id', 'stars']]

Training

**Encoding columns with continuous ids**

Because we'll be using PyTorch's embedding layers to create our user and item embeddings, we need continuous IDs to be able to index into the embedding matrix and access each user/item embedding.

In [None]:
def encode_column(column):
    """ Encodes a pandas column with continous IDs"""
    keys = column.unique()
    key_to_id = {key:idx for idx,key in enumerate(keys)}
    return key_to_id, np.array([key_to_id[x] for x in column]), len(keys)

In [None]:
def encode_df(travel_df):
    """Encodes rating data with continuous user and business ids"""
    
    place_ids, travel_df['business_id'], num_places = encode_column(travel_df['business_id'])
    user_ids, travel_df['user_id'], num_users = encode_column(travel_df['user_id'])
    return travel_df, num_users, num_places, user_ids, place_ids

In [None]:
travel_df, num_users, num_places, user_ids, place_ids = encode_df(train_df)
print("Number of users :", num_users)
print("Number of places :", num_places)
travel_df.head()

Number of users : 35530
Number of places : 10843


Unnamed: 0,user_id,business_id,stars
0,0,0,4
1,1,1,4
2,2,2,4
3,3,3,5
4,4,4,4


In [None]:
valid_df, num_users, num_places, user_ids, place_ids = encode_df(valid_df)
print("Number of users :", num_users)
print("Number of places :", num_places)
valid_df.head()

Number of users : 25593
Number of places : 9632


Unnamed: 0,user_id,business_id,stars
0,0,0,4
1,1,1,5
2,2,2,5
3,3,3,4
4,4,4,4


In [None]:
test_df, num_users, num_places, user_ids, place_ids = encode_df(test_df)
print("Number of users :", num_users)
print("Number of places :", num_places)
test_df.head()

Number of users : 25888
Number of places : 9665


Unnamed: 0,user_id,business_id,stars
0,0,0,5
1,1,1,4
2,2,2,4
3,3,3,4
4,4,4,5


### Initializing user and item embeddings

In [None]:
def create_embeddings(n, K):
    """
    Creates a random numpy matrix of shape n, K with uniform values in (0, 5/K)
    n: number of items/users
    K: number of factors in the embedding 
    """
    return 5*np.random.random((n, K)) / K

In [None]:
5*np.random.random((5, 3)) 

array([[0.23135609, 3.20432752, 2.94379329],
       [4.8681229 , 4.59151085, 2.51373164],
       [2.0197417 , 4.37350756, 3.42413086],
       [1.27267088, 4.94157008, 0.56131774],
       [2.64262649, 2.72453191, 0.09757005]])

Creating Sparse utility matrix

In [None]:
def create_sparse_matrix(df, rows, cols, column_name="stars"):
    """ Returns a sparse utility matrix""" 
    return sparse.csc_matrix((df[column_name].values,(df['user_id'].values, df['business_id'].values)),shape=(rows, cols))

In [None]:
travel_df, num_users, num_places, user_ids, place_ids = encode_df(train_df)
Y = create_sparse_matrix(travel_df, num_users, num_places)

In [None]:
# to view matrix
Y.todense()

matrix([[4, 0, 0, ..., 0, 0, 0],
        [0, 4, 0, ..., 0, 0, 0],
        [0, 0, 4, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

### Making predictions

In [None]:
# Generates predicted values but dot product of user and item embeddings
def predict(df, emb_user, emb_places):
    """ This function computes df["prediction"] without doing (U*V^T).
    
    Computes df["prediction"] by using elementwise multiplication of the corresponding embeddings and then 
    sum to get the prediction u_i*v_j. This avoids creating the dense matrix U*V^T.
    """
    df['prediction'] = np.sum(np.multiply(emb_places[df['business_id']],emb_user[df['user_id']]), axis=1)
    return df

Cost

In [None]:
lmbda = 0.0002

In [None]:
# computes the loss here, subtracting the predicted vs actual values
def cost(df, emb_user, emb_places):
    """ Computes mean square error"""
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_places.shape[0])
    predicted = create_sparse_matrix(predict(df, emb_user, emb_places), emb_user.shape[0], emb_places.shape[0], 'prediction')
    return np.sum((Y-predicted).power(2))/df.shape[0] 

### Gradient Descent

In [None]:
def gradient(df, emb_user, emb_places):
    """ Computes the gradient for user and place embeddings"""
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_places.shape[0])
    predicted = create_sparse_matrix(predict(df, emb_user, emb_places), emb_user.shape[0], emb_places.shape[0], 'prediction')
    delta =(Y-predicted)
    grad_user = (-2/df.shape[0])*(delta*emb_places) + 2*lmbda*emb_user
    grad_places = (-2/df.shape[0])*(delta.T*emb_user) + 2*lmbda*emb_places
    return grad_user, grad_places

In [None]:
import matplotlib.pyplot as plt

In [None]:
# for optimzation we use the gradient descent.
def gradient_descent(df, emb_user, emb_places, iterations=2000, learning_rate=0.01, df_val=None):
    """ 
    Computes gradient descent with momentum (0.9) for given number of iterations.
    emb_user: the trained user embedding
    emb_places: the trained places embedding
    """
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_places.shape[0])
    beta = 0.9
    grad_user, grad_places = gradient(df, emb_user, emb_places)
    v_user = grad_user
    v_places = grad_places
    for i in range(iterations):
        grad_user, grad_places = gradient(df, emb_user, emb_places)
        v_user = beta*v_user + (1-beta)*grad_user
        v_places = beta*v_places + (1-beta)*grad_places
        emb_user = emb_user - learning_rate*v_user
        emb_places = emb_places - learning_rate*v_places
        if(not (i+1)%50):
            print("\niteration", i+1, ":")
            print("train mse:",  cost(df, emb_user, emb_places))
            if df_val is not None:
                print("validation mse:",  cost(df_val, emb_user, emb_places))

    return emb_user, emb_places

In [None]:
# calls the gradient descent prints the appropriate loss for each iteration
emb_user = create_embeddings(num_users, 3)
emb_places = create_embeddings(num_places, 3)
emb_user, emb_places = gradient_descent(travel_df, emb_user, emb_places, iterations=800, learning_rate=1)


iteration 50 :
train mse: 6.0918765304644555

iteration 100 :
train mse: 5.442022831949239

iteration 150 :
train mse: 5.030427297909383

iteration 200 :
train mse: 4.75279488521228

iteration 250 :
train mse: 4.5589889419659615

iteration 300 :
train mse: 4.421243255852848

iteration 350 :
train mse: 4.322884891484207

iteration 400 :
train mse: 4.25330267457884

iteration 450 :
train mse: 4.205408426683128

iteration 500 :
train mse: 4.174268375194656

iteration 550 :
train mse: 4.156327785754057

iteration 600 :
train mse: 4.148950975191657

iteration 650 :
train mse: 4.1501363498021115

iteration 700 :
train mse: 4.158332836700726

iteration 750 :
train mse: 4.1723175289136964

iteration 800 :
train mse: 4.191111707034548


In [None]:
# from matplotlib import pyplot as plt  

In [None]:
# plt.plot(iterations, train_mse, label='train')
# plt.plot(iters, val_losses, label='validation')
# plt.xlabel('Precision')
# plt.ylabel('Recall')
# plt.title('Precision versus Recall')
# plt.show()

In [None]:
train_mse = cost(train_df, emb_user, emb_places)
test_mse = cost(test_df, emb_user, emb_places)
val_mse=cost(valid_df, emb_user, emb_places)
print(train_mse, val_mse,test_mse)

4.191111707034548 7.2416487694618885 7.2056799402598966
