In [221]:
import numpy as np
import pandas as pd
import os

os.chdir("C:/PHD")
%run Library_list.ipynb

# Model-Based Collaborative Filtering

Model-based Collaborative Filtering is based on matrix factorization (MF) which has received greater exposure, mainly as an unsupervised learning method for latent variable decomposition and dimensionality reduction. Matrix factorization is widely used for recommender systems where it can deal better with scalability and sparsity than Memory-based CF:

The goal of MF is to learn the latent preferences of users and the latent attributes of items from known ratings (learn features that describe the characteristics of ratings) to then predict the unknown ratings through the dot product of the latent features of users and items.

When you have a very sparse matrix, with a lot of dimensions, by doing matrix factorization, we can restructure the user-item matrix into low-rank structure, and you can represent the matrix by the multiplication of two low-rank matrices, where the rows contain the latent vector.

You fit this matrix to approximate your original matrix, as closely as possible, by multiplying the low-rank matrices together, which fills in the entries missing in the original matrix.

In [209]:
userData = pd.read_csv('user_hotel_rating-1555730075105.csv')
userData.head(1)

Unnamed: 0,userid,Hotelid,OverallRating
0,user_78131,hotel_558,3


In [210]:
##Slicing userid and converting into int type

userData['userid'] = userData['userid'].str.slice(5,10)
userData['userid'] = userData['userid'].astype(int)

In [250]:
##Pivoting Userdata with index as UserId and Columns as Hotels.

Ratings = userData.pivot(index = 'Hotelid', columns ='userid', values = 'OverallRating').fillna(0)
Ratings.head()

userid,21002,21025,21042,21051,21054,21058,21062,21073,21101,21114,...,79950,79957,79970,80019,80020,80060,80071,80089,80090,80123
Hotelid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
hotel_501,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hotel_502,3.0,0.0,0.0,2.0,1.0,0.0,0.0,2.0,2.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0
hotel_503,0.0,0.0,0.0,5.0,0.0,3.0,5.0,0.0,5.0,0.0,...,0.0,1.0,4.0,0.0,2.0,0.0,4.0,0.0,2.0,0.0
hotel_504,3.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,5.0,...,0.0,1.0,4.0,0.0,4.0,0.0,0.0,0.0,4.0,4.0
hotel_506,3.0,0.0,4.0,5.0,0.0,3.0,4.0,0.0,4.0,5.0,...,4.0,2.0,0.0,3.0,0.0,4.0,0.0,3.0,2.0,0.0


In [251]:
#de-normalize the data (normalize by each users mean) and convert it from a dataframe to a numpy array.

R = Ratings.as_matrix()
user_ratings_mean = np.mean(R, axis = 1)
Ratings_demeaned = R - user_ratings_mean.reshape(-1, 1)
Ratings_demeaned
# R

  This is separate from the ipykernel package so we can avoid doing imports until


array([[-0.03373253, -0.03373253, -0.03373253, ..., -0.03373253,
        -0.03373253, -0.03373253],
       [ 1.87984032, -1.12015968, -1.12015968, ..., -1.12015968,
         1.87984032,  1.87984032],
       [-1.4756487 , -1.4756487 , -1.4756487 , ..., -1.4756487 ,
         0.5243513 , -1.4756487 ],
       ...,
       [ 1.36207585, -1.63792415, -1.63792415, ..., -1.63792415,
         0.36207585,  2.36207585],
       [-1.49720559,  1.50279441,  0.50279441, ...,  1.50279441,
        -1.49720559, -1.49720559],
       [-1.53772455,  1.46227545, -1.53772455, ..., -1.53772455,
         0.46227545, -1.53772455]])

In [252]:
n_users = userData.userid.unique().shape[0]
n_hotels = userData.Hotelid.unique().shape[0]
print('Number of users = ' + str(n_users) + ' | Number of Hotels = ' + str(n_hotels))

#Checking for Sparsity of userData with userid and Hotelid 
sparsity = round(1.0 - len(userData) / float(n_users * n_hotels), 3)
print('The sparsity level of user_hotel_ratings dataset is ' +  str(sparsity * 100) + '%')

Number of users = 5010 | Number of Hotels = 130
The sparsity level of user_hotel_ratings dataset is 58.199999999999996%


# Singular Vector Decomposition

-A well-known matrix factorization method is Singular value decomposition (SVD) in which single matrix (say matrix A) is decomposed into U,Σ,V^T , 'U' and V^T are the unitary Matrices and Σ is the Diagonal Matrix

-In our care U is 'userid', Σ is the weights of instances, V^T is 'Hotelid'. 

-Through this method we can achieve dimensionalty reduction to find the Principal component.


In [300]:
#Setting Up SVD with Unitary matrices and diagonal Matrix

from scipy.sparse.linalg import svds
U, sigma, Vt = svds(Ratings_demeaned, k = 129)
Ratings_demeaned.shape

(130, 5010)

In [301]:
#Getting Diagonal Matrix from above 

sigma = np.diag(sigma)
sigma

array([[ 14.2555376 ,   0.        ,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       [  0.        ,  14.71655308,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       [  0.        ,   0.        ,  15.42089162, ...,   0.        ,
          0.        ,   0.        ],
       ...,
       [  0.        ,   0.        ,   0.        , ..., 208.04984856,
          0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
        235.32856414,   0.        ],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
          0.        , 268.20529316]])

In [302]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
all_user_predicted_ratings

array([[ 1.17903001e-01, -2.07852412e-03, -1.90985365e-04, ...,
        -1.15284615e-04, -3.66969066e-03, -3.37768959e-03],
       [ 3.00063386e+00, -1.11744293e-05, -1.02676339e-06, ...,
        -6.19785824e-07,  2.99998027e+00,  2.99998184e+00],
       [-6.95208199e-04,  1.22558968e-05,  1.12613412e-06, ...,
         6.79769038e-07,  2.00002164e+00,  1.99163505e-05],
       ...,
       [ 2.99972493e+00,  4.84930886e-06,  4.45579156e-07, ...,
         2.68965223e-07,  2.00000856e+00,  4.00000788e+00],
       [ 8.56868782e-04,  2.99998489e+00,  1.99999861e+00, ...,
         2.99999916e+00, -2.66697483e-05, -2.45476089e-05],
       [ 1.18068111e-03,  2.99997919e+00, -1.91252818e-06, ...,
        -1.15446058e-06,  1.99996325e+00, -3.38241965e-05]])

In [303]:
Ratings.columns
preds = pd.DataFrame(all_user_predicted_ratings, columns = Ratings.columns)


In [304]:
userData.dtypes

originalRatings = userData['OverallRating']

In [305]:
pd.unique(userData['userid'])

array([78131, 21002, 24128, ..., 60292, 61810, 68476], dtype=int64)

In [306]:
def recommend_hotels(predictions, userData, userID, num_recommendations):
    
    # Get and sort the user's predictions
    user_row_number = userID-1# User ID starts at 1, not 0
    sorted_user_predictions = preds.iloc[user_row_number].sort_values(ascending=False) # User ID starts at 1
    # Get the user's data and merge in the Hotel information.
    
    user_data = userData[userData['userid'] == (userID)]

    print('User {0} has already rated {1} Hotels'.format(userID, user_data.shape[0]))
    print('Recommending highest {0} predicted ratings Hotels not already rated'.format(num_recommendations))
    
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    recommendations = (userData[userData['userid'].isin(userData['userid'])].
         merge(pd.DataFrame(sorted_user_predictions).reset_index()).
         rename(columns = {user_row_number: 'Predictions'}).
         sort_values('Predictions', ascending = False).
                       iloc[:num_recommendations, :-1]
                      )

    return userID, recommendations

In [307]:
userID, recommendations = recommend_hotels(preds, userData,129, 10)

User 129 has already rated 0 Hotels
Recommending highest 10 predicted ratings Hotels not already rated


In [308]:
recommendations

# pd.unique(recommendations['Hotelid'])

Unnamed: 0,userid,Hotelid,OverallRating
3172,50775,hotel_564,5
3186,50775,hotel_637,5
3175,50775,hotel_629,5
3176,50775,hotel_516,5
3177,50775,hotel_583,5
3178,50775,hotel_566,5
3179,50775,hotel_560,5
3180,50775,hotel_519,5
3181,50775,hotel_572,5
3182,50775,hotel_563,5


In [194]:
!pip install scikit-surprise

Collecting scikit-surprise
  Using cached https://files.pythonhosted.org/packages/4d/fc/cd4210b247d1dca421c25994740cbbf03c5e980e31881f10eaddf45fdab0/scikit-surprise-1.0.6.tar.gz
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py): started
  Building wheel for scikit-surprise (setup.py): finished with status 'done'
  Stored in directory: C:\Users\CBPLDEV11\AppData\Local\pip\Cache\wheels\ec\c0\55\3a28eab06b53c220015063ebbdb81213cd3dcbb72c088251ec
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.0.6


You are using pip version 19.0.1, however version 19.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [309]:
# Import libraries from Surprise package
from surprise import Reader, Dataset, SVD, evaluate

# Load Reader library
reader = Reader()

# Load ratings dataset with Dataset library
data = Dataset.load_from_df(userData[['userid', 'Hotelid', 'OverallRating']], reader)

# Split the dataset for 5-fold evaluation
data.split(n_folds=5)

In [310]:
# Use the SVD algorithm.
svd = SVD()

In [311]:
from surprise.model_selection import cross_validate
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=2, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 2 split(s).

                  Fold 1  Fold 2  Mean    Std     
RMSE (testset)    0.8559  0.8556  0.8557  0.0001  
MAE (testset)     0.6584  0.6574  0.6579  0.0005  
Fit time          18.63   17.54   18.08   0.55    
Test time         2.09    9.62    5.86    3.76    


{'test_rmse': array([0.85585583, 0.85557445]),
 'test_mae': array([0.65838771, 0.6574403 ]),
 'fit_time': (18.629802227020264, 17.53909683227539),
 'test_time': (2.094073534011841, 9.618374586105347)}

In [275]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x158a29f0ac8>

In [281]:
testset = trainset.build_anti_testset()

In [284]:
predictions = svd.test(testset)

predictions[0:4]

[Prediction(uid=47178, iid='hotel_587', r_ui=3.2571129130123007, est=2.9285255233184637, details={'was_impossible': False}),
 Prediction(uid=47178, iid='hotel_640', r_ui=3.2571129130123007, est=2.7380313857998813, details={'was_impossible': False}),
 Prediction(uid=47178, iid='hotel_577', r_ui=3.2571129130123007, est=3.1377960059118215, details={'was_impossible': False}),
 Prediction(uid=47178, iid='hotel_506', r_ui=3.2571129130123007, est=3.2743677912197615, details={'was_impossible': False})]

# Conclusion:

The observation which I got to learn from the Dataset was,

-The dataset had only ratings which is not enough to determine whether the business is going to be successful or not.

-The dataset should have had demographics details of the existing Hotels, so that we can guess which Hotel is the competitor to what Hotel.

-Since the Hotel data is masked we do not know to what brand it belongs to, if branding is provided it would be easier for a new user to chose the hotel right away if only he/she is familiar with branding.

-If location was present in columns the recommendation engine would also recommend the cynic places for people who are new to the city.

-If seasonality was included in user_ratings dataset, our recommendation system would recommend hotels which have promotions during the season which ccould improve the occupancy of hotels.

-Location specific search is also possible if locations of hotels are included.