In [1]:
# Imports needed to start a Spark session and create the dataframes
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pandas as pd
from sklearn.model_selection import train_test_split
from pyspark.mllib.recommendation import ALS,MatrixFactorizationModel,Rating
import pickle
#Need to be able to square root the Mean-Squared Error
from math import sqrt

In [2]:
#Load data
df = pd.read_csv('Clean_ratings.csv')

In [3]:
df

Unnamed: 0,reviewer_id,listing_id,rating
0,128182033,2595,4.0
1,115721479,2595,5.0
2,3588281,5121,5.0
3,22508539,5203,5.0
4,44462,6848,5.0
...,...,...,...
1098057,106040507,42156917,4.0
1098058,302696386,42298416,4.0
1098059,30650727,42300468,5.0
1098060,156596817,42312718,5.0


In [4]:
len(df.listing_id.unique())

39829

In [5]:
# Listings that appear 10 times or fewer are kept entirely in the training data,
# so they are set aside here before the train/test split.
counts = df['listing_id'].value_counts()
rare_counts = counts[counts <= 10]
ten_list = df[df['listing_id'].isin(rare_counts.index)]

In [6]:
ten_list

Unnamed: 0,reviewer_id,listing_id,rating
161,6076978,151478,5.0
200,12766552,212544,5.0
314,1557845,403712,4.0
320,166043249,412180,2.0
367,3829107,503585,4.0
...,...,...,...
1098057,106040507,42156917,4.0
1098058,302696386,42298416,4.0
1098059,30650727,42300468,5.0
1098060,156596817,42312718,5.0


In [7]:
# Listings that appear more than 10 times are the ones randomly split into train and test.
frequent_counts = counts[counts > 10]
grt_10_df = df[df['listing_id'].isin(frequent_counts.index)]

In [8]:
# Renumber the filtered rows from 0, discarding the old index.
grt_10_df = grt_10_df.reset_index(drop=True)

In [9]:
grt_10_df

Unnamed: 0,reviewer_id,listing_id,rating
0,128182033,2595,4.0
1,115721479,2595,5.0
2,3588281,5121,5.0
3,22508539,5203,5.0
4,44462,6848,5.0
...,...,...,...
1017670,323549492,41175544,4.0
1017671,128401910,41535260,4.0
1017672,223573887,41535260,4.0
1017673,83602369,41568190,5.0


In [10]:
# Create (or reuse) a local Spark session that uses all available cores.
# The driver settings must be spelled exactly: Spark does not reject unknown keys,
# so a misspelled 'spark.driver.maxResultSize' is simply never applied.
spark = (
    SparkSession.builder
    .master('local[*]')
    .config('spark.driver.maxResultSize', '32g')
    .config("spark.driver.memory", "64g")
    .appName('AirBnB')
    .getOrCreate()
)

In [11]:
#create a spark context
sc = spark.sparkContext

In [12]:
# Split the frequently rated listings' ratings 90% / 10% into training and testing;
# the fixed random_state keeps the split reproducible.
df_train, df_test = train_test_split(
    grt_10_df,
    test_size=0.1,
    random_state=42,
)

In [13]:
# Concatenate the training split with the preserved listings (value counts <= 10),
# renumbering the combined rows from 0.
cons_train = pd.concat([df_train, ten_list], ignore_index=True)

In [14]:
cons_train.shape

(996294, 3)

In [15]:
len(cons_train.listing_id.unique())

39829

In [16]:
df_test.shape

(101768, 3)

In [17]:
# Store the training and testing datasets as separate CSV files (index column not written).
for frame, csv_name in ((cons_train, 'Training.csv'), (df_test, 'Testing.csv')):
    frame.to_csv(csv_name, index=False)

In [18]:
# Load Training.csv into a Spark DataFrame (first line is the header, column types are inferred).
sp_train = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv("Training.csv")
)

In [19]:
sp_train.take(2)

[Row(reviewer_id=3882125, listing_id=3479639, rating=5.0),
 Row(reviewer_id=2690979, listing_id=23273676, rating=4.0)]

In [20]:
#View dataframe
sp_train.show(10)

+-----------+----------+------+
|reviewer_id|listing_id|rating|
+-----------+----------+------+
|    3882125|   3479639|   5.0|
|    2690979|  23273676|   4.0|
|   63631541|     62465|   5.0|
|    3095417|  35183442|   4.0|
|   62432081|  27472111|   5.0|
|  293246596|  38489721|   4.0|
|    6642037|    667502|   2.0|
|  248199756|  28608063|   4.0|
|  287949091|  34749200|   5.0|
|   99372679|  24258565|   5.0|
+-----------+----------+------+
only showing top 10 rows



In [21]:
# Load Testing.csv into a Spark DataFrame (first line is the header, column types are inferred).
sp_test = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv("Testing.csv")
)

In [22]:
#View dataframe
sp_test.show(10)

+-----------+----------+------+
|reviewer_id|listing_id|rating|
+-----------+----------+------+
|   60342127|   9434513|   5.0|
|   93608760|   5390882|   2.0|
|   32500897|   2563611|   4.0|
|   16540824|     50447|   5.0|
|  193316648|  21448949|   4.0|
|   29147966|  13529037|   5.0|
|  110867659|  30246118|   5.0|
|   24093677|   6882689|   4.0|
|    2254917|  13723345|   5.0|
|    3132338|   4226098|   5.0|
+-----------+----------+------+
only showing top 10 rows



In [23]:
# Cache both Spark DataFrames to speed up the repeated passes over them below
# (model training and the two RMSE evaluations).
sp_train.cache()
sp_test.cache()

DataFrame[reviewer_id: int, listing_id: int, rating: double]

In [32]:
sc.setCheckpointDir('checkpoint')

In [33]:
# NOTE(review): this only assigns a plain attribute on the ALS class object. Nothing in this
# notebook reads it and ALS.train below is not given a checkpoint interval, so it looks like
# a no-op for pyspark.mllib - confirm before relying on checkpointing.
ALS.checkpointInterval = 4

In [34]:
# Build the ALS model from the training data.
model = ALS.train(
    ratings=sp_train,   # rows of (userID, productID, rating)
    rank=200,
    iterations=25,
    lambda_=0.015,      # regularization param
    nonnegative=True,
    seed=99,
)

In [36]:
#Save model
model.save(sc,'ALS_model.model')

In [None]:
#Load saved model
#sameModel = MatrixFactorizationModel.load(sc, 'ALS_model.model')

In [37]:
# To compute the training RMSE, predict a rating for every (user, listing) pair
# in the training data and compare it with the original rating.
pred_input = sp_train.rdd.map(lambda row: (row[0], row[1]))
# predictAll returns Rating(user, product, rating) records
pred = model.predictAll(pred_input)

In [38]:
# Performance estimate on the training data.
# Key both the original and the predicted ratings by (user, product) so they can be joined.
true_reorg = sp_train.rdd.map(lambda row: ((row[0], row[1]), row[2]))
pred_reorg = pred.map(lambda rating: ((rating[0], rating[1]), rating[2]))
# Join on the key: each value becomes (original rating, predicted rating)
true_pred = true_reorg.join(pred_reorg)
# Mean squared error over the joined pairs, then its square root
squared_errors = true_pred.map(lambda pair: (pair[1][0] - pair[1][1]) ** 2)
MSE = squared_errors.mean()
RMSE = sqrt(MSE)
print('Training RMSE : ',RMSE)

Training RMSE :  0.006771193199121022


In [39]:
# Test set evaluation: same steps as for the training RMSE, applied to sp_test.
test_input = sp_test.rdd.map(lambda row: (row[0], row[1]))
pred_test = model.predictAll(test_input)
# Key original and predicted ratings by (user, product), then join them
test_reorg = sp_test.rdd.map(lambda row: ((row[0], row[1]), row[2]))
pred_reorg = pred_test.map(lambda rating: ((rating[0], rating[1]), rating[2]))
test_pred = test_reorg.join(pred_reorg)
# Mean squared error over the joined pairs, then its square root
test_squared_errors = test_pred.map(lambda pair: (pair[1][0] - pair[1][1]) ** 2)
test_MSE = test_squared_errors.mean()
test_RMSE = sqrt(test_MSE)
print('Testing error : ',test_RMSE)

Testing error :  1.4134078954522193


In [40]:
# For User Y Find N Products to Promote
rec = model.recommendProducts(6076978,5)

In [41]:
rec

[Rating(user=6076978, product=151478, rating=4.9974147226480214),
 Rating(user=6076978, product=1175400, rating=4.533415312066755),
 Rating(user=6076978, product=841075, rating=4.5230501901090285),
 Rating(user=6076978, product=13495558, rating=4.514859995477027),
 Rating(user=6076978, product=34595255, rating=4.514848857893221)]

### Building a hybrid recommendation


In [42]:
# Load the TF-IDF similarity results computed for every listing by the content-based
# recommender. As used further down, res[listing_id] is a sequence of entries whose
# first element is a similarity score and whose second element is a listing id.
# NOTE(review): pickle.load can execute arbitrary code - only load results_tf.pkl if it
# comes from a trusted source.
with open('results_tf.pkl', 'rb') as f:
    res = pickle.load(f)

In [43]:
#Load listings dataset
pdf1 = pd.read_csv('Clean_listings.csv')

In [44]:
pdf1.head()

Unnamed: 0,id,content,neighbourhood_group_cleansed,latitude,longitude,bathrooms,bedrooms,room_type,price,minimum_nights,number_of_reviews,review_scores_rating
0,2060,"Modern NYC // Lovely, spacious, sunny 1 BR apa...",Manhattan,40.85722,-73.9379,1.0,1.0,Private room,100.0,1,1,80.0
1,2595,"Skylit Midtown Castle // Beautiful, spacious s...",Manhattan,40.75362,-73.98377,1.0,0.0,Entire home/apt,225.0,7,48,94.0
2,3831,Cozy Entire Floor of Brownstone // Urban retre...,Brooklyn,40.68514,-73.95976,1.0,1.0,Entire home/apt,89.0,1,313,90.0
3,5099,Large Cozy 1 BR Apartment In Midtown East // M...,Manhattan,40.74767,-73.975,1.0,1.0,Entire home/apt,200.0,3,78,90.0
4,5114,"Super Room in Great area. // First, the area i...",Manhattan,40.7191,-73.9904,1.0,1.0,Private room,94.0,1,75,94.0


In [45]:
#Function to get an understandable listing name and description from the content field, given an item ID
def item(id):
    """Return a printable name and short description for a listing.

    The listing's ``content`` value in ``pdf1`` is expected to look like
    ``'<name> // <description>'``. The result is the name, followed by
    ``' \\nDescription: '``, the first 165 characters of the description and ``'...'``.

    Parameters
    ----------
    id : value compared against ``pdf1['id']`` to find the listing.

    Returns
    -------
    str
        ``'<name> \\nDescription: <description[:165]>...'``. If the content has no
        ``' // '`` separator the description part is empty.

    Raises
    ------
    IndexError
        If no row of ``pdf1`` has this id.
    """
    # Look the listing up once instead of once per field.
    matches = pdf1.loc[pdf1['id'] == id, 'content']
    if matches.empty:
        raise IndexError('Listing id ' + str(id) + ' not found in listings data')

    parts = matches.iloc[0].split(' // ')
    name = parts[0]
    # Content without a ' // ' separator has no description segment.
    description = parts[1][0:165] if len(parts) > 1 else ''
    desc = ' \nDescription: ' + description + '...'
    return name + desc

def con_based(item_id,num):
    """Print the first ``num`` entries stored in ``res`` for ``item_id``.

    Each entry is read as (similarity score, listing id); the listing's name and
    description come from ``item``. Nothing is returned.
    """
    for entry in res[item_id][:num]:
        score, listing_id = entry[0], entry[1]
        message = ''.join([
            '\nRecommended: ',
            item(listing_id),
            '\n(similarity score : ',
            str(score),
            ')',
            ' Listing id :',
            str(listing_id),
            '\n',
        ])
        print(message)

#Function to read the results out of the dictionary
def recommend(userid, item_id, num):
    """Print up to ``num`` listing recommendations for ``userid``.

    Two modes, selected by ``item_id``:

    * ``item_id is None`` - collaborative filtering only: the listings returned by
      ``model.recommendProducts(userid, num)`` are printed with their predicted rating.
    * otherwise - hybrid: the first ``num`` entries of ``res[item_id]`` (read as
      (similarity score, listing id)) are the candidates.

      - If ``userid`` has no row in ``cons_train`` the candidates are printed
        unchanged through ``con_based``.
      - Otherwise the candidates are printed in order of ``model.predict`` rating,
        highest first. A candidate whose listing id has no row in ``cons_train`` is
        given a predicted rating of 0 and printed as a new listing.

    Parameters
    ----------
    userid : reviewer id, compared against ``cons_train['reviewer_id']``.
    item_id : listing id used as the similarity anchor, or ``None``.
    num : maximum number of recommendations to print.

    Returns
    -------
    None. All results are printed.
    """
    if item_id is None:
        #Collaborative filtering
        print('Recommending based on your rating history... \n')
        # One model call is enough; iterating over its result also copes with the
        # model returning fewer than ``num`` recommendations.
        for rec in model.recommendProducts(userid, num):
            print(item(rec[1]) + '\n(Predicted rating : ' + str(rec[2]) + ')\n' + ' Listing id :' + str(rec[1]) + '\n')
        return

    print('Recommending ' + str(num) + ' listings similar to ' + item(item_id))
    print('---')
    recs = res[item_id][:num]

    # Compare ids exactly. A substring test on the stringified ids would treat
    # user 123 as known whenever some reviewer id merely contains "123".
    if not (cons_train['reviewer_id'] == userid).any():
        print('You are a new user, recommending based on your listing_id preference..')
        con_based(item_id, num)
        return

    # Listing ids that have at least one rating in the training data (built once).
    rated_listings = set(cons_train['listing_id'].unique())

    # One record per candidate: (predicted rating, similarity, listing id, is_new).
    candidates = []
    for rec in recs:
        similarity, listing_id = rec[0], rec[1]
        is_new = listing_id not in rated_listings
        # Listings without training ratings get 0 instead of a model prediction.
        predicted = 0 if is_new else model.predict(userid, listing_id)
        candidates.append((predicted, similarity, listing_id, is_new))

    # sorted() is stable, so candidates with equal predictions keep their similarity order.
    for predicted, similarity, listing_id, is_new in sorted(candidates, key=lambda c: c[0], reverse=True):
        if is_new:
            # Show the new listing itself in bold, then reset the terminal style.
            print('\033[1m' + '\nThis is a new listing, Check it out!')
            print('\nRecommended: ' + item(listing_id) + '\n(similarity score : ' + str(similarity) + ')' + ' Listing id :' + str(listing_id) + '\n' + '\033[0m')
        else:
            print('\nRecommended based on your preferences: ' + item(listing_id) + '\n(similarity score : ' + str(similarity) + ')' + '\nPredicted_rating : ' + str(predicted) + ' Listing id :' + str(listing_id) + '\n')

### Let's recommend!

In [46]:
#Recommend n listings for user Id 51235419, listing_id 12223159
recommend(51235419,12223159, 5)

Recommending 5 listings similar to Bright Modern Greenwich Village 1BR 
Description: Bright, newly renovated 1BR/1BA in Greenwich village, steps away from Washington Square Park. The apartment features all of the charm of a pre-war building with a mo...
---

Recommended based on your preferences: Entire West Village Apartment 
Description: Chic one bedroom located in the heart of West Village.  One block away from IFC Center, Cafe Dante downstairs- a true Downtown New York experience.  Two block walk t...
(similarity score : 0.21152255450985494)
Predicted_rating : 4.272400051976689 Listing id :26080646


Recommended based on your preferences: Quiet 1-Bedroom in Greenwich Village 
Description: 1-bedroom in Greenwich Village. Just a block away from Washington Square park, tons of bars and restaurants, and easy access to 8 different subway lines within just ...
(similarity score : 0.2228586291257114)
Predicted_rating : 3.1529778109847633 Listing id :18911633


Recommended based on your pr

In [47]:
#Recommend n listings for user Id 3132338, listing_id 12223159
recommend(3132338,12223159, 5)

Recommending 5 listings similar to Bright Modern Greenwich Village 1BR 
Description: Bright, newly renovated 1BR/1BA in Greenwich village, steps away from Washington Square Park. The apartment features all of the charm of a pre-war building with a mo...
---
You are a new user, recommending based on your listing_id preference..

Recommended: Idyllic Greenwich Village Apartment 
Description: Charming, cozy apartment in the heart of the best neighborhood in Manhattan.  High ceilings, hardwood floors and an abundance of natural light.  Steps from the West ...
(similarity score : 0.25230335851478636) Listing id :9797510


Recommended: Quiet 1-Bedroom in Greenwich Village 
Description: 1-bedroom in Greenwich Village. Just a block away from Washington Square park, tons of bars and restaurants, and easy access to 8 different subway lines within just ...
(similarity score : 0.2228586291257114) Listing id :18911633


Recommended: Lovely room in Greenwich Village 
Description: 1 Room in a 2-Bedro

In [48]:
recommend(51235419,None, 5)

Recommending based on your rating history... 

Gorgeous Apt in Prime Williamsburg 
Description: Cozy and romantic apartment with high ceilings situated between two subway lines and just one block from away from the beautiful Williamsburg bridge and a few moment...
(Predicted rating : 5.199794132957075)
 Listing id :1325759

Bright apartment in Soho with roof access 
Description: Bright and charming apartment in a top floor (6th floor walk up) apartment full of light located in West Soho, with lots of windows. The large bedroom has great ligh...
(Predicted rating : 5.197042408057872)
 Listing id :13495558

LARGE room* light filled loft *PRIME Williamsburg* 
Description: Our spacious apartment comes with everything you need, including AC, 9 functioning windows (including one in the bathroom), a well equipped kitchen and rooftop with ...
(Predicted rating : 5.1871801040107925)
 Listing id :1175400

Upscale Historic District Apt.  Near  Central Park 
Description: For the traveler who wants

#### Summary :
##### 1. Notice that for the same listing id, the recommendations differ by reviewer id because of the hybrid approach
##### 2. Notice that even a new listing without any ratings is recommended, based on item similarity
##### 3. Notice that for a new user without any rating history, listings are recommended based on item similarity