In [1]:
from pyspark.context import SparkContext
import json
from datetime import datetime
import pytz
from xgboost import XGBRegressor
import time
import ast
from datetime import datetime
import pickle
import numpy as np
import decimal
from keras.models import Sequential
from keras.layers import Dense

Using Theano backend.


In [2]:
sc = SparkContext()
sc.setLogLevel('ERROR')

In [3]:
FOLDER_PATH = '/Users/veersingh/Desktop/competition_files/'
SAVE_MODEL_PATH = '/Users/veersingh/Desktop/Recommendation-System-to-predict-Yelp-ratings/NN_few_feats/model_nn_few.h5'

TRAIN_FILE_PATH = FOLDER_PATH + 'yelp_train.csv'
BUSINESS_FILE_PATH = FOLDER_PATH + 'business.json'
CHECKIN_FILE_PATH = FOLDER_PATH + 'checkin.json'
PHOTO_FILE_PATH = FOLDER_PATH + 'photo.json'
TIP_FILE_PATH = FOLDER_PATH + 'tip.json'
USER_FILE_PATH = FOLDER_PATH + 'user.json'

# Train data

In [4]:
# Read in the training dataset. Remove the header and convert a csv string into a list of 3 elements
# [user_id, business_id, rating(float type)]
train_RDD = sc.textFile(TRAIN_FILE_PATH)
headers_train = train_RDD.first()
train_RDD = train_RDD.filter(lambda x:x!=headers_train).map(lambda x:x.split(',')).map(lambda x:[x[0], x[1], float(x[2])])

#----------- Functions for feature extraction
def get_latitude(latitude_value):
    if not latitude_value:
        return 0
    return latitude_value

def get_longitude(longitude_value):
    if not longitude_value:
        return 0
    return longitude_value

def get_num_attributes(attributes_dict):
    if not attributes_dict:
        return 0
    return len(attributes_dict)

def get_rate_true_attributes(attributes_dict):
    if not attributes_dict:
        return 0
    num_total = 0
    num_true = 0
    for k,v in attributes_dict.items():
        if v in ('True', 'False'):
            num_total += 1
            if v == 'True':
                num_true += 1
    if num_total == 0:
        return 0
    return num_true/num_total
            
def get_num_categories(categories):
    if not categories:
        return 0
    categories = categories.split(',')
    return len(categories)

def get_num_checkins(checkin_data):
    return sum(checkin_data.values())

def get_yelping_since(yelping_since):
    date_obj = datetime.strptime(yelping_since, '%Y-%m-%d')
    utc_date = pytz.utc.localize(date_obj)
    return int(utc_date.timestamp())

def get_num_friends(friends):
    if friends == 'None':
        return 0
    friends = friends.split(',')
    return len(friends)

def get_num_elites(elite):
    if elite == 'None':
        return 0
    elite = elite.split(',')
    return len(elite)

#---------------------------------------------

# Get the following features for each business: id, latitude, longitude, stars, review_count, if its open or closed, rate of true attributes i.e. num true attributes/total attributes and number of categories
business_RDD = sc.textFile(BUSINESS_FILE_PATH).map(lambda x: json.loads(x)).map(lambda x: (x['business_id'],
                                                                                              [float(get_latitude(x['latitude'])),
                                                                                              float(get_longitude(x['longitude'])),
                                                                                              float(x['stars']),
                                                                                              int(x['review_count']),
                                                                                              int(x['is_open']),
                                                                                              get_rate_true_attributes(x['attributes']),
                                                                                              get_num_categories(x['categories'])]
                                                                                          ))

# Get the total number of check ins for a business
checkIn_RDD = sc.textFile(CHECKIN_FILE_PATH).map(lambda x: json.loads(x)).map(lambda x: (x['business_id'], get_num_checkins(x['time']))).map(lambda x: (x[0], [x[1]]))

# Get the total number of photos for a business
photo_RDD = sc.textFile(PHOTO_FILE_PATH).map(lambda x: json.loads(x)).map(lambda x: (x['business_id'], 1)).reduceByKey(lambda x,y: x+y).map(lambda x: (x[0], [x[1]]))

# Get the total number of tips given by a user and the total number of tips for each business
tip_RDD = sc.textFile(TIP_FILE_PATH).map(lambda x: json.loads(x))

tips_business_RDD = tip_RDD.map(lambda x: (x['business_id'], 1)).reduceByKey(lambda x,y: x+y).map(lambda x: (x[0], [x[1]]))
tips_user_RDD = tip_RDD.map(lambda x: (x['user_id'], 1)).reduceByKey(lambda x,y: x+y).map(lambda x: (x[0], [x[1]]))

# Get the features for each user
user_RDD = sc.textFile(USER_FILE_PATH).map(lambda x: json.loads(x)).map(lambda x: (x['user_id'],
                                                                               [
                                                                                   int(x['review_count']),
                                                                                   get_yelping_since(x['yelping_since']),
                                                                                   get_num_friends(x['friends']),
                                                                                   int(x['useful']),
                                                                                   int(x['funny']),
                                                                                   int(x['cool']),
                                                                                   int(x['fans']),
                                                                                   get_num_elites(x['elite']),
                                                                                   float(x['average_stars']),
                                                                                   int(x['compliment_hot']),
                                                                                   int(x['compliment_more']),
                                                                                   int(x['compliment_profile']),
                                                                                   int(x['compliment_cute']),
                                                                                   int(x['compliment_list']),
                                                                                   int(x['compliment_note']),
                                                                                   int(x['compliment_plain']),
                                                                                   int(x['compliment_cool']),
                                                                                   int(x['compliment_funny']),
                                                                                   int(x['compliment_writer']),
                                                                                   int(x['compliment_photos'])
                                                                               ]))


#----------- Create train X and Y
def combine_lists(data_row):
    # fix nonetype error
    if data_row[1][1] == None:
        return[data_row[0], data_row[1][0] + [0]]
    if type(data_row[1][0]) == str:
        return [data_row[0], [data_row[1][0]] + data_row[1][1]]
    return [data_row[0], data_row[1][0] + data_row[1][1]]

# Combine the following RDDs to create a vector for each business with business id as key and list of features as value
# business_RDD + checkIn_RDD + photo_RDD + tips_business_RDD
# make sure to fix NoneType error when combining lists since some values are None
business_features_RDD = business_RDD.leftOuterJoin(checkIn_RDD).map(lambda x: combine_lists(x)).leftOuterJoin(photo_RDD).map(lambda x: combine_lists(x)).leftOuterJoin(tips_business_RDD).map(lambda x: combine_lists(x))


# Combine the following RDDs to create a vector for each user with user id as key and list of features as value
# user_RDD + tips_user_RDD
# make sure to fix NoneType error when combining lists since some values are None
user_features_RDD = user_RDD.leftOuterJoin(tips_user_RDD).map(lambda x: combine_lists(x))

def switch_keys(data_row):
    bus_id = data_row[0]
    usr_id = data_row[1][0]
    features = data_row[1][1:]
    
    return (usr_id, [bus_id] + features)

def join_all(data_row):
    usr_id = data_row[0]
    bus_id = data_row[1][0][0]
    bus_features = data_row[1][0][1:]
    usr_features = data_row[1][1]
    
    return ((usr_id, bus_id), bus_features + usr_features)

# join the train_RDD and business_features_RDD
# we need to have the business_id as the key for this
train_RDD_tmp = train_RDD.map(lambda x: (x[1], x[0]))
train_join_business_features_RDD = train_RDD_tmp.leftOuterJoin(business_features_RDD).map(lambda x: combine_lists(x))

# now join this with the user_features_RDD. We need to have the user_id as key for this
train_join_business_features_RDD_tmp = train_join_business_features_RDD.map(lambda x: switch_keys(x))
train_join_business_features_user_features_RDD = train_join_business_features_RDD_tmp.leftOuterJoin(user_features_RDD)

# format the data as (user_id, business_id) [feature1, feature2, ...]
train_all_joined_MAP = train_join_business_features_user_features_RDD.map(lambda x: join_all(x)).collectAsMap()

# get the values in trainRDD
labels_MAP = train_RDD.map(lambda x: ((x[0], x[1]), x[2])).collectAsMap()

# create the x and y training lists
x_train = []
y_train = []

for k in train_all_joined_MAP:
    x_train.append(train_all_joined_MAP[k])
    y_train.append(labels_MAP[k])

In [5]:
len(x_train)

455854

In [6]:
len(y_train)

455854

In [27]:
from sklearn.preprocessing import MinMaxScaler

# Create an instance of MinMaxScaler
scaler = MinMaxScaler()

# Assuming your original feature vector is stored in X
# X should be a 2D array or dataframe, where each row represents a data point and each column represents a feature

# Fit the scaler to the data to compute the min and max values for each feature
scaler.fit(x_train)

# Transform the features to normalized values
x_train_normalized = scaler.transform(x_train)

# X_normalized now contains the normalized feature values


In [28]:
print(x_train[0])

[36.1571926, -115.2930234, 4.0, 243, 1, 0.6, 5, 1270, 12, 73, 65, 1325203200, 12, 3, 1, 0, 3, 0, 4.77, 0, 1, 0, 0, 0, 0, 1, 3, 3, 3, 0, 28]


In [29]:
print(x_train_normalized[0])

[1.15632654e-01 5.18814240e-03 7.50000000e-01 2.68329554e-02
 1.00000000e+00 6.00000000e-01 2.63157895e-01 9.17119811e-03
 1.04347826e-02 2.03853672e-02 2.85450424e-03 5.32861476e-01
 1.30250733e-03 1.48663515e-05 5.97514340e-06 0.00000000e+00
 1.33037694e-03 0.00000000e+00 9.35754190e-01 0.00000000e+00
 4.04040404e-04 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 6.29049506e-05 1.60737248e-04 1.60737248e-04
 3.76317110e-04 0.00000000e+00 1.77777778e-02]


# Test data

In [33]:
FOLDER_PATH = '/Users/veersingh/Desktop/competition_files/'
TESTING_FILE_PATH = '/Users/veersingh/Desktop/competition_files/yelp_val.csv'
OUTPUT_FILE_PATH = '/Users/veersingh/Desktop/Recommendation-System-to-predict-Yelp-ratings/output_NN_few.csv'

BUSINESS_FILE_PATH = FOLDER_PATH + 'business.json'
CHECKIN_FILE_PATH = FOLDER_PATH + 'checkin.json'
PHOTO_FILE_PATH = FOLDER_PATH + 'photo.json'
TIP_FILE_PATH = FOLDER_PATH + 'tip.json'
USER_FILE_PATH = FOLDER_PATH + 'user.json'

# MODEL_FILE_PATH = '/Users/veersingh/Desktop/Recommendation-System-to-predict-Yelp-ratings/NN_few_feats/model_nn_few.h5'
# LOADED_MODEL = load_model(MODEL_FILE_PATH)

In [34]:
#----------- Functions for feature extraction
def get_latitude(latitude_value):
    if not latitude_value:
        return 0
    return latitude_value

def get_longitude(longitude_value):
    if not longitude_value:
        return 0
    return longitude_value

def get_num_attributes(attributes_dict):
    if not attributes_dict:
        return 0
    return len(attributes_dict)

def get_rate_true_attributes(attributes_dict):
    if not attributes_dict:
        return 0
    num_total = 0
    num_true = 0
    for k,v in attributes_dict.items():
        if v in ('True', 'False'):
            num_total += 1
            if v == 'True':
                num_true += 1
    if num_total == 0:
        return 0
    return num_true/num_total
            
def get_num_categories(categories):
    if not categories:
        return 0
    categories = categories.split(',')
    return len(categories)

def get_num_checkins(checkin_data):
    return sum(checkin_data.values())

def get_yelping_since(yelping_since):
    date_obj = datetime.strptime(yelping_since, '%Y-%m-%d')
    utc_date = pytz.utc.localize(date_obj)
    return int(utc_date.timestamp())

def get_num_friends(friends):
    if friends == 'None':
        return 0
    friends = friends.split(',')
    return len(friends)

def get_num_elites(elite):
    if elite == 'None':
        return 0
    elite = elite.split(',')
    return len(elite)

#---------------------------------------------

# Get the following features for each business: id, latitude, longitude, stars, review_count, if its open or closed, rate of true attributes i.e. num true attributes/total attributes and number of categories
business_RDD = sc.textFile(BUSINESS_FILE_PATH).map(lambda x: json.loads(x)).map(lambda x: (x['business_id'],
                                                                                              [float(get_latitude(x['latitude'])),
                                                                                              float(get_longitude(x['longitude'])),
                                                                                              float(x['stars']),
                                                                                              int(x['review_count']),
                                                                                              int(x['is_open']),
                                                                                              get_rate_true_attributes(x['attributes']),
                                                                                              get_num_categories(x['categories'])]
                                                                                          ))

# Get the total number of check ins for a business
checkIn_RDD = sc.textFile(CHECKIN_FILE_PATH).map(lambda x: json.loads(x)).map(lambda x: (x['business_id'], get_num_checkins(x['time']))).map(lambda x: (x[0], [x[1]]))

# Get the total number of photos for a business
photo_RDD = sc.textFile(PHOTO_FILE_PATH).map(lambda x: json.loads(x)).map(lambda x: (x['business_id'], 1)).reduceByKey(lambda x,y: x+y).map(lambda x: (x[0], [x[1]]))

# Get the total number of tips given by a user and the total number of tips for each business
tip_RDD = sc.textFile(TIP_FILE_PATH).map(lambda x: json.loads(x))

tips_business_RDD = tip_RDD.map(lambda x: (x['business_id'], 1)).reduceByKey(lambda x,y: x+y).map(lambda x: (x[0], [x[1]]))
tips_user_RDD = tip_RDD.map(lambda x: (x['user_id'], 1)).reduceByKey(lambda x,y: x+y).map(lambda x: (x[0], [x[1]]))

# Get the features for each user
user_RDD = sc.textFile(USER_FILE_PATH).map(lambda x: json.loads(x)).map(lambda x: (x['user_id'],
                                                                               [
                                                                                   int(x['review_count']),
                                                                                   get_yelping_since(x['yelping_since']),
                                                                                   get_num_friends(x['friends']),
                                                                                   int(x['useful']),
                                                                                   int(x['funny']),
                                                                                   int(x['cool']),
                                                                                   int(x['fans']),
                                                                                   get_num_elites(x['elite']),
                                                                                   float(x['average_stars']),
                                                                                   int(x['compliment_hot']),
                                                                                   int(x['compliment_more']),
                                                                                   int(x['compliment_profile']),
                                                                                   int(x['compliment_cute']),
                                                                                   int(x['compliment_list']),
                                                                                   int(x['compliment_note']),
                                                                                   int(x['compliment_plain']),
                                                                                   int(x['compliment_cool']),
                                                                                   int(x['compliment_funny']),
                                                                                   int(x['compliment_writer']),
                                                                                   int(x['compliment_photos'])
                                                                               ]))


#----------- Create train X and Y
def combine_lists(data_row):
    # fix nonetype error
    if data_row[1][1] == None:
        return[data_row[0], data_row[1][0] + [0]]
    if type(data_row[1][0]) == str:
        return [data_row[0], [data_row[1][0]] + data_row[1][1]]
    return [data_row[0], data_row[1][0] + data_row[1][1]]

# Combine the following RDDs to create a vector for each business with business id as key and list of features as value
# business_RDD + checkIn_RDD + photo_RDD + tips_business_RDD
# make sure to fix NoneType error when combining lists since some values are None
business_features_RDD = business_RDD.leftOuterJoin(checkIn_RDD).map(lambda x: combine_lists(x)).leftOuterJoin(photo_RDD).map(lambda x: combine_lists(x)).leftOuterJoin(tips_business_RDD).map(lambda x: combine_lists(x))


# Combine the following RDDs to create a vector for each user with user id as key and list of features as value
# user_RDD + tips_user_RDD
# make sure to fix NoneType error when combining lists since some values are None
user_features_RDD = user_RDD.leftOuterJoin(tips_user_RDD).map(lambda x: combine_lists(x))

def switch_keys(data_row):
    bus_id = data_row[0]
    usr_id = data_row[1][0]
    features = data_row[1][1:]
    
    return (usr_id, [bus_id] + features)

def join_all(data_row):
    usr_id = data_row[0]
    bus_id = data_row[1][0][0]
    bus_features = data_row[1][0][1:]
    usr_features = data_row[1][1]
    
    return ((usr_id, bus_id), bus_features + usr_features)

#----------- Testing Phase -----------
# Read in the testing dataset. Remove the header and convert a csv string into a list of 2 elements
# [user_id, business_id]
test_RDD = sc.textFile(TESTING_FILE_PATH)
headers_test = test_RDD.first()
test_RDD = test_RDD.filter(lambda x:x!=headers_test).map(lambda x:x.split(',')).map(lambda x:(x[0], x[1]))

# join the test_RDD and business_features_RDD
# we need to have the business_id as the key for this
test_RDD_tmp = test_RDD.map(lambda x: (x[1], x[0]))
test_join_business_features_RDD = test_RDD_tmp.leftOuterJoin(business_features_RDD).map(lambda x: combine_lists(x))

# now join this with the user_features_RDD. We need to have the user_id as key for this
test_join_business_features_RDD_tmp = test_join_business_features_RDD.map(lambda x: switch_keys(x))
test_join_business_features_user_features_RDD = test_join_business_features_RDD_tmp.leftOuterJoin(user_features_RDD)

# format the data as (user_id, business_id) [feature1, feature2, ...]
test_all_joined_MAP = test_join_business_features_user_features_RDD.map(lambda x: join_all(x)).collectAsMap()

# create the x testing list
x_test = []
test_labels = []
for k in test_all_joined_MAP:
    x_test.append(test_all_joined_MAP[k])
    test_labels.append(k)
#--------------------------------------

In [44]:
# create y_test for the target labels of the validation data
validation_targets = sc.textFile(TESTING_FILE_PATH)
header = validation_targets.first()
validation_targets = validation_targets.filter(lambda x: x!=header).map(lambda x: x.split(','))
validation_targets_RDD = validation_targets.map(lambda x: ((x[0], x[1]), x[2]))
validation_targets_MAP = validation_targets_RDD.collectAsMap()

y_test = []

for usr_bus in test_labels:
    y_test.append(float(validation_targets_MAP[usr_bus]))

In [35]:
len(x_test)

142044

In [36]:
len(test_labels)

142044

In [38]:
from sklearn.preprocessing import MinMaxScaler

# Create an instance of MinMaxScaler
scaler = MinMaxScaler()

# Assuming your original feature vector is stored in X
# X should be a 2D array or dataframe, where each row represents a data point and each column represents a feature

# Fit the scaler to the data to compute the min and max values for each feature
scaler.fit(x_test)

# Transform the features to normalized values
x_test_normalized = scaler.transform(x_test)

# X_normalized now contains the normalized feature values


In [40]:
print(x_test[0])

[35.2205594, -80.9438737, 3.5, 1732, 1, 0, 2, 52675, 0, 934, 291, 1267228800, 31, 14, 4, 3, 7, 5, 3.41, 0, 7, 2, 0, 0, 7, 8, 12, 12, 11, 0, 22]


In [39]:
print(x_test_normalized[0])

[2.78111196e-02 1.53512529e-01 6.25000000e-01 2.14411691e-01
 1.00000000e+00 0.00000000e+00 1.05263158e-01 3.80388079e-01
 0.00000000e+00 2.60821000e-01 2.18285618e-02 3.97168857e-01
 3.36481059e-03 6.93763070e-05 2.39005736e-05 1.44697558e-05
 3.10421286e-03 3.57142857e-01 5.55865922e-01 0.00000000e+00
 2.82828283e-03 7.28332119e-04 0.00000000e+00 0.00000000e+00
 9.61538462e-04 5.03239605e-04 6.42948993e-04 6.42948993e-04
 1.37982940e-03 0.00000000e+00 1.39682540e-02]


In [41]:
print(test_labels[0])

('Wz6nekYXj4wZ39UKgaVHJA', 'yQab5dxZzgBLTEHCw9V7_w')


In [45]:
print(len(y_test))

142044


In [46]:
print(y_test[0])

3.0


# Train

In [59]:
from keras import backend as K

# Change the backend to TensorFlow
K._BACKEND = 'tensorflow'

# Verify the backend has been changed
backend_name = K._BACKEND
print("Backend changed to: ", backend_name)

Backend changed to:  tensorflow


In [61]:
# Create the neural network model
model = Sequential()

# Add the input layer with the same input shape as your feature vectors (1423)
model.add(Dense(64, input_dim=31, activation='relu'))  # Example hidden layer with 64 units and ReLU activation

# Add additional hidden layers as needed
model.add(Dense(32, activation='relu'))  # Example additional hidden layer with 32 units and ReLU activation

# Add the output layer with a single output unit for regression or multiple output units for classification
model.add(Dense(1, activation='linear'))  # Example output layer with a single output unit for regression

# Compile the model
# Example compilation with Adam optimizer and mean squared error loss for regression
model.compile(optimizer='adam', loss='mean_squared_error')

model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
dense_34 (Dense)                 (None, 64)            2048        dense_input_10[0][0]             
____________________________________________________________________________________________________
dense_35 (Dense)                 (None, 32)            2080        dense_34[0][0]                   
____________________________________________________________________________________________________
dense_36 (Dense)                 (None, 1)             33          dense_35[0][0]                   
Total params: 4161
____________________________________________________________________________________________________


In [62]:
# Train the model
model.fit(x=x_train_normalized, y=y_train, batch_size=32, nb_epoch=20, verbose=2, validation_data = (x_test_normalized, y_test))

Train on 455854 samples, validate on 142044 samples
Epoch 1/20
5s - loss: 1.0201 - val_loss: 0.9767
Epoch 2/20
5s - loss: 0.9836 - val_loss: 0.9831
Epoch 3/20
4s - loss: 0.9813 - val_loss: 0.9713
Epoch 4/20
4s - loss: 0.9804 - val_loss: 0.9777
Epoch 5/20
4s - loss: 0.9797 - val_loss: 0.9741
Epoch 6/20
5s - loss: 0.9789 - val_loss: 0.9755
Epoch 7/20
5s - loss: 0.9781 - val_loss: 0.9718
Epoch 8/20
5s - loss: 0.9778 - val_loss: 0.9737
Epoch 9/20
5s - loss: 0.9771 - val_loss: 0.9697
Epoch 10/20
5s - loss: 0.9764 - val_loss: 0.9733
Epoch 11/20
5s - loss: 0.9767 - val_loss: 0.9711
Epoch 12/20
5s - loss: 0.9764 - val_loss: 0.9711
Epoch 13/20
5s - loss: 0.9758 - val_loss: 0.9682
Epoch 14/20
5s - loss: 0.9756 - val_loss: 0.9735
Epoch 15/20
5s - loss: 0.9754 - val_loss: 0.9682
Epoch 16/20
5s - loss: 0.9751 - val_loss: 0.9681
Epoch 17/20
5s - loss: 0.9749 - val_loss: 0.9684
Epoch 18/20
5s - loss: 0.9746 - val_loss: 0.9695
Epoch 19/20
5s - loss: 0.9747 - val_loss: 0.9670
Epoch 20/20
5s - loss: 0.9

<keras.callbacks.History at 0x7f8321ae6438>

In [51]:
# save the model
model.save(SAVE_MODEL_PATH)