In [1]:
from pyspark.context import SparkContext
import json
from datetime import datetime
import pytz
from xgboost import XGBRegressor
import time
import sys
import ast
from collections import defaultdict

In [2]:
# Reuse the already-running SparkContext when this cell is executed again;
# a second bare SparkContext() in the same kernel raises
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()
# Only surface Spark log messages at ERROR level to keep cell output readable.
sc.setLogLevel('ERROR')

In [3]:
# Directory holding the competition data files. The trailing slash matters:
# file names are appended with plain string concatenation.
FOLDER_PATH = '/Users/veersingh/Desktop/competition_files/'

# Input files located inside FOLDER_PATH.
TRAIN_FILE_PATH = FOLDER_PATH + 'yelp_train.csv'
TESTING_FILE_PATH = FOLDER_PATH + 'yelp_val.csv'
BUSINESS_FILE_PATH = FOLDER_PATH + 'business.json'
CHECKIN_FILE_PATH = FOLDER_PATH + 'checkin.json'
PHOTO_FILE_PATH = FOLDER_PATH + 'photo.json'
TIP_FILE_PATH = FOLDER_PATH + 'tip.json'
USER_FILE_PATH = FOLDER_PATH + 'user.json'

# Path of the output CSV (lives outside FOLDER_PATH).
OUTPUT_FILE_PATH = '/Users/veersingh/Desktop/Recommendation-System-to-predict-Yelp-ratings/output.csv'

In [4]:
# Read the business file line by line and parse each line as one JSON document.
business_RDD = sc.textFile(BUSINESS_FILE_PATH).map(json.loads)

# Get all the unique features from all businesses in training data

In [5]:
def extract_features(data_row):
    """Collect the feature names present in one business record.

    Naming rules:
      * ``categories`` (a comma-separated string) contributes one feature per
        category, whitespace-stripped; a ``None`` value contributes nothing.
      * Any other non-dict field contributes its own key.
      * A dict field contributes ``<key>$<subkey>`` for each entry. When the
        entry is itself a dict, or a string holding a ``{...}`` literal, it is
        expanded one more level into ``<key>$<subkey>$<inner key>``.

    Sub-values that are empty strings or not strings at all (bool, None, ...)
    are treated as plain leaves rather than being indexed character-wise.

    Args:
        data_row (dict): One parsed business record.

    Returns:
        set: Feature names found in ``data_row``.

    Raises:
        ValueError, SyntaxError: From ``ast.literal_eval`` when a ``{...}``
            string is not a valid Python literal.
    """
    import ast  # local import keeps the function self-contained

    features = set()

    for k, v in data_row.items():
        if not isinstance(v, dict):
            # categories: create a separate feature for each category
            if k == 'categories':
                if v is not None:
                    for category in v.split(','):
                        features.add(category.strip())
            else:
                features.add(k.strip())
            continue

        # v is a dict: expand one level, and a second level where possible
        for k2, v2 in v.items():
            nested = v2
            # startswith/endswith are safe on '' (v2[0] was not), and the
            # isinstance guard keeps non-string values away from string checks
            if isinstance(v2, str) and v2.startswith('{') and v2.endswith('}'):
                nested = ast.literal_eval(v2)

            if isinstance(nested, dict):
                for k3 in nested:
                    # '$' is the delimiter between nesting levels
                    features.add((k + '$' + k2 + '$' + k3).strip())
            else:
                features.add((k + '$' + k2).strip())

    return features

In [6]:
# Union of the feature names produced by extract_features over every business
all_features = set(business_RDD.flatMap(extract_features).collect())

# remove useless features
feats_to_remove = {'business_id', 'name', 'neighborhood', 'address', 'city', 'postal_code', 'hours', 'attributes'}
all_features -= feats_to_remove
print(all_features)

# Fix the order: from here on all_features is an alphabetically sorted list
all_features = sorted(all_features)

{'Muay Thai', 'Spine Surgeons', 'Cannabis Clinics', 'Dental Hygienists', 'Mattresses', 'Disc Golf', 'Nutritionists', 'Bus Rental', 'Colonics', 'Employment Law', 'Engraving', 'Visitor Centers', 'attributes$Ambience$romantic', 'Emergency Pet Hospital', 'Dry Cleaning', 'Check Cashing/Pay-day Loans', 'Powder Coating', 'Water Delivery', 'Condominiums', 'Rolfing', 'Mobile Home Repair', 'Ethnic Food', 'Hang Gliding', 'Specialty Schools', 'Sugaring', 'Malaysian', 'Pakistani', 'DUI Law', 'Diagnostic Services', 'Bed & Breakfast', 'Ethical Grocery', 'Fish & Chips', 'Rugs', 'Post Offices', 'Buddhist Temples', 'Vacation Rental Agents', 'Vocational & Technical School', 'Italian', 'Shredding Services', 'Sugar Shacks', 'Personal Care Services', 'Accountants', 'Carpet Installation', 'Comedy Clubs', 'attributes$DietaryRestrictions$halal', 'attributes$DogsAllowed', 'Flea Markets', 'Taiwanese', 'Pole Dancing Classes', 'Pensions', 'Historical Tours', 'Kids Activities', 'Scavenger Hunts', 'Preschools', 'att

In [7]:
# Persist the sorted feature names, one per line. The with-block guarantees
# the handle is flushed and closed even if a write raises part-way through.
with open('all_features.txt', 'w') as fhand:
    for row in all_features:
        # write() is the call for a single string; writelines() expects an iterable of lines
        fhand.write(row + '\n')

# Get all unique values for all features for all businesses

In [None]:
def get_feat_values(data_row):
    """Map every feature of one business record to the set of values it takes.

    Feature naming:
      * each entry of the comma-separated ``categories`` string becomes its
        own feature with the value ``1`` (a ``None`` value adds nothing);
      * any other non-dict field is a feature named after its key;
      * a dict field yields ``<key>$<subkey>`` features, and a sub-value that
        is a dict (or a string holding a ``{...}`` literal) is expanded once
        more into ``<key>$<subkey>$<inner key>``.

    Sub-values that are empty strings or not strings at all (bool, None, ...)
    are recorded as plain leaf values rather than being indexed character-wise.

    Args:
        data_row (dict): One parsed business record.

    Returns:
        tuple: ``(feature_name, set_of_values)`` pairs for this record.

    Raises:
        ValueError, SyntaxError: From ``ast.literal_eval`` when a ``{...}``
            string is not a valid Python literal.
        TypeError: If a value to be recorded is unhashable.
    """
    import ast  # local imports keep the function self-contained
    from collections import defaultdict

    feat_value = defaultdict(set)

    for k, v in data_row.items():
        if not isinstance(v, dict):
            # categories: every listed category is a feature with value 1
            if k == 'categories':
                if v is not None:
                    for category in v.split(','):
                        feat_value[category.strip()].add(1)
            else:
                feat_value[k.strip()].add(v)
            continue

        # v is a dict: expand one level, and a second level where possible
        for k2, v2 in v.items():
            nested = v2
            # startswith/endswith are safe on '' (v2[0] was not), and the
            # isinstance guard keeps non-string values away from string checks
            if isinstance(v2, str) and v2.startswith('{') and v2.endswith('}'):
                nested = ast.literal_eval(v2)

            if isinstance(nested, dict):
                for k3, v3 in nested.items():
                    # '$' is the delimiter between nesting levels
                    feat_value[(k + '$' + k2 + '$' + k3).strip()].add(v3)
            else:
                # record the raw value as found in the record
                feat_value[(k + '$' + k2).strip()].add(v2)

    return tuple(feat_value.items())

In [None]:
# Merge the per-business value sets so each feature name is paired with
# every value observed for it across all businesses.
feature_vals = (
    business_RDD
    .flatMap(get_feat_values)
    .reduceByKey(lambda seen, more: seen | more)
)



In [None]:
# Bring the (feature, values) pairs to the driver as a feature -> values dict
features_and_vals = dict(feature_vals.collect())

# Sorted list of feature names with the useless features removed
feats_to_remove = {'business_id', 'name', 'neighborhood', 'address', 'city', 'postal_code', 'hours', 'attributes'}
features_from_vals = sorted(set(features_and_vals) - feats_to_remove)

In [None]:
# Write one "feature : [values]" line per retained feature. The with-block
# guarantees the handle is flushed and closed even if a write raises.
with open('features_and_vals.txt', 'w') as fhand:
    for k in features_from_vals:
        # write() is the call for a single string; writelines() expects an iterable of lines
        fhand.write(k + ' : ' + str(list(features_and_vals[k])) + '\n')

### After verifying the unique values of every feature, no odd values were found

In [24]:
from datetime import datetime, timedelta


def convert_timings_to_hours(timing):
    """Return the length, in hours, of a time window such as '9:00-22:00'.

    Args:
        timing (str): '<start>-<end>' where both parts match '%H:%M'
            (24-hour clock).

    Returns:
        float: Hours from start to end. When the end is earlier than the
        start, the window is treated as wrapping past midnight, so 24 is
        added. An identical start and end yields 0.0.

    Raises:
        ValueError: If ``timing`` does not split into exactly two parts on
            '-', or a part does not match '%H:%M'.
    """
    # Fixed: the body used to read an undefined name `data` instead of the
    # `timing` parameter, which raised NameError on a fresh kernel.
    start_time, end_time = timing.split('-')
    start_time = datetime.strptime(start_time, '%H:%M')
    end_time = datetime.strptime(end_time, '%H:%M')
    num_hours = (end_time - start_time).total_seconds() / 3600

    # A negative span means the end time falls on the following day.
    if num_hours < 0:
        num_hours = 24 + num_hours

    return num_hours

13.0


In [26]:
# Display the full feature list (every feature name is printed).
print(all_features)

['& Probates', '3D Printing', 'ATV Rentals/Tours', 'Acai Bowls', 'Accessories', 'Accountants', 'Acne Treatment', 'Active Life', 'Acupuncture', 'Addiction Medicine', 'Adoption Services', 'Adult', 'Adult Education', 'Adult Entertainment', 'Advertising', 'Aerial Fitness', 'Aerial Tours', 'Aestheticians', 'Afghan', 'African', 'Air Duct Cleaning', 'Aircraft Dealers', 'Aircraft Repairs', 'Airlines', 'Airport Lounges', 'Airport Shuttles', 'Airport Terminals', 'Airports', 'Airsoft', 'Allergists', 'Alternative Medicine', 'Amateur Sports Teams', 'American (New)', 'American (Traditional)', 'Amusement Parks', 'Anesthesiologists', 'Animal Assisted Therapy', 'Animal Physical Therapy', 'Animal Shelters', 'Antiques', 'Apartment Agents', 'Apartments', 'Appliances', 'Appliances & Repair', 'Appraisal Services', 'Aquarium Services', 'Aquariums', 'Arabian', 'Arcades', 'Archery', 'Architects', 'Architectural Tours', 'Argentine', 'Armenian', 'Art Classes', 'Art Galleries', 'Art Museums', 'Art Restoration', '

In [27]:
# Look-up table from feature name to its position in all_features
features_index_map = {name: position for position, name in enumerate(all_features)}
    

In [29]:
# Display the complete feature -> index mapping (every entry is printed).
print(features_index_map)

{'& Probates': 0, '3D Printing': 1, 'ATV Rentals/Tours': 2, 'Acai Bowls': 3, 'Accessories': 4, 'Accountants': 5, 'Acne Treatment': 6, 'Active Life': 7, 'Acupuncture': 8, 'Addiction Medicine': 9, 'Adoption Services': 10, 'Adult': 11, 'Adult Education': 12, 'Adult Entertainment': 13, 'Advertising': 14, 'Aerial Fitness': 15, 'Aerial Tours': 16, 'Aestheticians': 17, 'Afghan': 18, 'African': 19, 'Air Duct Cleaning': 20, 'Aircraft Dealers': 21, 'Aircraft Repairs': 22, 'Airlines': 23, 'Airport Lounges': 24, 'Airport Shuttles': 25, 'Airport Terminals': 26, 'Airports': 27, 'Airsoft': 28, 'Allergists': 29, 'Alternative Medicine': 30, 'Amateur Sports Teams': 31, 'American (New)': 32, 'American (Traditional)': 33, 'Amusement Parks': 34, 'Anesthesiologists': 35, 'Animal Assisted Therapy': 36, 'Animal Physical Therapy': 37, 'Animal Shelters': 38, 'Antiques': 39, 'Apartment Agents': 40, 'Apartments': 41, 'Appliances': 42, 'Appliances & Repair': 43, 'Appraisal Services': 44, 'Aquarium Services': 45, '