In [1]:
from pyspark.context import SparkContext
import json
from datetime import datetime
import pytz
from xgboost import XGBRegressor
import time
import sys
import ast
from collections import defaultdict

In [2]:
# Reuse the already-running SparkContext when this cell is executed again;
# constructing a second SparkContext in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()
sc.setLogLevel('ERROR')

In [3]:
# File locations used by the notebook. These are absolute, machine-specific
# paths: adjust FOLDER_PATH / OUTPUT_FILE_PATH when running elsewhere.
FOLDER_PATH = '/Users/veersingh/Desktop/competition_files/'
OUTPUT_FILE_PATH = '/Users/veersingh/Desktop/Recommendation-System-to-predict-Yelp-ratings/output.csv'

# CSV inputs (both live inside FOLDER_PATH)
TESTING_FILE_PATH = FOLDER_PATH + 'yelp_val.csv'
TRAIN_FILE_PATH = FOLDER_PATH + 'yelp_train.csv'

# JSON inputs (all live inside FOLDER_PATH)
BUSINESS_FILE_PATH = FOLDER_PATH + 'business.json'
CHECKIN_FILE_PATH = FOLDER_PATH + 'checkin.json'
PHOTO_FILE_PATH = FOLDER_PATH + 'photo.json'
TIP_FILE_PATH = FOLDER_PATH + 'tip.json'
USER_FILE_PATH = FOLDER_PATH + 'user.json'

In [4]:
# Read the business file line by line and parse every line as one JSON record.
business_RDD = sc.textFile(BUSINESS_FILE_PATH).map(json.loads)

# Get all the unique features from all businesses in training data

In [5]:
def extract_features(data_row):
    """Collect the feature names present in one business record.

    Naming rules:
      * ``categories`` (a comma-separated string) contributes one feature per
        category; a ``None`` value contributes nothing.
      * any other non-dict field contributes its own key.
      * a dict field contributes ``key$subkey`` for each entry; when the entry
        is itself a dict, or a string holding a dict literal such as
        ``"{'garage': False}"``, it contributes ``key$subkey$innerkey`` for
        each inner key instead.

    Entries that are empty strings, non-strings (bool, None, numbers), or
    brace-wrapped strings that do not parse to a dict are treated as plain
    leaves rather than raising.

    Args:
        data_row: dict for one parsed business record.

    Returns:
        set of feature-name strings.
    """
    features = set()

    for k, v in data_row.items():
        if not isinstance(v, dict):
            if k != 'categories':
                features.add(k.strip())
            elif v is not None:
                # one feature per individual category
                for category in v.split(','):
                    features.add(category.strip())
            continue

        # v is a dict: expand its entries, using '$' as the name delimiter
        for k2, v2 in v.items():
            nested = None
            if isinstance(v2, dict):
                nested = v2
            elif isinstance(v2, str) and v2.startswith('{') and v2.endswith('}'):
                # looks like a dict literal; fall back to a leaf if it is not one
                try:
                    parsed = ast.literal_eval(v2)
                except (ValueError, SyntaxError):
                    parsed = None
                if isinstance(parsed, dict):
                    nested = parsed

            if nested is None:
                features.add((k + '$' + k2).strip())
            else:
                for k3 in nested:
                    features.add((k + '$' + k2 + '$' + str(k3)).strip())

    return features

In [6]:
# Union of the feature names extracted from every business record
all_features = set(business_RDD.flatMap(extract_features).collect())

# drop the keys we do not want as features
feats_to_remove = {'business_id', 'name', 'neighborhood', 'address', 'city', 'postal_code', 'hours', 'attributes'}
all_features -= feats_to_remove
print(all_features)

# keep a sorted list from here on so feature order is stable
all_features = sorted(all_features)

{'Buddhist Temples', 'Department Stores', 'Outlet Stores', 'Paint Stores', 'attributes$BusinessAcceptsCreditCards', 'Vocal Coach', 'Fish & Chips', 'Indoor Playcentre', 'Printing Services', 'Beverage Store', 'Pet Insurance', 'attributes$BusinessAcceptsBitcoin', 'Duplication Services', 'Business Financing', 'attributes$RestaurantsTakeOut', 'hours$Friday', 'Screen Printing', 'Vintage & Consignment', 'Dermatologists', 'Breweries', 'Pediatricians', 'Cheesesteaks', 'Data Recovery', 'Misting System Services', 'Dry Cleaning & Laundry', 'Comic Books', 'Matchmakers', 'Tattoo Removal', 'Medical Supplies', 'Wallpapering', 'Home Network Installation', 'Chiropractors', 'Graphic Design', 'Bus Tours', 'Furniture Repair', 'Mediators', 'Head Shops', 'Nursing Schools', 'Wine Tasting Room', 'Oxygen Bars', 'Mailbox Centers', 'Ethnic Food', 'Towing', 'Middle Schools & High Schools', 'Artificial Turf', 'Veterans Organizations', 'attributes$DogsAllowed', 'Hungarian', 'Flowers & Gifts', 'Neurotologists', 'Boat

In [7]:
# Write the sorted feature names to disk, one per line.
# The context manager closes the file even if a write fails, and the explicit
# encoding keeps non-ASCII names from depending on the platform default.
with open('all_features.txt', 'w', encoding='utf-8') as fhand:
    for row in all_features:
        fhand.write(row + '\n')

# Get all unique values for all features for all businesses

In [None]:
def get_feat_values(data_row):
    """Collect the values observed for every feature of one business record.

    Feature names follow these rules:
      * each entry of the comma-separated ``categories`` string is a feature
        whose value is ``1``; a ``None`` value contributes nothing.
      * any other non-dict field is a feature named by its key, holding the
        field's value.
      * a dict field yields ``key$subkey`` features; when the entry is itself
        a dict, or a string holding a dict literal such as
        ``"{'garage': False}"``, it yields ``key$subkey$innerkey`` features
        holding the inner values instead.

    Entries that are empty strings, non-strings (bool, None, numbers), or
    brace-wrapped strings that do not parse to a dict are recorded as plain
    leaf values rather than raising.

    Args:
        data_row: dict for one parsed business record.

    Returns:
        tuple of ``(feature_name, set_of_values)`` pairs.
    """
    feat_value = defaultdict(set)

    for k, v in data_row.items():
        if not isinstance(v, dict):
            if k != 'categories':
                feat_value[k.strip()].add(v)
            elif v is not None:
                # a category is either present (1) or absent
                for category in v.split(','):
                    feat_value[category.strip()].add(1)
            continue

        # v is a dict: expand its entries, using '$' as the name delimiter
        for k2, v2 in v.items():
            nested = None
            if isinstance(v2, dict):
                nested = v2
            elif isinstance(v2, str) and v2.startswith('{') and v2.endswith('}'):
                # looks like a dict literal; fall back to a leaf if it is not one
                try:
                    parsed = ast.literal_eval(v2)
                except (ValueError, SyntaxError):
                    parsed = None
                if isinstance(parsed, dict):
                    nested = parsed

            if nested is None:
                feat_value[(k + '$' + k2).strip()].add(v2)
            else:
                for k3, v3 in nested.items():
                    feat_value[(k + '$' + k2 + '$' + str(k3)).strip()].add(v3)

    return tuple(feat_value.items())

In [None]:
# Flatten the per-record (feature, value-set) pairs and merge the sets per feature name.
feature_vals = business_RDD.flatMap(get_feat_values).reduceByKey(lambda left, right: left | right)



In [None]:
# Pull the merged (feature, values) pairs onto the driver as a dict
features_and_vals = dict(feature_vals.collect())

# Sorted feature names with the unwanted keys dropped
feats_to_remove = {'business_id', 'name', 'neighborhood', 'address', 'city', 'postal_code', 'hours', 'attributes'}
features_from_vals = sorted(set(features_and_vals) - feats_to_remove)

In [None]:
# Write "feature : [values]" lines for manual inspection of the observed values.
# The context manager closes the file even if a write fails, and the explicit
# encoding keeps non-ASCII names from depending on the platform default.
with open('features_and_vals.txt', 'w', encoding='utf-8') as fhand:
    for k in features_from_vals:
        fhand.write(k + ' : ' + str(list(features_and_vals[k])) + '\n')

### After inspecting the unique values collected for every feature, no anomalous values were found

In [None]:
from datetime import datetime, timedelta


def convert_timings_to_hours(timing):
    """Return the length in hours of an interval written as 'H:M-H:M'.

    Args:
        timing: string holding a start and an end time separated by '-',
            each in ``%H:%M`` form (e.g. ``'9:0-17:30'``).

    Returns:
        float number of hours between start and end. When the end time is
        earlier than the start time the interval is taken to wrap past
        midnight, so 24 hours are added.

    Raises:
        ValueError: if ``timing`` does not split into exactly two parts or a
            part does not match ``%H:%M``.
    """
    # The original body read an undefined name `data` instead of the parameter.
    start_text, end_text = timing.split('-')
    start_time = datetime.strptime(start_text.strip(), '%H:%M')
    end_time = datetime.strptime(end_text.strip(), '%H:%M')
    num_hours = (end_time - start_time).total_seconds() / 3600

    # NOTE(review): identical start and end (e.g. '0:0-0:0') yields 0 hours;
    # confirm whether the data uses that to mean "open 24 hours".
    if num_hours < 0:
        num_hours += 24

    return num_hours

In [None]:
# Show the sorted list of feature names (all_features was converted to a sorted list earlier).
print(all_features)

In [None]:
# Map every feature name to its position in the sorted all_features list
features_index_map = {feature: i for i, feature in enumerate(all_features)}
    

In [None]:
# Show the feature-name -> index mapping built from all_features.
print(features_index_map)