In [2]:
import pandas as pd

### Clean the trail pages JSON

In [127]:
# Column names of the trail_info table, listed in ASCII-sorted order.
# NOTE: 'rountType' is spelled exactly as the key written by decode_trails,
# so it must not be "corrected" here on its own.
columns = [
    "activities",
    "area_id",
    "area_name",
    "avgRating",
    "completedCount",
    "difficulty",
    "duration",
    "elevationGain",
    "elevationMax",
    "elevationStart",
    "features",
    "length",
    "obstacles",
    "overview",
    "photoCount",
    "popularity",
    "reviewCount",
    "rountType",
    "trackCount",
    "trailId",
    "trail_name",
]

In [106]:
def decode_trails(raw, packed=None):
    """Flatten raw trail records into one-level dicts and append them to ``packed``.

    Parameters
    ----------
    raw : iterable of dict
        Trail records. Each record is indexed with the keys ``area``,
        ``attributes``, ``avgRating``, ``overview``, ``trailGeoStats``,
        ``slug``, ``popularity``, ``id``, ``trailCounts``, ``routeType`` and
        ``defaultActivityStats``.
    packed : list, optional
        Accumulator that receives the flattened dicts; it is extended in
        place. A fresh list is created when the argument is omitted.

    Returns
    -------
    list
        ``packed`` itself, with one new dict appended per trail, in input
        order.

    Raises
    ------
    KeyError
        If a trail lacks one of the keys listed above.
    """
    if packed is None:
        packed = []

    for trail in raw:
        new_dict = dict()

        # names and id -- a trail without an area keeps both fields as None
        area = trail["area"]
        if area is not None:
            new_dict["area_id"] = area["id"]
            new_dict["area_name"] = area["name"]
        else:
            new_dict["area_id"] = None
            new_dict["area_name"] = None

        # attributes: every attribute group becomes a list of its entries' names
        for key, entries in trail["attributes"].items():
            new_dict[key] = [entry["name"] for entry in entries]

        new_dict["avgRating"] = trail["avgRating"]
        new_dict["overview"] = trail["overview"]

        # geo stats
        geo_stats = trail["trailGeoStats"]
        new_dict["elevationGain"] = geo_stats["elevationGain"]
        new_dict["elevationMax"] = geo_stats["elevationMax"]
        new_dict["elevationStart"] = geo_stats["elevationStart"]
        new_dict["length"] = geo_stats["length"]

        # the slug is stored under "trail_name"
        new_dict["trail_name"] = trail["slug"]
        new_dict["popularity"] = trail["popularity"]
        new_dict["trailId"] = trail["id"]

        counts = trail["trailCounts"]
        new_dict["completedCount"] = counts["completedCount"]
        new_dict["photoCount"] = counts["photoCount"]
        new_dict["reviewCount"] = counts["reviewCount"]
        new_dict["trackCount"] = counts["trackCount"]

        # NOTE: the output key is spelled "rountType" (sic); the notebook's
        # column list uses the same spelling, so it is kept unchanged.
        new_dict["rountType"] = trail["routeType"]["name"]

        activity_stats = trail["defaultActivityStats"]
        new_dict["difficulty"] = activity_stats["difficulty"]
        new_dict["duration"] = activity_stats["duration"]

        packed.append(new_dict)
    return packed

In [131]:
# Page numbers of the data/page_<n>.json files to load.
pages = [1,2,4,5,6,7,10]

# Collect one frame per page and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all accumulated rows on every iteration.
page_frames = []
for page in pages:
    filename = "data/page_" + str(page) + ".json"
    print(filename)
    page_data = pd.read_json(filename, typ='series')
    page_records = decode_trails(page_data.trails, [])
    page_frames.append(pd.DataFrame(page_records))

if page_frames:
    # No ignore_index: each page keeps its own 0..k row labels, exactly as the
    # repeated append calls did.
    combined = pd.concat(page_frames, sort=False)
    # Declared columns come first, in their declared order; any attribute
    # columns not listed in `columns` are kept after them.
    extra_columns = [name for name in combined.columns if name not in columns]
    trail_info = combined.reindex(columns=columns + extra_columns)
else:
    trail_info = pd.DataFrame(columns=columns)

data/page_1.json
data/page_2.json
data/page_4.json
data/page_5.json
data/page_6.json
data/page_7.json
data/page_10.json


### Clean the reviews dataset

In [134]:
# Read the review dump as a Series so its top-level entries (e.g.
# trail_reviews) can be reached by label.
reviews = pd.read_json(path_or_buf="data/review.json", typ="series")

In [139]:
# Inspect which fields a single review record carries.
reviews["trail_reviews"][0].keys()

dict_keys(['date', 'comment', 'weather', 'trailAvgRating', 'conditions', 'difficulty', 'associatedRecording', 'trailId', 'metadata', 'id', 'activity', 'duration', 'rating', 'length', 'visitorUsage', 'user'])

In [153]:
# Check the raw format of the "date" field on one record.
reviews["trail_reviews"][2]["date"]

'2018-02-27T00:00:00Z'

In [155]:
# Keep four fields from every review; "rating" is stored as "review_rating".
review_clean = [
    {
        "date": entry["date"],
        "comment": entry["comment"],
        "review_rating": entry["rating"],
        "trailId": entry["trailId"],
    }
    for entry in reviews.trail_reviews
]

In [157]:
# Turn the list of per-review dicts into a table, one row per review.
review_clean = pd.DataFrame(data=review_clean)

In [158]:
# Preview the first rows only, instead of rendering the whole frame.
review_clean.head(10)

Unnamed: 0,comment,date,review_rating,trailId
0,Incredible hike! A little chilly with some bru...,2018-03-17T00:00:00Z,5,10029098
1,Hard but rewarding. We didn’t even finish this...,2018-03-01T00:00:00Z,5,10029098
2,Some pretty good climbing,2018-02-27T00:00:00Z,5,10029098
3,"Amazing day! A little tough for the kids, but...",2018-02-20T00:00:00Z,5,10029098
4,"First time there, won’t be my last!! Pretty ha...",2018-02-19T00:00:00Z,5,10029098
5,"Great scramble to start, then it eases up. I l...",2018-02-19T00:00:00Z,5,10029098
6,Love this hike❤,,5,10029098
7,"Climb is great but the trail is crowded, litte...",2018-01-29T00:00:00Z,3,10029098
8,Scramble is an understatement,2018-01-28T00:00:00Z,5,10029098
9,Fun scramble. Not so fun in the ice... slipped...,2018-01-21T00:00:00Z,4,10029098


### Save the cleaned reviews to CSV

In [None]:
# Write the cleaned reviews to a UTF-8 encoded CSV file.
review_clean.to_csv(path_or_buf="sample_reviews.csv", encoding="utf-8")