In [27]:
import traverser as tv
import souper as sr
import pandas as pd

In [28]:
ROUTE_DUMP = './america_data/route_info.txt'
AREA_DUMP = './america_data/area_info.txt'

In [29]:
def read_info(fn):

    # read the entire file into a python array
    with open(fn, 'rb') as f:
        data = f.readlines()

    # remove the trailing "\n" from each line
    data = map(lambda x: x.rstrip(), data)

    # each element of 'data' is an individual JSON object.
    # have all the individual business JSON objects
    data_json_str = "[" + ','.join(data) + "]"

    # now, load it into pandas
    data_df = pd.read_json(data_json_str)
    
    return(data_df)

In [30]:
d = read_info(ROUTE_DUMP) # takes 10 seconds
# p = read_info(AREA_DUMP)
# p.shape

In [31]:
d.columns

Index([   u'Description',             u'FA',  u'Getting There',
             u'Location',           u'Name',     u'Page Views',
           u'Protection',         u'Season',   u'Submitted By',
                 u'Type', u'area_hierarchy',           u'href',
          u'rateBritish',    u'rateEwbanks',       u'rateFont',
           u'rateFrench',      u'rateHueco',       u'rateUIAA',
              u'rateYDS',         u'rateZA',    u'staraverage',
             u'starbest',      u'starvotes'],
      dtype='object')

In [32]:
def get_area(d, limit = None):
    
    hierarchy = d.iloc[:limit]['area_hierarchy']
    
    # iterate through route hierarchy records in the data
    area = list()
    for hier in hierarchy:
        # exclude the root element 0
        for potentially_new_area in hier[1:]:
            area.append(potentially_new_area)

    return list(set(area))

In [33]:
area = get_area(d)
len(area)

26396

In [34]:
area[:5]

[u'/v/mount-harwood/108262233',
 u'/v/tralfamadore-buttress/109590763',
 u'/v/the-boulder-of-bloody-madness/105734489',
 u'/v/dc-memorial-boulder/105992253',
 u'/v/five-ten-wall/105851747']

In [7]:
import re
from urllib2 import urlopen
from bs4 import BeautifulSoup
import souper as sr
import json
# import codecs

In [8]:
class Scraper(object):
    """
    Scraper extracts information from mountain project page
    Heavy use of BeautifulSoup to navigate html tree
    Creates a Destination object
    """

    def __init__(self, href):
        self.href = href

        try:
            mp_page = urlopen('http://www.mountainproject.com' + href)
        except:
            print href + ' failed to load!!'
            return None
        else:
            mp_html = mp_page.read()
            self.soup = BeautifulSoup(mp_html) # , 'html.parser')

    def get_child_href(self, dest_iter):
    
        href = []
        for dest in dest_iter:
            if len(dest.get_text()) > 0: # children are labeled with text
                if dest.a != None: # sometimes <a> is within a <span>
                    dest = dest.a
                h = dest.get('href')
                h = h.encode('utf-8', errors = 'ignore') # encoding is crucial
                href.append(h)
        
        # only routes and areas have an href containing /v/
        href = [h for h in href if '/v/' in h]
        
        return href

    def get_children(self):

        # every route and area page has this container
        youContainer = self.soup.find(id="youContainer")
        root = youContainer == None

        if root: # at /v/ or /destinations/
            # get tags for the 50 states and International
            dest_iter = self.soup.find_all('span', { 'class': "destArea" })

            # pull out href key
            children = self.get_child_href(dest_iter)

            return children
        else:
            
            is_route = re.search('You & This Route',youContainer.get_text()) != None
            is_area = re.search('You & This Area',youContainer.get_text()) != None

            if is_route:
                return None
            elif is_area:
                # get div for any area or route
                leftnavdiv = self.soup.find(id='viewerLeftNavColContent')
                dest_iter = leftnavdiv.find_all('a')

                # pull out href key
                children = self.get_child_href(dest_iter)

                return children

    def create_destination(self):

        # grab features from html
        feature = sr.get_route_info(self.soup)
        dest = Destination(self.href, feature)

        # what type of destination is this?
        youContainer = self.soup.find(id="youContainer")
        dest.is_area = re.search('You & This Area',youContainer.get_text()) != None
        dest.is_route = re.search('You & This Route',youContainer.get_text()) != None

        # more scraping is necessary for routes
        if dest.is_route:
            dest.grade = sr.get_grade(self.soup)
            dest.protect_rate = sr.get_protect_rate(self.soup)
            dest.star_rating = sr.get_star_rating(self.soup)

        return dest

In [9]:
class Destination(object):

    def __init__(self, href, feature):
        self.href = href

        # initialize self with dictionary of features
        for n, feat in feature.items():
            setattr(self, n, feat)

    def update_feature():

        # update self new dictionary
        for n, feat in feature.items():
            setattr(self, n, feat)
        

In [10]:
root_href = '/v/made-in-the-shade/108015049'
scrap = Scraper(root_href)
dest = scrap.create_destination()
dest.children_href = scrap.get_children()

json.dumps(dest.__dict__)

'{"Submitted By": "Salamanizer on Feb 25, 2013", "is_area": false, "star_rating": {"starbest": "4", "starvotes": "3", "staraverage": "2.0"}, "Name": "Made in the Shade", "grade": {}, "rateFont": "6A", "protect_rate": "", "rateHueco": "V3", "is_route": true, "FA": "Aaron Rough", "href": "/v/made-in-the-shade/108015049", "Location": "On the North face of the lower Saddle Boulder just left of the obvious Y shaped crack.", "area_hierarchy": ["/destinations/", "/v/california/105708959", "/v/san-francisco-bay-area/105733851", "/v/wine-country/111499894", "/v/the-nut-tree-boulders/105734016", "/v/hillcrest-boulders/108012775", "/v/saddle-boulders/105734483"], "Page Views": "131", "Protection": "Pad", "Type": "Boulder", "children_href": null, "Description": "Sit start just left of the Y Crack and pull onto, then through the bulge created by the center(chockstone) of the Y."}'

In [11]:
# def traverse_routes(href, HREF_OUTFILE):

#     children = get_children(href)

#     for child in children:
#         if get_children(child) != None:
#             # recursively deeper into the rabbit hole
#             traverse_routes(child, HREF_OUTFILE)
#         else:
#             with open(HREF_OUTFILE, 'a') as f:
#                 f.write(child+'\n')
#                 print child

#     return children

# initialize root destination with children
root_href = '/v/the-nut-tree-boulders/105734016'
scrap = Scraper(root_href)
dest = scrap.create_destination()
dest.children_href = scrap.get_children()


def traverse_routes(node):
    children = []
    for href in node.children_href:
        scrap = Scraper(href)
        dest = scrap.create_destination()
        dest.children_href = scrap.get_children()
        if dest.children_href != None:
            traverse_routes(dest)
        children.append(dest)
    node.children = children
    return node

all_dest = traverse_routes(dest)

In [17]:
BIG_JSON = './mp_tree.json'
with open(BIG_JSON, 'a') as dump:
    flat = json.dumps(all_dest, default=lambda o: o.__dict__)
    dump.write(flat)

In [72]:
# for c in all_dest.children:
#     for gc in c.children:
#         for ggc in gc.children:
#             print ggc.Description[:10]

In [80]:
# import json

json.dumps(dest.__dict__)

def print_json(node):
    for child in node.children:
        if node.children != None:
            print_json(node)
        else:
            return()
    return node.__dict__

print_json(dest)

TypeError: <__main__.Destination object at 0x10afc2390> is not JSON serializable

In [66]:
dest.is_area

True

In [12]:
class Area(Destination):

    climbable = False

    def __init__(self, Destination):
        self.children = get_children(self.href)

    def localize(self, location):
        self.location = location
        # GPS_coord_re = re.compile(r'(/d+)[ ,]+(/d+)')

    def add_description(self, description):
        self.description = description

In [13]:
class Route(Area):

    climbable = True
    
    def __init__(self):
        self.children = None
        
#    def add_grade(self, grade):
#        self.grade = grade
#
#    def add_protect_rate(self, protect_rate):
#        self.protect_rate = protect_rate

    self.star_rating = get_star_rating(Destination.soup)
    
    return feature

SyntaxError: 'return' outside function (<ipython-input-13-c6ab975bb2ca>, line 16)