In [3]:
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
import pymongo
import math
import pandas as pd
from multiprocessing.dummy import Pool
from multiprocessing import cpu_count

In [46]:
# Open a MongoDB client; no arguments are given, so pymongo's defaults apply.
mc = MongoClient()
# Database that holds everything this notebook stores.
db = mc.get_database('reports')
# Collection receiving one document per parsed trip report.
trail_reports = db.get_collection('trail_reports')
# Collection receiving the unparsed HTML of every listing page fetched.
raw_html = db.get_collection('html')

 WARNING: Run the next cell only if you want to clear the current database; dropping a collection permanently deletes its documents.

In [45]:
# Drop the collection that the scraper inserts parsed trip reports into.
# `trail_reports` is the handle created in the database-setup cell; the name
# used here before, `mt_ellinor_reports`, is never defined in this notebook
# and raised NameError on a fresh kernel.
trail_reports.drop()

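If you need to debug a failing cell, run `%pdb` to toggle the automatic post-mortem debugger, which opens when a cell raises an exception.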

In [50]:
def select_text(parent_element, css_selector):
    """Return the text of the element ``css_selector`` finds under ``parent_element``.

    The lookup is done with ``parent_element.select_one``; converting the
    result (which may be ``None``) into a value is left to
    ``get_text_if_not_none``.
    """
    return get_text_if_not_none(parent_element.select_one(css_selector))

In [51]:
def get_text_if_not_none(element):
    """Return ``element.text``, or ``None`` when ``element`` is ``None``."""
    return None if element is None else element.text

In [52]:
def select_date(parent_element, css_selector):
    """Return the date of the element ``css_selector`` finds under ``parent_element``.

    The lookup is done with ``parent_element.select_one``; extracting the date
    from the result (which may be ``None``) is left to
    ``get_date_if_not_none``.
    """
    return get_date_if_not_none(parent_element.select_one(css_selector))

In [53]:
def get_date_if_not_none(element):
    """Return the element's ``title`` attribute, or ``None`` if it is unavailable.

    ``None`` is returned both when ``element`` is ``None`` and when the
    element's ``attrs`` mapping has no ``'title'`` entry.
    """
    if element is None:
        return None
    return element.attrs.get('title')

In [54]:
def parse_trip_report(title,trip_report_div):
    """Build the dictionary stored for a single trip report.

    ``title`` is copied into the result unchanged under ``"Trail"``. Every
    other field is looked up inside ``trip_report_div`` with a CSS selector;
    a field whose selector matches nothing is stored as ``None``.
    """
    # NOTE(review): "Trail_condtions" is misspelled, but it is the field name
    # already present in stored documents, so it is kept as-is.
    return {
        "Trail": title,
        "Creator": select_text(trip_report_div, 'div.CreatorInfo span a'),
        "Date": select_date(trip_report_div, 'span.elapsed-time'),
        "Report": select_text(trip_report_div, 'div.show-with-full'),
        "Trail_condtions": select_text(trip_report_div, 'div.trail-issues'),
        "Votes": select_text(trip_report_div, 'span.UpvoteCount'),
    }

In [71]:
def get_trail_report(title, hikeurl, params=None, timeout=30):
    """Fetch one page of a hike's trip-report listing, archive its HTML, and store its reports.

    Requests ``hikeurl + '/@@related_tripreport_listing'``, hands the response
    body to ``save_raw_html``, then parses every ``div#trip-reports div.item``
    element with ``parse_trip_report`` and inserts each result into the
    ``trail_reports`` collection.
    **Input parameters**
    ------------------------------------------------------------------------------
    title: string.  Hike name.
    hikeurl: string. Base URL for the request.
    params: dictionary.  Parameters to be included in the request.
    timeout: number. Passed to ``requests.get`` as its ``timeout`` (seconds).
    **Output**
    ------------------------------------------------------------------------------
    None. Appends entry to MongoDB using Pymongo
    **Raises**
    ------------------------------------------------------------------------------
    requests.HTTPError: the server answered with an error status.
    requests.Timeout: the server did not answer within ``timeout``.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(
        hikeurl + '/@@related_tripreport_listing',
        params=params,
        timeout=timeout,
    )
    # Fail loudly on 4xx/5xx so an error page is never archived or parsed as
    # if it were a real listing.
    response.raise_for_status()
    page_html = response.text
    save_raw_html(page_html)
    soup = BeautifulSoup(page_html, 'lxml')
    for trip_report_div in soup.select('div#trip-reports div.item'):
        trail_reports.insert_one(parse_trip_report(title, trip_report_div))

In [73]:
def save_raw_html(r):
    """Insert ``r`` into the ``raw_html`` collection under the key ``"raw_html"``."""
    raw_html.insert_one({"raw_html": r})


In [57]:
def iterate_all_reports(title, hikeurl):
    """Scrapes every page of a hike's trip-report listing.

    Reads the total number of trip reports from the ``div#count-data`` element
    of the listing page, works out how many pages that spans, and calls
    ``get_trail_report`` once per page.
    **Input parameters**
    ------------------------------------------------------------------------------
    title: string.  Hike name.
    hikeurl: string. Base URL for the request.
    **Output**
    ------------------------------------------------------------------------------
    None. Appends entry to MongoDB using pymongo.
    **Raises**
    ------------------------------------------------------------------------------
    requests.HTTPError: the server answered with an error status.
    requests.Timeout: the server did not answer in time.
    ValueError: the listing page has no ``div#count-data`` element.
    """
    # The listing is paginated in steps of 5 (its pagination links use
    # b_start:int=5, 10, 15, ...), so 5 is both the page size and the offset step.
    reports_per_page = 5

    # This request only reads the report count; get_trail_report fetches every
    # page again itself, including the first one.
    listing_url = hikeurl + '/@@related_tripreport_listing'
    response = requests.get(listing_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    count_div = soup.find('div', {'id': 'count-data'})
    if count_div is None:
        # Previously this surfaced as an opaque AttributeError on None.text.
        raise ValueError(
            "No 'count-data' element found at {}".format(listing_url)
        )

    total_reports = float(count_div.text)
    num_pages = int(math.ceil(total_reports / reports_per_page))
    for page in range(num_pages):
        get_trail_report(
            title,
            hikeurl,
            params={'b_start:int': str(page * reports_per_page)},
        )

In [58]:
def TripReportBuilder(df):
    """Iterates through the rows of loaded pandas dataframe and calls
    iterate_all_reports for each hike/row that has trip reports.

    **Input parameters**
    ------------------------------------------------------------------------------
    df: pandas dataframe. Dataframe must contain columns entitled 'numReports',
            'hike_name' and 'url'.
    **Output**
    ------------------------------------------------------------------------------
    None. Calls following functions for input of data into MongoDB using Pymongo
    """
    # Walk the three columns in parallel by position, so the loop also works
    # when the dataframe's index is not the default 0..n-1 (e.g. after filtering).
    rows = zip(df['numReports'], df['hike_name'], df['url'])
    for num_reports, hike_name, url in rows:
        # Rows whose 'numReports' value is falsy (e.g. 0) are skipped.
        if num_reports:
            iterate_all_reports(hike_name, url)

In [74]:
# Scrape all trip reports for Mount Ellinor, stored under the trail name 'mt_ellinor'.
iterate_all_reports(
    title='mt_ellinor',
    hikeurl="https://www.wta.org/go-hiking/hikes/mount-ellinor",
)

In [78]:
# Load every document in the trail_reports collection into a DataFrame.
df = pd.DataFrame(data=list(trail_reports.find()))

In [79]:
# Preview the first ten rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,Creator,Date,Report,Trail,Trail_condtions,Votes,_id
0,\nMoritzCrackers,"May 05, 2018",Friday evening my boyfriend and I drove from S...,mt_ellinor,\nBeware of:\n snow conditions\n ...,1,5af0d2feacf3d61d9c0f1fe3
1,\nJunBug,"May 05, 2018","Found women's shoes and gaiters on May 5th, ar...",mt_ellinor,"\nBeware of:\n road, snow conditi...",1,5af0d2feacf3d61d9c0f1fe4
2,\nGKeeffe,"May 04, 2018",started hiking at 7 a.m. and the snow conditio...,mt_ellinor,\nBeware of:\n snow conditions\n ...,12,5af0d2feacf3d61d9c0f1fe5
3,\nOld Mountain Man,"Apr 29, 2018",We climbed Ellinor via the winter route. The r...,mt_ellinor,"\nBeware of:\n road, snow & trail...",13,5af0d2feacf3d61d9c0f1fe6
4,\nGKeeffe,"Apr 27, 2018",Road to lower trail now completely snow free. ...,mt_ellinor,"\nBeware of:\n snow, trail condit...",28,5af0d2feacf3d61d9c0f1fe7
5,\njasonturnerwa,"Apr 25, 2018",Got a late start today and hit it from the low...,mt_ellinor,\nBeware of:\n snow conditions\n ...,8,5af0d2ffacf3d61d9c0f1fe9
6,\nMafHoney,"Apr 22, 2018","Road to the lower trailhead is snow free, but ...",mt_ellinor,\nBeware of:\n snow conditions\n ...,2,5af0d2ffacf3d61d9c0f1fea
7,\nkjmac,"Apr 21, 2018",Wowww such a beautiful place. The views from t...,mt_ellinor,\nBeware of:\n snow conditions\n ...,4,5af0d2ffacf3d61d9c0f1feb
8,\nufda94,"Apr 21, 2018",Driving to the lower trailhead in any vehicle ...,mt_ellinor,"\nBeware of:\n snow, trail condit...",4,5af0d2ffacf3d61d9c0f1fec
9,\njfarias1986,"Apr 21, 2018",Took my step son up here today! Great day for ...,mt_ellinor,"\nBeware of:\n snow, trail condit...",6,5af0d2ffacf3d61d9c0f1fed


In [80]:
# Load every document in the raw_html collection into a DataFrame.
df_html = pd.DataFrame(data=list(raw_html.find()))

In [85]:
# Show the archived HTML stored in the row labelled 3.
df_html.loc[3, 'raw_html']

'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html xmlns="http://www.w3.org/1999/xhtml"><body><div id="count-data">664</div>\n\n    \n    <div class="js-tab-target">\n        <div>\n            \n\n    <!-- Navigation -->\n\n\n  <nav class="pagination">\n\n    <ul>\n\n      \n      \n\n      \n      \n\n      \n      \n\n      \n      \n\n      \n      <li class="active">\n        <span>1</span>\n      </li>\n\n      \n      <li>\n        <a href="https://www.wta.org/go-hiking/hikes/mount-ellinor/@@related_tripreport_listing?b_start:int=5">2</a>\n      </li>\n      <li>\n        <a href="https://www.wta.org/go-hiking/hikes/mount-ellinor/@@related_tripreport_listing?b_start:int=10">3</a>\n      </li>\n      <li>\n        <a href="https://www.wta.org/go-hiking/hikes/mount-ellinor/@@related_tripreport_listing?b_start:int=15">4</a>\n      </li>\n      <li>\n        <a href="https://www.wta.org/go-hiking/hikes/moun