# Scrape Problems
In this notebook we use the Selenium WebDriver to scrape problem data from the MoonBoard website. The general approach is to scan through each page of problems on the website and extract the URL of each individual problem. We then visit each URL individually and extract the holds used for that problem.

In [1]:
import os
import copy
import json
import time
import string
import pickle

from bs4 import BeautifulSoup
from selenium import webdriver

## Helper functions

In [2]:
def find_element(browser, tag_name, attribute, value, num_tries=10, sleep_val=1):
    """
    Finds the first element of a given tag name whose attribute equals a value.

    The page is polled up to `num_tries` times. Between unsuccessful polls the
    function waits `sleep_val` seconds, whether the poll failed because no
    element matched yet or because the lookup raised (e.g. the page was being
    re-rendered). Worst-case wait is therefore `(num_tries - 1) * sleep_val`.

    Args:
        browser: object exposing `find_elements_by_tag_name(tag_name)`.
        tag_name: HTML tag to search for.
        attribute: name of the attribute to compare.
        value: value the attribute must equal exactly.
        num_tries: maximum number of polls.
        sleep_val: seconds to wait between polls.

    Returns:
        The first matching element, or None (after printing a message) when
        nothing matched within `num_tries` polls.
    """
    for attempt in range(num_tries):
        try:
            for candidate in browser.find_elements_by_tag_name(tag_name):
                if candidate.get_attribute(attribute) == value:
                    return candidate
        except Exception:
            # Lookup failed mid-way; treat it like "not found yet" and poll
            # again. KeyboardInterrupt is deliberately not swallowed.
            pass
        # Give the page time to change before asking again, but do not
        # waste time sleeping after the final attempt.
        if attempt < num_tries - 1:
            time.sleep(sleep_val)

    print('Failed to find ' + str(attribute) + ' ' + str(value))
    return None

def find_and_click(browser, tag_name, attribute, value):
    """
    Finds a specific HTML element and clicks on it.

    Args:
        browser: browser object passed through to `find_element`.
        tag_name: HTML tag to search for.
        attribute: name of the attribute to compare.
        value: value the attribute must equal.

    Returns:
        The element that was clicked.

    Raises:
        AttributeError: if no matching element was found. The type matches
            what calling `.click()` on None raised before, so callers that
            retry on that failure keep working, but the message now names
            the element that was missing.
    """
    elem = find_element(browser, tag_name, attribute, value)
    if elem is None:
        raise AttributeError(
            'Cannot click: no <%s> element with %s=%r was found'
            % (tag_name, attribute, value)
        )
    elem.click()
    return elem

def process_all_problems(browser, problems_dict):
    """
    Collects metadata for every problem on the current page.

    Problems whose id is already a key of `problems_dict` are skipped; new
    ones are added in place. The same dictionary is returned.
    """
    rows, uids = get_problems(browser)
    for uid, row in zip(uids, rows):
        if uid not in problems_dict:
            problems_dict[uid] = get_problem_meta(row)
    return problems_dict

def get_problems(browser):
    """
    Returns the problem rows on the current page together with their ids.

    A `<tr>` counts as a problem row when it has a `data-uid` attribute and
    its `onclick` attribute is exactly 'problemSelected();'.

    Returns:
        (problems, data_ids): two parallel lists, the row elements and their
        `data-uid` values, in page order.
    """
    matched = []
    for row in browser.find_elements_by_tag_name('tr'):
        uid = row.get_attribute('data-uid')
        handler = row.get_attribute('onclick')
        if uid is None or handler != 'problemSelected();':
            continue
        matched.append((uid, row))

    data_ids = [uid for uid, _ in matched]
    problems = [row for _, row in matched]
    return problems, data_ids

def get_problem_meta(problem):
    """
    Builds the metadata dictionary for one problem row.

    Returns a dict with:
        'problem_name': text of the row's first <h3>
        'info': text of every <p> inside the row
        'url': href of the first <a> inside that <h3>
    """
    heading = problem.find_elements_by_tag_name('h3')[0]
    name = heading.text
    info = [para.text for para in problem.find_elements_by_tag_name('p')]
    link = heading.find_elements_by_tag_name('a')[0]

    return {
        'problem_name': name,
        'info': info,
        'url': link.get_attribute('href'),
    }

def click_next_page(browser, current_page=1):
    """
    Clicks the pager link that leads to the page after `current_page`.

    The link is located as the <a> whose `data-page` attribute equals
    `current_page + 1`; the clicked element is returned.
    """
    target_page = str(current_page + 1)
    return find_and_click(browser, 'a', 'data-page', target_page)




## 1. Open browser, login, and navigate to the problems page.

In [3]:
# Load an incognito Chrome browser

# Location of the chromedriver executable. Set the CHROMEDRIVER_PATH
# environment variable to run this notebook on another machine; the
# original local path is kept as the fallback.
driver_path = os.environ.get(
    'CHROMEDRIVER_PATH',
    r"C:\Users\Tom\Downloads\chromedriver_win32\chromedriver.exe",
)

options = webdriver.ChromeOptions()
options.add_argument('--incognito')

browser = webdriver.Chrome(executable_path=driver_path, options=options)
browser.set_window_size(1500, 910)

In [4]:
# load moonboard website
browser.get('https://moonboard.com/')


# Click login area
# login_elem must exist before the check below; without this initialisation
# a page with no matching link raised NameError instead of printing.
login_elem = None
a_elems = browser.find_elements_by_tag_name('a')
for a in a_elems:
    if a.text == 'LOGIN/REGISTER':
        login_elem = a
        break
if login_elem is None:
    print('Failed to find Login Button')
else:
    login_elem.click()


# Fill in credentials and login
# Credentials are read from the environment so they never have to be saved
# in the notebook; they default to empty strings as before.
username = os.environ.get('MOONBOARD_USERNAME', '')
password = os.environ.get('MOONBOARD_PASSWORD', '')

username_elem = None
password_elem = None
input_field_elems = browser.find_elements_by_tag_name('input')

# Iterate through input field elements
for field in input_field_elems:
    placeholder = field.get_attribute('placeholder')
    if placeholder == 'Username':
        username_elem = field
    if placeholder == 'Password':
        password_elem = field

# Populate fields
if username_elem is not None and password_elem is not None:
    username_elem.send_keys(username)
    password_elem.send_keys(password)
else:
    print('Failed to find Username/Password fields')


# click login
find_and_click(browser, 'button', 'type', 'submit')

<selenium.webdriver.remote.webelement.WebElement (session="baf55c85f7f74e3981f30be061764310", element="439f6c22-8cac-4e6f-bf84-89b1d0a3e28b")>

In [5]:
# Click through to the "view problems" page
click_problems = find_and_click(browser, 'a', 'id', 'lProblems')
click_view = find_and_click(browser, 'li', 'id', 'm-viewproblem')

# Text of the dropdown option to select
hold_setup_name = 'MoonBoard Masters 2017'

# Find the hold setup dropdown. find_element polls the page several times,
# so a dropdown that is not present on the first lookup can still be found.
holdsetup_select = find_element(browser, 'select', 'id', 'Holdsetup')
if holdsetup_select is None:
    raise RuntimeError("Hold setup dropdown (select#Holdsetup) was not found")

# Pick the dropdown item with the requested text (the last one if several match)
target_elem = None
for e in holdsetup_select.find_elements_by_tag_name('option'):
    if e.text == hold_setup_name:
        target_elem = e
if target_elem is None:
    raise RuntimeError("Hold setup option %r was not found" % hold_setup_name)

# Select hold configuration
target_elem.click()


## 2. Click through each page of problems and extract problem urls.
Set `num_pages = -1` to process all pages.

In [6]:
problems_dict = {}

max_tries = 20     # attempts per page for processing / for clicking "next"
stale_tries = 5    # attempts to wait for a page that shows no new problems
retry_delay = 1    # seconds between attempts
num_pages = 10     # number of pages to process; -1 processes all pages

page_cnt = 0
found_page = True
current_page = 1
while found_page:
    page_cnt += 1

    # Collect the problems listed on the current page.
    known_before = len(problems_dict)
    for attempt in range(max_tries):
        try:
            problems_dict = process_all_problems(browser, problems_dict)
        except Exception:
            print('Failed to process problems on page ' + str(current_page))
            time.sleep(retry_delay)
            continue

        # NOTE(review): if nothing new was collected, the table is presumably
        # still showing the previous page (the pager click returns before the
        # rows are replaced) -- wait a little and read it again. After
        # `stale_tries` attempts the page is accepted as-is so that pages
        # consisting only of already-known problems do not stall the run.
        if len(problems_dict) > known_before or attempt >= stale_tries - 1:
            print('Processed page: %s!' % current_page)
            break
        time.sleep(retry_delay)

    # Stop before navigating once the requested number of pages is done, so
    # the browser is not moved to a page that will never be processed.
    if page_cnt == num_pages:
        break

    page_elem = None
    for attempt in range(max_tries):
        try:
            page_elem = click_next_page(browser, current_page)
            print('Clicked page %s' % (current_page + 1))
            break
        except Exception:
            print('Failed to click page %s' % (current_page + 1))
            time.sleep(retry_delay)

    # If the end of pages is reached
    if page_elem is None:
        break

    # Flip to next page
    current_page += 1

Processed page: 1!
Clicked page 2
Processed page: 2!
Clicked page 3
Processed page: 3!
Clicked page 4
Processed page: 4!
Clicked page 5
Failed to process problems on page 5
Processed page: 5!
Clicked page 6
Processed page: 6!
Clicked page 7
Processed page: 7!
Clicked page 8
Failed to process problems on page 8
Processed page: 8!
Clicked page 9
Processed page: 9!
Clicked page 10
Processed page: 10!
Clicked page 11


In [16]:
import json

# Checkpoint the collected problem metadata on disk, then load it straight
# back so the following cells work from the saved copy.
raw_problems_path = 'rawproblems1.json'

with open(raw_problems_path, 'w') as out_file:
    out_file.write(json.dumps(problems_dict))

with open(raw_problems_path, 'r') as in_file:
    problems_dict = json.loads(in_file.read())

## 3. Visit each problem url and extract holds used in each problem.

In [9]:

uids = sorted(problems_dict.keys())

# Iterate through each problem
for i, uid in enumerate(uids):
    print(str(i + 1) + ' / ' + str(len(uids)))
    tmp_problem = copy.deepcopy(problems_dict[uid])

    browser.get(tmp_problem['url'])  # Navigate to page

    # The problem data is embedded in an inline <script> as
    #   var problem = JSON.parse('...');
    # Retry a few times in case the page source is not complete yet.
    for x in range(10):
        try:
            target_text = ''
            soup = BeautifulSoup(browser.page_source, 'html.parser')
            for s in soup.find_all("script"):
                if 'JSON' in s.text:
                    target_text = s.text

            json_dict = json.loads(target_text.split("var problem = JSON.parse('")[1].split("');")[0])
            tmp_problem['moves'] = json_dict['Moves']
            # Success: stop retrying. Without this break every page was
            # parsed ten times even when the first attempt worked.
            break
        except Exception:
            print('Failed to find moves')
            time.sleep(1)
    else:
        # All attempts failed; the problem is kept without a 'moves' entry.
        print('Giving up on moves for problem ' + str(uid))

    problems_dict[uid] = tmp_problem
                



1 / 61
2 / 61
3 / 61
4 / 61
5 / 61
6 / 61
7 / 61
8 / 61
9 / 61
10 / 61
11 / 61
12 / 61
13 / 61
14 / 61
15 / 61
16 / 61
17 / 61
18 / 61
19 / 61
20 / 61
21 / 61
22 / 61
23 / 61
24 / 61
25 / 61
26 / 61
27 / 61
28 / 61
29 / 61
30 / 61
31 / 61
32 / 61
33 / 61
34 / 61
35 / 61
36 / 61
37 / 61
38 / 61
39 / 61
40 / 61
41 / 61
42 / 61
43 / 61
44 / 61
45 / 61
46 / 61
47 / 61
48 / 61
49 / 61
50 / 61
51 / 61
52 / 61
53 / 61
54 / 61
55 / 61
56 / 61
57 / 61
58 / 61
59 / 61
60 / 61
61 / 61


In [10]:
# Preview only the first few problems; displaying the whole dictionary
# floods the notebook with output.
dict(list(problems_dict.items())[:3])

{'372426': {'problem_name': 'KRAFTIS 1.1',
  'info': ['Kimi',
   'Be the first to repeat this problem',
   '6A',
   'Feet follow hands',
   '25° MoonBoard'],
  'url': 'https://moonboard.com/Problems/View/372426/kraftis-1-1',
  'moves': [{'Id': 2154996,
    'Description': 'I4',
    'IsStart': True,
    'IsEnd': False},
   {'Id': 2154997, 'Description': 'F6', 'IsStart': True, 'IsEnd': False},
   {'Id': 2154998, 'Description': 'J9', 'IsStart': False, 'IsEnd': False},
   {'Id': 2154999, 'Description': 'H11', 'IsStart': False, 'IsEnd': False},
   {'Id': 2155000, 'Description': 'I15', 'IsStart': False, 'IsEnd': False},
   {'Id': 2155001, 'Description': 'K17', 'IsStart': False, 'IsEnd': False},
   {'Id': 2155002, 'Description': 'K13', 'IsStart': False, 'IsEnd': False},
   {'Id': 2155003, 'Description': 'J18', 'IsStart': False, 'IsEnd': True}]},
 '372424': {'problem_name': 'JOELQUISGAIS',
  'info': ['Sergio jimenez del rio',
   'Be the first to repeat this problem',
   '6B',
   'Feet follow ha

In [23]:
import json

# Write the final problem data (metadata plus moves) to disk as JSON.
with open('problems1.json', 'w') as out_file:
    out_file.write(json.dumps(problems_dict))



In [24]:
# Load the saved file back to confirm it can be read.
with open('problems1.json', 'r') as in_file:
    test = json.loads(in_file.read())