Tutorial: <https://towardsdatascience.com/how-to-web-scrape-with-python-in-4-minutes-bc49186a8460>

In [1]:
import requests
import urllib.request
from urllib.parse import urlparse
import time
from bs4 import BeautifulSoup
from markdownify import markdownify
import json
import os

In [2]:
def get_soup(url):
    response = requests.get(url)
    return BeautifulSoup(response.text,"html.parser")

    
def get_event_project_pages(subdomain):
    x = []
    i=0
    while(True):
        i+=1
        y =  get_event_project_pages_paginated(subdomain,i)
        if len(y) == 0:
            return x
        else:
            x += y

def get_event_project_pages_paginated(subdomain, page):
    url = f'https://{subdomain}.devpost.com/project-gallery?page={page}'
    print(url)
    blocks = get_soup(url).find_all('a',class_="link-to-software")
    return [block['href'] for block in blocks]


def extract_project_page(url):
    soup = get_soup(url)
    content_html = str(get_content_html(soup))
    if(content_html == '' or content_html == 'None'):
        print(f'[!] Empty content html from {url}')
    return {
        'source': url,
        'title': soup.find('meta',{'property':'og:title'})['content'],
        'blurb': soup.find('meta',{'property':'og:description'})['content'],
        'awards': [str(k.next_sibling.string).strip() for k in soup.find_all('span',class_='winner')],
        'videos': [k['src'] for k in soup.find_all(class_='video-embed')],
        'images': [{'title': k['data-title'], 'src': k['href']} for k in soup.find_all('a',{'data-title':True})],
        'team': [extract_user_info(li) for li in soup.find_all('li',class_='software-team-member')],
        'built_with': [a.string for a in soup.find_all('span',class_='cp-tag')],
        'content_html': content_html,
        'content_md': markdownify(str(content_html),heading_style="atx"),
    }

def extract_user_info(li):
    photo_block = li.find('img',class_='software-member-photo')
    bubble = li.find('div',class_='bubble')

    return {
        'name': photo_block['title'],
        'about': '' if bubble is None else bubble.find('p').text,
        'photo': photo_block['src']
    }

def get_content_html(soup):
    div0 = soup.find('div',id='app-details-left')
    if div0 is None: return ''
    return div0.find('div',{'id':False},recursive=False)


In [3]:
# Download everything
def download_all(years):
    for year in years:
        projects = get_event_project_pages(year)
        i=0
        for project_url in projects:
            try:
                our_id = urlparse(project_url).path[len('/software/'):]
                data = extract_project_page(project_url)
                output_file = f'dataset/{year}/p{i:03d}-{our_id}.json'
                os.makedirs(os.path.dirname(output_file), exist_ok=True)
                with open(output_file,'w') as outfile:
                    json.dump(data,outfile,indent=4)
                i+=1
            except Exception as e:
                print(f"ERROR handling {project_url}")
                raise e

In [8]:
years=['realityvrhack','arvr-hackathon-2017','rv2019','mit-reality-hack-2020']
download_all(years)

https://realityvrhack.devpost.com/project-gallery?page=1
https://realityvrhack.devpost.com/project-gallery?page=2
https://realityvrhack.devpost.com/project-gallery?page=3
https://realityvrhack.devpost.com/project-gallery?page=4
https://realityvrhack.devpost.com/project-gallery?page=5
ERROR handling https://devpost.com/software/unnamed-vr-project


From cffi callback <function _verify_callback at 0x00000299992AB0D8>:
Traceback (most recent call last):
  File "C:\Users\Wiley\anaconda3\lib\site-packages\OpenSSL\SSL.py", line 311, in wrapper
    @wraps(callback)
KeyboardInterrupt


SSLError: HTTPSConnectionPool(host='devpost.com', port=443): Max retries exceeded with url: /software/unnamed-vr-project (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])")))

In [4]:
download_all(['realityfest-exhibition-hall'])

https://realityfest-exhibition-hall.devpost.com/project-gallery?page=1
https://realityfest-exhibition-hall.devpost.com/project-gallery?page=2


In [11]:
# Combine
def read_one(filename):
    with open(filename) as f:
        return json.load(f)
        
combined = {year: [read_one(f"./dataset/{year}/{proj}") for proj in os.listdir(f"./dataset/{year}")] for year in os.listdir("./dataset")}

with open(f"./dataset/combined.json",'w') as outfile:
    json.dump(combined,outfile,indent=4)

NotADirectoryError: [WinError 267] The directory name is invalid: './dataset/combined.json'