In [156]:
# coding:utf-8
# Textual analysis through the open web
from __future__ import print_function
import io
import os
import bs4
import sys
import json
import time
import urllib
import requests
import wikipedia

In [183]:
def read_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return its content.

    A "Loaded <path>" status line is printed once the file has been parsed.
    Errors from opening the file or from the JSON parser propagate to the
    caller.
    """
    with io.open(path, 'r', encoding='utf-8') as handle:
        parsed = json.load(handle)
        print(__message('Loaded {}'.format(path)))
    return parsed
    
def modify_game_meta(data):
    """Normalise every record of ``data`` in place and return ``data``.

    ``data`` maps a game title to a dict of metadata. For each record:

    * 'Last', 'First' and 'Title' are removed;
    * 'Year' is set to the integer made of the first four characters of
      'First' (a record without 'First' raises KeyError);
    * 'Summary' is set to the result of ``scrape_wiki(title)``.
    """
    for title, record in data.items():
        record.pop('Last', None)
        # 'First' is read before it is dropped; only its 4-character prefix
        # is kept, as an integer.
        record['Year'] = int(record['First'][:4])
        record.pop('First', None)
        record.pop('Title', None)
        record['Summary'] = scrape_wiki(title)

#         # Scraping Duck Duck Go (currently disabled)
#         print(__message('Scraping DuckDuckGo for a list of URLs...'))
#         new_links = scrape_duckduckgo(k, data[k]['Developer'])
#         data[k]['Links'] = new_links | set(data[k]['Links'])

    print(__message('Done'))
    return data

def scrape_wiki(title):
    """Return the Wikipedia summary for the game called ``title``.

    Lookup order:

    1. ``wikipedia.summary(title)``.
    2. If step 1 is ambiguous, the exact page "<title>_(video_game)" and,
       only if that is ambiguous too, "<title>_(Unix_video_game)".
    3. If step 1 finds no page, the exact page ``title`` (no auto-suggest).

    Every outcome is reported with a printed log line. When no page can be
    resolved, the placeholder 'No summary found on Wikipedia.' is returned
    instead of raising, so one unresolvable title cannot abort a batch run.
    Errors other than ``DisambiguationError`` / ``PageError`` still
    propagate.
    """
    not_found = 'No summary found on Wikipedia.'
    searchstring = title
    try:
        summary = wikipedia.summary(searchstring)
    except wikipedia.DisambiguationError:
        summary = None
        for qualifier in (u'video game', u'Unix video game'):
            searchstring = u'{} ({})'.format(title, qualifier).replace(u' ', u'_')
            try:
                summary = wikipedia.page(searchstring, auto_suggest=False).summary
                break
            except wikipedia.DisambiguationError:
                # Still ambiguous: fall through to the next qualifier. After
                # the last qualifier this ends the loop with no summary.
                continue
            except wikipedia.PageError:
                # A missing page ends the search, as it always did for the
                # first qualifier.
                break
        if summary is None:
            # Unicode literal: a byte-string template fails on Python 2 when
            # the title contains non-ASCII characters.
            print(__warning(u'Wikipedia cannot find "{}"'.format(searchstring)))
            return not_found
    except wikipedia.PageError:
        try:
            summary = wikipedia.page(searchstring, auto_suggest=False).summary
        except (wikipedia.PageError, wikipedia.DisambiguationError):
            # The exact-title retry can be ambiguous as well as missing.
            print(__warning(u'Search term "{}" returned nothing'.format(searchstring)))
            return not_found
    print(__success(searchstring))
    return summary

def scrape_duckduckgo(keywords, developer=""):
    """Search DuckDuckGo's HTML endpoint and return the result URLs.

    The query is '"<keywords>" <developer> interview game'. Sponsored results
    are skipped, as are results without a usable link.

    Args:
        keywords: search phrase; it is wrapped in double quotes in the query.
        developer: optional extra search term.

    Returns:
        A list of the ``href`` values of the organic results (possibly empty).

    Raises:
        Whatever ``requests.get`` raises for connection errors or timeouts.
    """
    searchstring = u'"{}" {} {}'.format(keywords, developer, u'interview game')
    # Let requests build and percent-encode the query string. Hand-quoting
    # with urllib.quote fails on non-ASCII unicode under Python 2 and the
    # function does not exist under that name on Python 3.
    response = requests.get(
        'http://duckduckgo.com/html/',
        params={'q': searchstring},
        timeout=(10, 15)
    )
    print(__message(u'DDG: {}'.format(searchstring)))
    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    links = []
    for node in soup.select('div.web-result'):
        if 'web-result-sponsored' in node['class']:
            continue
        anchors = node.select('a.large')
        href = anchors[0].get('href') if anchors else None
        if not href:
            # Report and skip instead of recording None as a link.
            print(__failure(u'DDG result without a link for: {}'.format(searchstring)))
            continue
        links.append(href)
    if links:
        print(__success(u'DDG: {}'.format(searchstring)))
    return links

def scrape_url(url):
    """Fetch ``url`` and return the response body as text.

    This is a best-effort fetch: any exception is printed as a failure line
    and ``None`` is returned instead of raising.
    """
    body = None
    try:
        body = requests.get(url, timeout=(10, 15)).text
    except Exception as err:
        print(__failure(err))
    return body

def clean_url_data(links, data):
    """Remove every entry of ``data`` whose key is not listed in ``links``.

    Args:
        links: iterable of keys to keep.
        data: dict to prune; it is modified in place.

    Returns:
        The same ``data`` dict, after pruning.
    """
    wanted = set(links)
    # Iterate over a snapshot of the keys: deleting from a dict while
    # iterating its live key view raises RuntimeError on Python 3.
    for link in list(data):
        if link not in wanted:
            del data[link]
    return data
    
def save_json(path, data):
    """Serialise ``data`` as indented JSON and write it to ``path`` in UTF-8.

    Non-ASCII characters are written as-is (``ensure_ascii=False``). A
    "Written to <path>" status line is printed after the file is closed.

    Raises:
        TypeError / ValueError from ``json.dumps`` when ``data`` cannot be
        serialised; in that case ``path`` is left untouched.
    """
    # Serialise before opening the file: opening with 'w' truncates it, so a
    # serialisation failure must happen first to preserve the existing file.
    output = json.dumps(data, indent=2, ensure_ascii=False)
    # On Python 2, dumps(..., ensure_ascii=False) may return a byte string,
    # which a text-mode io file refuses to write. Normalise to text.
    if isinstance(output, bytes):
        output = output.decode('utf-8')
    with io.open(path, 'w', encoding='utf-8') as f:
        f.write(output)
    print(__message(u'Written to {}'.format(path)))
        
def __success(text):
    """Build a '(SUCC)' log line for ``text`` and return it UTF-8 encoded."""
    line = u'  (SUCC) {}'.format(text)
    return line.encode('utf-8')
    
def __failure(text):
    """Build a '!!FAIL!!' log line for ``text`` and return it UTF-8 encoded."""
    line = u'!!FAIL!! {}'.format(text)
    return line.encode('utf-8')
    
def __warning(text):
    """Build a '??WARN??' log line for ``text`` and return it UTF-8 encoded."""
    line = u'??WARN?? {}'.format(text)
    return line.encode('utf-8')
    
def __message(text):
    """Build a '|MSG|' log line for ``text`` and return it UTF-8 encoded."""
    line = u'   |MSG| {}'.format(text)
    return line.encode('utf-8')

In [185]:
# Driver: load the raw inputs, normalise the metadata, drop article data that
# is no longer referenced, and write both results under ./generated.
cwd = os.getcwd()

game_meta = read_json(os.path.join(cwd, 'data', 'game-sources.json'))
game_articles = read_json(os.path.join(cwd, 'data', 'game-articles.json'))

# Format and standardise metadata
game_meta = modify_game_meta(game_meta)

# Clean up unused data in processing
# Iterate over a snapshot of the items: entries are popped from game_articles
# inside the loop, which raises RuntimeError on Python 3 when iterating the
# live items() view.
for article, data in list(game_articles.items()):
    if article not in game_meta:
        # No metadata for this game any more: discard its articles entirely.
        game_articles.pop(article, None)
        continue
    # Keep only the articles whose URL is still listed in the game's 'Links'.
    game_articles[article] = clean_url_data(game_meta[article]['Links'], data)

# # Scrape new data from URL
# for k, v in game_meta.items():
#     for link in game_meta[k]['Links']:
#         if k not in game_articles:
#             game_articles[k] = {}
#         if link not in game_articles[k] or not game_articles[k][link]:
#             game_articles[k][link] = scrape_url(link)
#             print(__message('Scraped {} in {}'.format(k, link)))
            
save_json(os.path.join(cwd, 'generated', 'processed-game-articles.json'), game_articles)
save_json(os.path.join(cwd, 'generated', 'processed-game-sources.json'), game_meta)

   |MSG| Loaded /Users/spaxe/dev/roguelike-universe/data/game-sources.json
   |MSG| Loaded /Users/spaxe/dev/roguelike-universe/data/game-articles.json
  (SUCC) Dungeon
  (SUCC) Falcon's Eye
  (SUCC) Island of Kesmai
  (SUCC) Torneko no Daiboken: Fushigi no Dungeon
  (SUCC) Pokémon Mystery Dungeon: Explorers of Sky
  (SUCC) Linley's Dungeon Crawl
  (SUCC) Tales of Middle Earth
  (SUCC) TowerClimb
  (SUCC) Teleglitch
  (SUCC) Brogue
??WARN?? Search term "Deadly Dungeons" returned nothing
  (SUCC) Advanced Dungeons & Dragons: Cloudy Mountain
  (SUCC) Dungeon Hack
  (SUCC) Dungeon of Doom
  (SUCC) Dungeon Crawl Stone Soup
  (SUCC) Slash'EM
  (SUCC) Beneath Apple Manor
  (SUCC) Mystery Dungeon: Shiren the Wanderer
  (SUCC) Smart Kobold
  (SUCC) Larn_(video_game)
  (SUCC) Lost Labyrinth
  (SUCC) Tower of Guns
  (SUCC) Omega (video game)
  (SUCC) UnReal World
  (SUCC) Dragon Quest: Shonen Yangus to Fushigi no Dungeon
  (SUCC) Wizards Encounters
  (SUCC) Rogue Legacy
  (SUCC) The Binding of Is