In [55]:
# coding:utf-8
# Textual analysis through the open web
from __future__ import print_function
import io
import os
import re
import bs4
import sys
import json
import time
import nltk
import urllib
import pprint
import random
import requests
import wikipedia
import collections

In [2]:
def scrape_wiki(title):
    try:
        searchstring = title
        summary = wikipedia.summary(searchstring)
        print(__success(searchstring))
    except wikipedia.DisambiguationError:
        try:
            searchstring = u'{} (video game)'.format(title).replace(u' ', u'_')
            summary = wikipedia.page(searchstring, auto_suggest=False).summary
            print(__success(searchstring))
        except wikipedia.DisambiguationError:
            searchstring = u'{} (Unix video game)'.format(title).replace(u' ', u'_')
            summary = wikipedia.page(searchstring, auto_suggest=False).summary
            print(__success(searchstring))
        except wikipedia.PageError:
            summary = 'No summary found on Wikipedia.'
            print(__warning(u'Wikipedia cannot find "{}"'.format(searchstring)))
    except wikipedia.PageError:
        try:
            summary = wikipedia.page(searchstring, auto_suggest=False).summary
            print(__success(searchstring))
        except wikipedia.PageError:
            summary = 'No summary found on Wikipedia.'
            print(__warning(u'Search term "{}" returned nothing'.format(searchstring)))
    return summary

def scrape_duckduckgo(keywords, developer=""):
    searchstring = u'"{}" {} {}'.format(keywords, developer, u'interview game')
    response = requests.get(u'http://duckduckgo.com/html/?q={}'.format(
                urllib.quote(searchstring.encode('utf-8'))),
                            timeout=(9.1, 12.1)
             )
    print(__message(u'DDG: {}'.format(searchstring)))
    soup = bs4.BeautifulSoup(response.text)
    links = []
    for node in soup.select('div.web-result'):
        if 'web-result-sponsored' in node['class']:
            continue
        try:
            links.append(node.select('a.large')[0].get('href'))
        except Exception as e:
            print(__failure(e))
            pass
    if links:
        print(__success(u'DDG: {}'.format(searchstring)))
        print('\t\t\n'.join(links))
    return links

def scrape(url):
    try:
        response = requests.get(url, timeout=(10, 15))
    except Exception as e:
        print(__failure(e))
        return ''
    
    return response.text
    
def read_json(path):
    data = ''
    with io.open(path, 'r', encoding='utf-8') as f:
        data = json.loads(f.read())
        print(__message(u'Loaded {}'.format(path)))
    return data
    
def save_json(path, data):
    with io.open(path, 'w', encoding='utf-8') as f:
        try:
            output = json.dumps(data, indent=2, ensure_ascii=False)
            f.write(output)
        except UnicodeEncodeError:
            f.write(output.encode('utf-8'))
    print(__message(u'Written to {}'.format(path)))
        
def __success(text):
    return u'  (SUCC) {}'.format(text).encode('utf-8')
    
def __failure(text):
    return u'!!FAIL!! {}'.format(text).encode('utf-8')
    
def __warning(text):
    return u'??WARN?? {}'.format(text).encode('utf-8')
    
def __message(text):
    return u'   |MSG| {}'.format(text).encode('utf-8')

In [None]:
# Scrape for links
game_meta = read_json(os.path.join(os.getcwd(), 'data', 'game-sources.json'))

shuffled_game_meta = game_meta.items()
random.shuffle(shuffled_game_meta)
for game, meta in shuffled_game_meta:
    game_meta[game]['Links'] += scrape_duckduckgo(game, game_meta[game]['Developer'])
    game_meta[game]['Links'] = list(set(game_meta[game]['Links']))
    save_json(os.path.join(os.getcwd(), 'data', 'game-sources.json'), game_meta)
    time.sleep(1)

In [67]:
# Load URLs in interviews
game_meta = read_json(os.path.join(os.getcwd(), 'data', 'game-sources.json'))
cached = read_json(os.path.join(os.getcwd(), 'data', 'interviews.json'))

output = cached
for game, meta in game_meta.items():
    if game not in output:
        output[game] = {}
    print(__message(game))
    for url in meta['Links']:
        if url in output[game]:
            continue
        print(__message(url))
        html = scrape(url)
        if 'interview' in url.lower() or 'interview' in html.lower() or \
           'review' in url.lower() or 'review' in html.lower() :
            soup = bs4.BeautifulSoup(html)
            content = soup.select('div > p') + soup.select('body > p')
            data = [c.string.strip() for c in content if c.string and c.string.strip()]
            output[game][url] = data
            save_json(os.path.join(os.getcwd(), 'data', 'corpus.json'), output)
        else:
            output[game][url] = []
        save_json(os.path.join(os.getcwd(), 'data', 'corpus.json'), output)

   |MSG| Loaded /Users/spaxe/dev/roguelike-universe/data/game-sources.json
   |MSG| Loaded /Users/spaxe/dev/roguelike-universe/data/interviews.json
   |MSG| Dungeon
   |MSG| Falcon's Eye
   |MSG| Island of Kesmai
   |MSG| Torneko no Daiboken: Fushigi no Dungeon
   |MSG| Pokémon Mystery Dungeon: Explorers of Sky
   |MSG| Linley's Dungeon Crawl
   |MSG| TowerClimb
   |MSG| Teleglitch
   |MSG| Brogue
   |MSG| Weird Worlds: Return to Infinite Space
   |MSG| Advanced Dungeons & Dragons: Cloudy Mountain
   |MSG| Dungeon Hack
   |MSG| Dungeon of Doom
   |MSG| Slash'EM
   |MSG| Beneath Apple Manor
   |MSG| Torneko: The Last Hope
   |MSG| Smart Kobold
   |MSG| Larn
   |MSG| Lost Labyrinth
   |MSG| Tower of Guns
   |MSG| UnReal World
   |MSG| Dragon Quest: Shonen Yangus to Fushigi no Dungeon
   |MSG| Dungeons of the Unforgiven
   |MSG| Rogue
   |MSG| Rogue Legacy
   |MSG| The Binding of Isaac
   |MSG| Dark Chronicle
   |MSG| Mystery Dungeon: Shiren the Wanderer
   |MSG| Risk of Rain
   |MSG| Anc

In [34]:
# Locate mentions of games
game_meta = read_json(os.path.join(os.getcwd(), 'data', 'game-sources.json'))
game_interviews = read_json(os.path.join(os.getcwd(), 'data', 'interviews.json'))

# Create a look up table for games
game_LUT = {}
for game, meta in game_meta.items():
    game_LUT[game] = game
    if 'AKA' in meta:
        for aka in meta['AKA']:
            game_LUT[aka] = game
                
pprint.pprint(game_LUT, indent=2)

   |MSG| Loaded /Users/spaxe/dev/roguelike-universe/data/game-sources.json
   |MSG| Loaded /Users/spaxe/dev/roguelike-universe/data/interviews.json
{ u'100 Rogues': u'100 Rogues',
  u'99 Levels to Hell': u'99 Levels to Hell',
  u'A.D.O.M.': u'Ancient Domains of Mystery',
  u'ADOM': u'Ancient Domains of Mystery',
  u'ADVENT': u'Adventure',
  u'Advanced Dungeons & Dragons: Cloudy Mountain': u'Advanced Dungeons & Dragons: Cloudy Mountain',
  u'Adventure': u'Adventure',
  u'Alphaman': u'Alphaman',
  u'Ancient Domains of Mystery': u'Ancient Domains of Mystery',
  u'Angband': u'Angband',
  u'Azure Dreams': u'Azure Dreams',
  u'Baroque': u'Baroque',
  u'Beneath Apple Manor': u'Beneath Apple Manor',
  u'Blue Rescue Team': u'Pok\xe9mon Mystery Dungeon',
  u'Brogue': u'Brogue',
  u'Castle of the Winds': u'Castle of the Winds',
  u'Chocobo no Fushigina Dungeon': u'Chocobo no Fushigina Dungeon',
  u"Chocobo's Dungeon 2": u"Chocobo's Dungeon 2",
  u"Chocobo's Mysterious Dungeon": u'Chocobo no Fushi

In [63]:
# Look through the interview articles
relations = {}
for game, interviews in game_interviews.items():
#     print(__message(game))
    relations[game] = []
    counter = collections.Counter()
    for url, interview in interviews.items():
        for paragraph in interview:
            for title in sorted(game_LUT.keys(), reverse=True, key=lambda x: len(x)):
                if game_LUT[title] == game:
                    continue
                try:
                    i = paragraph.index(title)
                    if (i-1          >= 0              and paragraph[i-1].isalpha()) or \
                       (i+len(title) <  len(paragraph) and paragraph[i+len(title)].isalpha()):
                        continue
                    counter[game_LUT[title]] += 1
                    paragraph.replace(title, '')
                except ValueError:
                    continue
    frequent_games = counter.most_common(3)
    relations[game] = frequent_games
pprint.pprint(relations, indent=2)

{ u'100 Rogues': [(u'Rogue', 13)],
  u'99 Levels to Hell': [(u'Spelunky', 10), (u'Dungeon', 1)],
  u'Advanced Dungeons & Dragons: Cloudy Mountain': [],
  u'Adventure': [],
  u'Alphaman': [],
  u'Ancient Domains of Mystery': [ (u'Dungeon Crawl Stone Soup', 10),
                                   (u'Dungeons of Dredmor', 8),
                                   (u'Brogue', 4)],
  u'Angband': [(u'Rogue', 2)],
  u'Azure Dreams': [],
  u'Baroque': [],
  u'Beneath Apple Manor': [(u'Adventure', 4), (u'NetHack', 4), (u'Rogue', 3)],
  u'Brogue': [],
  u'Castle of the Winds': [],
  u'Chocobo no Fushigina Dungeon': [(u'Baroque', 1), (u'Adventure', 1)],
  u"Chocobo's Dungeon 2": [(u'Dungeon', 1), (u'Hack', 1)],
  u'Dark Chronicle': [(u'Rogue', 24), (u'Dark Cloud', 14), (u'Hack', 1)],
  u'Dark Cloud': [(u'Rogue', 27), (u'Dark Chronicle', 5), (u'Adventure', 1)],
  u'Deadly Dungeons': [(u'Dungeon Hack', 1), (u'Hack', 1)],
  u"Don't Starve": [(u'Adventure', 3)],
  u'Doom, the Roguelike': [],
  u'Dragon 