In [1]:
# coding:utf-8
# Textual analysis through the open web
from __future__ import print_function
import io
import os
import re
import bs4
import sys
import json
import time
import nltk
import urllib
import pprint
import random
import string
import requests
import wikipedia
import itertools
import collections

In [4]:
def scrape_wiki(title):
    """Return the Wikipedia summary for ``title``, or a placeholder string.

    Lookup order:
      1. ``wikipedia.summary(title)``.
      2. If that raises ``PageError``: the exact title again with
         ``auto_suggest=False``.
      3. If either step raises ``DisambiguationError``: the title suffixed
         with "(video game)", then with "(Unix video game)".

    Every attempt is reported through the module's ``__success`` /
    ``__warning`` formatters.

    :param title: title of the game to look up.
    :returns: the page summary, or ``'No summary found on Wikipedia.'`` when
        no attempt succeeds.
    """
    not_found = 'No summary found on Wikipedia.'
    searchstring = title
    try:
        summary = wikipedia.summary(searchstring)
        print(__success(searchstring))
        return summary
    except wikipedia.DisambiguationError:
        # Ambiguous title: fall through to the qualified candidates below.
        pass
    except wikipedia.PageError:
        try:
            summary = wikipedia.page(searchstring, auto_suggest=False).summary
            print(__success(searchstring))
            return summary
        except wikipedia.PageError:
            print(__warning(u'Search term "{}" returned nothing'.format(searchstring)))
            return not_found
        except wikipedia.DisambiguationError:
            # Previously uncaught; treat it like an ambiguous first lookup.
            pass

    # Qualified page titles, tried in order. A DisambiguationError moves on
    # to the next candidate; a PageError ends the search (as the original
    # did for the first candidate). Neither is allowed to escape any more.
    for qualifier in (u'video game', u'Unix video game'):
        searchstring = u'{} ({})'.format(title, qualifier).replace(u' ', u'_')
        try:
            summary = wikipedia.page(searchstring, auto_suggest=False).summary
        except wikipedia.DisambiguationError:
            continue
        except wikipedia.PageError:
            break
        print(__success(searchstring))
        return summary

    print(__warning(u'Wikipedia cannot find "{}"'.format(searchstring)))
    return not_found

def scrape_duckduckgo(keywords, developer=""):
    """Search DuckDuckGo's HTML endpoint and return the result links.

    The query is ``"<keywords>" <developer> interview game``. Sponsored
    results are skipped, and results without a usable link are reported and
    skipped rather than stored.

    :param keywords: phrase to search for (quoted in the query).
    :param developer: optional extra search term; defaults to empty.
    :returns: list of ``href`` strings, possibly empty.
    :raises requests.RequestException: propagated from ``requests.get`` when
        the search request itself fails.
    """
    searchstring = u'"{}" {} {}'.format(keywords, developer, u'interview game')
    # Let requests URL-encode the query instead of the Python-2-only
    # urllib.quote; the timeout tuple is (connect, read) seconds.
    response = requests.get(u'http://duckduckgo.com/html/',
                            params={'q': searchstring},
                            timeout=(9.1, 12.1))
    print(__message(u'DDG: {}'.format(searchstring)))
    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    links = []
    for node in soup.select('div.web-result'):
        if 'web-result-sponsored' in node['class']:
            continue
        try:
            href = node.select('a.large')[0].get('href')
        except IndexError as e:
            # Result block without the expected anchor: report and move on.
            print(__failure(e))
            continue
        if href:
            # A missing href would be None, which breaks the join below and
            # is useless to callers.
            links.append(href)
    if links:
        print(__success(u'DDG: {}'.format(searchstring)))
        print('\t\t\n'.join(links))
    return links

def scrape(url):
    """Fetch ``url`` and return the response body as text.

    Any exception raised by the request is reported (failure line plus the
    exception itself) and ``None`` is returned instead of raising.
    """
    try:
        page = requests.get(url, timeout=(10, 15))
    except Exception as error:
        print(__failure(u'Failed to load {}'.format(url)))
        print(error)
        return None
    else:
        return page.text
    
def read_json(path):
    """Read the UTF-8 JSON file at ``path`` and return the decoded value.

    Prints a "Loaded <path>" message once the content has been parsed.
    """
    with io.open(path, 'r', encoding='utf-8') as source:
        raw = source.read()
        parsed = json.loads(raw)
        print(__message(u'Loaded {}'.format(path)))
    return parsed
    
def save_json(path, data):
    """Serialise ``data`` as indented JSON and write it to ``path`` as UTF-8.

    Serialisation happens before the file is opened, so a failure in
    ``json.dumps`` no longer leaves an already-truncated file behind.

    :param path: destination file path (overwritten).
    :param data: JSON-serialisable object.
    """
    output = json.dumps(data, indent=2, ensure_ascii=False)
    # On Python 2, dumps(ensure_ascii=False) may return a byte string when
    # the input holds no unicode objects; the text-mode file opened below
    # only accepts unicode. On Python 3 this check is a no-op.
    if isinstance(output, bytes):
        output = output.decode('utf-8')
    with io.open(path, 'w', encoding='utf-8') as f:
        f.write(output)
    print(__message(u'Written to {}'.format(path)))
        
def __success(text):
    """Build the success log line for ``text`` and return it UTF-8 encoded."""
    line = u'  (SUCC) {}'.format(text)
    return line.encode('utf-8')
    
def __failure(text):
    """Build the failure log line for ``text`` and return it UTF-8 encoded."""
    line = u'!!FAIL!! {}'.format(text)
    return line.encode('utf-8')
    
def __warning(text):
    """Build the warning log line for ``text`` and return it UTF-8 encoded."""
    line = u'??WARN?? {}'.format(text)
    return line.encode('utf-8')
    
def __message(text):
    """Build the informational log line for ``text`` and return it UTF-8 encoded."""
    line = u'   |MSG| {}'.format(text)
    return line.encode('utf-8')

In [None]:
# Scrape for links
# For every game, run a DuckDuckGo search and merge the de-duplicated result
# links into that game's 'Links' entry. The file is rewritten after each game
# so an interrupted run keeps what it has collected so far.
game_sources_path = os.path.join(os.getcwd(), 'data', 'game-sources.json')
game_meta = read_json(game_sources_path)

# Materialise the items: random.shuffle needs a real, indexable list, which
# dict.items() only is on Python 2.
shuffled_game_meta = list(game_meta.items())
random.shuffle(shuffled_game_meta)
for game, meta in shuffled_game_meta:
    # `meta` is the same dict object as game_meta[game], so updating it
    # updates what gets saved. Missing keys are treated as empty, matching
    # scrape_duckduckgo's optional `developer` argument.
    found_links = scrape_duckduckgo(game, meta.get('Developer', ''))
    meta['Links'] = list(set(meta.get('Links', []) + found_links))
    save_json(game_sources_path, game_meta)
    time.sleep(1)  # pause between consecutive search requests

In [5]:
# Load content in search results
# Download every link not yet present in the cached corpus and, when the page
# mentions one of the keywords below, store the text of its paragraphs.
game_meta = read_json(os.path.join(os.getcwd(), 'data', 'game-sources.json'))
cached = read_json(os.path.join(os.getcwd(), 'data', 'corpus.json'))

output = cached
for game, meta in game_meta.items():
    if game not in output:
        output[game] = {}
    print(__message(game))
    for url in meta['Links']:
        # Skip URLs that already have non-empty content in the cache.
        if url in output[game] and output[game][url]:
            continue
        html = scrape(url)
        if html is None:
            # scrape() has already reported the failure; without this guard
            # html.lower() below raises AttributeError and aborts the run.
            continue
        html_lower = html.lower()  # lower-case once, not once per keyword
        if any(word in html_lower for word in ['interview', 'mortem', 'review', 'history']):
            # Name the parser explicitly so results do not depend on which
            # optional parser libraries are installed.
            soup = bs4.BeautifulSoup(html, 'html.parser')
            content = soup.select('div > p') + soup.select('body > p')
            # .string is None for tags with more than one child, so only
            # paragraphs consisting of a single text node are kept.
            data = [c.string.strip() for c in content if c.string and c.string.strip()]
            output[game][url] = data
            print(__message(u'Scrapped {}'.format(url)))
            # Persist after every page so progress survives an interruption.
            save_json(os.path.join(os.getcwd(), 'data', 'corpus.json'), output)

   |MSG| Loaded /Users/spaxe/dev/roguelike-universe/data/game-sources.json
   |MSG| Loaded /Users/spaxe/dev/roguelike-universe/data/corpus.json
   |MSG| Dungeon
??WARN?? Failed to load http://kristinandcory.com/family_interview_2.html
!!FAIL!! HTTPConnectionPool(host='kristinandcory.com', port=80): Read timed out. (read timeout=15)
   |MSG| Falcon's Eye
??WARN?? Failed to load https://libregamewiki.org/Falcon%27s_Eye
!!FAIL!! [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:581)
   |MSG| Island of Kesmai
   |MSG| Torneko no Daiboken: Fushigi no Dungeon
   |MSG| Pokémon Mystery Dungeon: Explorers of Sky
   |MSG| Linley's Dungeon Crawl
   |MSG| TowerClimb
   |MSG| Teleglitch
   |MSG| Brogue
   |MSG| Weird Worlds: Return to Infinite Space
   |MSG| Scrapped http://www.reddit.com/r/weirdworlds/
   |MSG| Written to /Users/spaxe/dev/roguelike-universe/data/corpus.json
??WARN?? Failed to load http://steam-sale.com/oplata.php?id=1564869
!!FAIL!! ('Connection aborted.', gaierro

In [21]:
# Locate mentions of games
# Inputs: the set of known game titles, per-game source metadata, the scraped
# article corpus, and the set of titles to exclude as non-games.
data_dir = os.path.join(os.getcwd(), 'data')
game_LUT = set(read_json(os.path.join(data_dir, 'games.json')))
game_meta = read_json(os.path.join(data_dir, 'game-sources.json'))
game_articles = read_json(os.path.join(data_dir, 'corpus.json'))
not_games = set(read_json(os.path.join(data_dir, 'not-games.json')))

# Create a look up table for games: every title, and every alias listed under
# the optional 'AKA' key, maps to the canonical title.
roguelike_LUT = {}
for game, meta in game_meta.items():
    roguelike_LUT[game] = game
    roguelike_LUT.update((alias, game) for alias in meta.get('AKA', ()))

   |MSG| Loaded /Users/spaxe/dev/roguelike-universe/data/games.json
   |MSG| Loaded /Users/spaxe/dev/roguelike-universe/data/game-sources.json
   |MSG| Loaded /Users/spaxe/dev/roguelike-universe/data/corpus.json
   |MSG| Loaded /Users/spaxe/dev/roguelike-universe/data/not-games.json


In [35]:
# Look through the interview articles
# For each game, collect capitalised phrases from its articles and record
# which of them are known roguelikes (via roguelike_LUT) or other known
# games (via game_LUT, minus not_games).

# A token may be part of a name if it starts with an upper-case letter or
# digit followed by at least one more word/colon/apostrophe character, or if
# it consists solely of upper-case letters and dots. Compiled once; the raw
# string spells the same pattern as before without invalid-escape warnings.
_NAME_TOKEN = re.compile(r"^[A-Z0-9][\w:']*[\w:']|[A-Z\.]+$")
# Lower-case words allowed inside a name once it has started.
_NAME_JOINERS = ('the', 'of', 'no', 'to')


def _extract_name_candidates(article):
    """Return the runs of name-like tokens found in ``article``.

    ``article`` is iterated as a sequence of paragraph strings (presumably
    the list stored per URL in corpus.json - confirm against the corpus).
    A run ends at the first token that is neither name-like nor a joiner,
    and always at the end of its paragraph. Previously a pending run leaked
    into the next paragraph, and the final run of each article was dropped.
    """
    candidates = []
    for paragraph in article:
        words = []
        for token in paragraph.split():
            if _NAME_TOKEN.match(token) or (words and token in _NAME_JOINERS):
                words.append(token)
            elif words:
                candidates.append(u' '.join(words))
                words = []
        if words:
            # Flush the run that reaches the end of the paragraph.
            candidates.append(u' '.join(words))
    return candidates


roguelike_relations = {}
other_relations = {}
for game, articles in game_articles.items():
    roguelike_relations[game] = []
    other_relations[game] = []
    for url, article in articles.items():
        things = _extract_name_candidates(article)
        # Mentions of known roguelikes, normalised to their canonical title.
        roguelike_relations[game].extend(
            roguelike_LUT[s] for s in things if s in roguelike_LUT)
        # Mentions of other known games; single characters and pure numbers
        # are excluded.
        other_relations[game].extend(
            s for s in things
            if s in game_LUT and
            s not in not_games and
            s not in roguelike_LUT and
            len(s) > 1 and
            not s.isdigit())

# print("\n### ROGUELIKES ###\n")
# pprint.pprint(roguelike_relations, indent=2)
# print("\n### OTHER GAMES ###\n")
# pprint.pprint(other_relations, indent=2)

save_json(os.path.join(os.getcwd(), 'generated', 'roguelike-relations.json'), roguelike_relations)
save_json(os.path.join(os.getcwd(), 'generated', 'other-relations.json'), other_relations)

   |MSG| Written to /Users/spaxe/dev/roguelike-universe/generated/roguelike-relations.json
   |MSG| Written to /Users/spaxe/dev/roguelike-universe/generated/other-relations.json


In [12]:
# Construct influence network
# For every roguelike, keep its most frequently mentioned other roguelikes
# plus the most frequently mentioned non-roguelike games, then write the
# result and the release years of all mentioned games.

roguelike_relations = read_json(os.path.join(os.getcwd(), 'generated', 'roguelike-relations.json'))
other_relations = read_json(os.path.join(os.getcwd(), 'generated', 'other-relations.json'))
games_years = read_json(os.path.join(os.getcwd(), 'generated', 'games-years.json'))

TOP_RELATIONS = 5        # how many of the most common mentions to keep per list
MIN_OTHER_MENTIONS = 2   # non-roguelike games need at least this many mentions

roguelike_influence = {}
for roguelike, other_roguelikes in roguelike_relations.items():
    # Count mentions, ignoring a game's mentions of itself.
    roguelike_relation_counter = collections.Counter(
        name for name in other_roguelikes if name != roguelike)
    other_relation_counter = collections.Counter(
        name for name in other_relations[roguelike] if name != roguelike)

    # Roguelike mentions are kept regardless of their count...
    influences = [name for name, count
                  in roguelike_relation_counter.most_common(TOP_RELATIONS)]
    # ...while other games must reach the minimum mention count.
    influences.extend(name for name, count
                      in other_relation_counter.most_common(TOP_RELATIONS)
                      if count >= MIN_OTHER_MENTIONS)
    roguelike_influence[roguelike] = influences

#     print(u'{}\n{}\n{}\n'.format(roguelike, 
#                                    roguelike_relation_counter.most_common(3), 
#                                    other_relation_counter.most_common(3)))

# Every game mentioned anywhere. Chaining the two values() iterables works
# whether values() returns a list or a view; `+` only works on lists.
games_set_small = {name
                   for mentions in itertools.chain(roguelike_relations.values(),
                                                   other_relations.values())
                   for name in mentions}

games_years_small = {game: int(year) for game, year in games_years.items() if game in games_set_small}

print(games_years_small)

save_json(os.path.join(os.getcwd(), 'generated', 'relations.json'), roguelike_influence)
save_json(os.path.join(os.getcwd(), 'generated', 'games-years-small.json'), games_years_small)

   |MSG| Loaded /Users/spaxe/dev/roguelike-universe/generated/roguelike-relations.json
   |MSG| Loaded /Users/spaxe/dev/roguelike-universe/generated/other-relations.json
   |MSG| Loaded /Users/spaxe/dev/roguelike-universe/generated/games-years.json
{u"Dragon's Dogma": 2012, u'Torneko no Daiboken: Fushigi no Dungeon': 1993, u'Grand Theft Auto: Vice City': 2002, u'Hydra': 1990, u'Manhunt': 2003, u'Valkyrie Profile': 1999, u'Trivia': 1988, u'Larn': 1986, u'Sorcerer': 1983, u'Mutant Mudds': 2012, u'Apocalypse': 1994, u'UnReal World': 1992, u'BurgerTime': 1982, u'Odyssey: The Compleat Apventure': 1980, u'Ultima': 1981, u"Demon's Souls": 2009, u'Castle of the Winds': 1993, u'Encounter': 1984, u'Shoot': 1978, u'Roadwar Europa': 1987, u'Blade': 2000, u'Back to the Future': 1986, u'Calvin': 1987, u'Diablo III': 2012, u'Banshee': 1994, u'Battlefront': 1986, u'Obsidian': 1997, u'Star Control II': 1992, u'RPG Maker': 1997, u'Punishment': 2005, u'Magicka': 2011, u'Battleship': 1978, u'The Prisoner'