In [15]:
# coding:utf-8
# Textual analysis through the open web
from __future__ import print_function
import io
import os
import re
import bs4
import sys
import json
import time
import nltk
import urllib
import pprint
import random
import string
import requests
import wikipedia
import itertools
import collections

In [17]:
def scrape_wiki(title):
    """Return the Wikipedia summary for ``title``, or a placeholder string.

    Lookup order:

    1. ``wikipedia.summary(title)``.
    2. If that is ambiguous, the page ``"<title> (video game)"`` and, only if
       that is ambiguous as well, ``"<title> (Unix video game)"`` (spaces
       replaced by underscores, auto-suggest disabled).
    3. If step 1 finds no page, ``title`` is retried with auto-suggest
       disabled.

    The outcome is reported on stdout via ``__success`` / ``__warning``.

    :param title: game title to look up.
    :returns: the page summary, or ``'No summary found on Wikipedia.'`` when
        every attempt fails.
    """
    not_found = 'No summary found on Wikipedia.'
    searchstring = title
    try:
        summary = wikipedia.summary(searchstring)
    except wikipedia.DisambiguationError:
        summary = None
        for qualifier in (u'video game', u'Unix video game'):
            searchstring = u'{} ({})'.format(title, qualifier).replace(u' ', u'_')
            try:
                summary = wikipedia.page(searchstring, auto_suggest=False).summary
                break
            except wikipedia.DisambiguationError:
                # Still ambiguous: move on to the next, narrower qualifier.
                continue
            except wikipedia.PageError:
                # No such page: give up, as the original did for the first
                # qualifier. Previously a PageError on the second qualifier
                # escaped because it was raised inside an except handler.
                break
        if summary is None:
            print(__warning(u'Wikipedia cannot find "{}"'.format(searchstring)))
            return not_found
    except wikipedia.PageError:
        try:
            summary = wikipedia.page(searchstring, auto_suggest=False).summary
        except (wikipedia.PageError, wikipedia.DisambiguationError):
            # The exact-title retry can be ambiguous as well as missing.
            print(__warning(u'Search term "{}" returned nothing'.format(searchstring)))
            return not_found
    print(__success(searchstring))
    return summary

def scrape_duckduckgo(keywords, developer=""):
    """Search DuckDuckGo's HTML endpoint and return the result links.

    The query is ``"<keywords>" AND <developer> AND game AND (interview OR
    mortem OR history OR develop)``; the developer clause is left out when
    ``developer`` is empty, so no dangling ``AND  AND`` is sent.

    :param keywords: phrase to search for (quoted in the query).
    :param developer: optional developer name added as an extra clause.
    :returns: list of ``href`` values taken from the non-sponsored
        ``div.web-result`` nodes; empty when nothing was found.
    """
    clauses = [u'"{}"'.format(keywords)]
    if developer:
        clauses.append(u'{}'.format(developer))
    clauses.extend([u'game', u'(interview OR mortem OR history OR develop)'])
    searchstring = u' AND '.join(clauses)

    # Let requests encode the query string: urllib.quote only exists on
    # Python 2, whereas ``params`` works on both major versions.
    response = requests.get(u'http://duckduckgo.com/html/',
                            params={'q': searchstring},
                            timeout=(9.1, 12.1))
    print(__message(u'DDG: {}'.format(searchstring)))

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    links = []
    for node in soup.select('div.web-result'):
        if 'web-result-sponsored' in node['class']:
            continue
        anchors = node.select('a.large')
        if not anchors:
            print(__failure(u'DDG result without a link for: {}'.format(searchstring)))
            continue
        href = anchors[0].get('href')
        if href:
            # Anchors without an href would otherwise put None in the list.
            links.append(href)
    if links:
        print(__success(u'DDG: {}'.format(searchstring)))
        print('\t\t\n'.join(links))
    return links

def scrape(url):
    """Download ``url`` and return the response body as text.

    Any exception raised while announcing or requesting the page is reported
    on stdout and converted into a ``None`` return value.
    """
    try:
        announcement = u'Scraping {} ...'.format(url)
        print(__message(announcement))
        page = requests.get(url, timeout=(9.1, 12.1))
    except Exception as error:
        print(__failure(u'Failed to load {}'.format(url)))
        print(error)
        return None
    else:
        return page.text
    
def read_json(path):
    """Parse the UTF-8 JSON file at ``path`` and return the decoded value.

    A confirmation line is printed once the document has been parsed.
    """
    with io.open(path, 'r', encoding='utf-8') as handle:
        parsed = json.load(handle)
        print(__message(u'Loaded {}'.format(path)))
    return parsed
    
def save_json(path, data):
    """Serialise ``data`` as indented JSON and write it to ``path`` as UTF-8.

    The document is serialised *before* the file is opened, so a failure in
    ``json.dumps`` no longer leaves ``path`` truncated.

    :param path: destination file path.
    :param data: JSON-serialisable value.
    """
    output = json.dumps(data, indent=2, ensure_ascii=False)
    # On Python 2, ensure_ascii=False may return a byte string, which a
    # text-mode io file refuses to write; normalise to text first.
    if isinstance(output, bytes):
        output = output.decode('utf-8')
    with io.open(path, 'w', encoding='utf-8') as f:
        f.write(output)
    print(__message(u'Written to {}'.format(path)))
    
def encode_english(sentence):
    """Tokenise ``sentence`` and tag every token with a universal POS tag.

    Tokens come from ``nltk.word_tokenize``, are tagged by ``nltk.pos_tag``,
    and each tag is mapped from the ``'en-ptb'`` to the ``'universal'``
    tagset.

    :returns: list of ``(word, universal_tag)`` tuples.
    """
    ptb_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    result = []
    for word, ptb_tag in ptb_tagged:
        result.append((word, nltk.map_tag('en-ptb', 'universal', ptb_tag)))
    return result
        
def __success(text):
    """Return ``text`` behind the success tag, encoded as UTF-8 bytes."""
    tagged = u'  (SUCC) {}'.format(text)
    return tagged.encode('utf-8')
    
def __failure(text):
    """Return ``text`` behind the failure tag, encoded as UTF-8 bytes."""
    tagged = u'!!FAIL!! {}'.format(text)
    return tagged.encode('utf-8')
    
def __warning(text):
    """Return ``text`` behind the warning tag, encoded as UTF-8 bytes."""
    tagged = u'??WARN?? {}'.format(text)
    return tagged.encode('utf-8')
    
def __message(text):
    """Return ``text`` behind the message tag, encoded as UTF-8 bytes."""
    tagged = u'   |MSG| {}'.format(text)
    return tagged.encode('utf-8')

In [38]:
# Extract themes
# Items of interest, of genre, of identification
# Emotions of joy, sadness, frustrations
# Memory recall? Specific sentences or sentiments
#
# For every game in the corpus, tag all stored sentences and keep a frequency
# distribution of the (word, tag) pairs whose universal tag is ADJ.
corpus = read_json(os.path.join(os.getcwd(), 'data', 'corpus.json'))
distributions = {}

for game, sites in corpus.items():
    print(game)
    tagged_words = []
    for sentences in sites.values():
        for sentence in sentences:
            tagged_words.extend(encode_english(sentence))
    distributions[game] = nltk.FreqDist(
        pair for pair in tagged_words if pair[1] == u'ADJ')

   |MSG| Loaded /Users/spaxe/dev/roguelike-universe/data/corpus.json
Dungeon
Falcon's Eye
Island of Kesmai
Torneko no Daiboken: Fushigi no Dungeon
Pokémon Mystery Dungeon: Explorers of Sky
Linley's Dungeon Crawl
TowerClimb
Teleglitch
Brogue
Weird Worlds: Return to Infinite Space
Advanced Dungeons & Dragons: Cloudy Mountain
Dungeon Hack
Dungeon of Doom
Slash'EM
Beneath Apple Manor
Torneko: The Last Hope
Smart Kobold
Larn
Lost Labyrinth
Tower of Guns
UnReal World
Fatal Labyrinth
Dragon Quest: Shonen Yangus to Fushigi no Dungeon
Wizards Encounters
Rogue
Rogue Legacy
The Binding of Isaac
Dark Chronicle
Mystery Dungeon: Shiren the Wanderer
Risk of Rain
Ancient Domains of Mystery
Adventure
Sword of Fargoal
Pokémon Mystery Dungeon: Explorers of Darkness
Sword of the Stars: The Pit
Castle of the Winds
Nightmare of Druaga: Fushigino Dungeon
FTL: Faster Than Light
The Guided Fate Paradox
Angband
Shiren the Wanderer
Mission Thunderbolt
Moraff's World
Scarab of Ra
Z.H.P. Unlosing Ranger VS Darkdea

In [42]:
# Print the ten most frequent adjectives recorded for each game.
for game, dist in distributions.items():
    top_adjectives = u', '.join(pair[0] for pair, _count in dist.most_common(10))
    print(u'{}: {}'.format(game, top_adjectives))

Dungeon: new, other, good, more, first, great, many, I’m, most, best
Falcon's Eye: other, new, many, Slash'EM, first, graphical, long, more, different, much
Island of Kesmai: other, new, many, such, different, most, advanced, more, special, first
Torneko no Daiboken: Fushigi no Dungeon: few, more, other, we'll, new
Pokémon Mystery Dungeon: Explorers of Sky: Pokémon, other, regular, few, special, new, various, previous, such, human
Linley's Dungeon Crawl: such, popular, entire, more, many, new, obvious, large, different, free
TowerClimb: new, many, more, other, top, own, first, most, great, social
Teleglitch: other, I’d, new, next, same, least, I’ve, few, later, more
Brogue: 26th, least, other, ranged, skillful, first, few, collaborative, v1.2:, full
Weird Worlds: Return to Infinite Space: new, other, more, full, same, black, less, real-time, best, turn-based
Advanced Dungeons & Dragons: Cloudy Mountain: other, original, new, first, many, top, different, more, same, good
Dungeon Hack: n

In [None]:
# Testing
# Manual smoke test: call scrape_wiki with an explicit, underscore-separated
# page title; the returned summary is the value of this cell.
scrape_wiki(u"Dungeon_(video_game)")

In [None]:
# Scrape for links
# For every game (in random order) query DuckDuckGo, merge the hits into the
# game's 'Links' list without duplicates, and checkpoint the file after each
# game so an interrupted run keeps its progress.
sources_path = os.path.join(os.getcwd(), 'data', 'game-sources.json')
game_meta = read_json(sources_path)

# dict.items() is a view on Python 3 and cannot be shuffled in place, so take
# a list copy first (a no-op difference on Python 2).
shuffled_game_meta = list(game_meta.items())
random.shuffle(shuffled_game_meta)
for game, meta in shuffled_game_meta:
    # ``meta`` is the same dict object stored in game_meta, so updating it
    # updates what gets saved. Missing/null keys fall back to empty values
    # instead of aborting the whole run.
    found_links = scrape_duckduckgo(game, meta.get('Developer') or '')
    meta['Links'] = list(set((meta.get('Links') or []) + found_links))
    save_json(sources_path, game_meta)
    time.sleep(2)  # pause between queries

In [None]:
# Load content in search results
# Download every not-yet-cached link of every game, keep pages that mention
# one of the keywords below, and store the stripped text of their <p>
# elements in the corpus. The corpus file is rewritten after each stored page.
sources_path = os.path.join(os.getcwd(), 'data', 'game-sources.json')
corpus_path = os.path.join(os.getcwd(), 'data', 'corpus.json')
game_meta = read_json(sources_path)
cached = read_json(corpus_path)

relevance_keywords = ('interview', 'mortem', 'review', 'history', 'develop', 'idea', 'inspir')

output = cached
for game, meta in game_meta.items():
    if game not in output:
        output[game] = {}
    print(__message(game))
    for url in meta['Links']:
        # Skip null/empty links (they would crash on .endswith), links that
        # are already cached, and PDFs regardless of letter case.
        if not url or url in output[game] or url.lower().endswith('pdf'):
            continue
        html = scrape(url)
        if not html:
            continue
        lowered_html = html.lower()  # lower-case once, not once per keyword
        if not any(word in lowered_html for word in relevance_keywords):
            continue
        # Name the parser explicitly so results do not depend on which
        # optional parsers are installed.
        soup = bs4.BeautifulSoup(html, 'html.parser')
        content = soup.select('div > p') + soup.select('body > p')
        # NOTE(review): ``.string`` is None for a <p> with more than one
        # child node, so such paragraphs are skipped here - confirm that
        # this is intended.
        data = [c.string.strip() for c in content if c.string and c.string.strip()]
        output[game][url] = data
        print(__message(u'Scrapped {}'.format(url)))
        save_json(corpus_path, output)

In [None]:
# Locate mentions of games
data_dir = os.path.join(os.getcwd(), 'data')
game_LUT = set(read_json(os.path.join(data_dir, 'games.json')))
game_meta = read_json(os.path.join(data_dir, 'game-sources.json'))
game_articles = read_json(os.path.join(data_dir, 'corpus.json'))
not_games = set(read_json(os.path.join(data_dir, 'not-games.json')))

# Create a look up table for games: every canonical title maps to itself and
# every alias listed under 'AKA' maps to its canonical title.
roguelike_LUT = {}
for game, meta in game_meta.items():
    roguelike_LUT[game] = game
    roguelike_LUT.update((aka, game) for aka in meta.get('AKA', ()))

In [None]:
# Look through the interview articles
# Collect runs of title-like tokens from every paragraph and look them up in
# the roguelike table (roguelike_LUT) and the general game list (game_LUT).

# A token is title-like when it starts with a capital letter or digit followed
# by at least one more word/colon/apostrophe character, or when it consists
# only of capital letters and dots. Compiled once instead of once per token;
# the raw string avoids invalid-escape warnings and is the same pattern.
title_token = re.compile(r"^[A-Z0-9][\w:']*[\w:']|[A-Z\.]+$")
# Lower-case words allowed inside a name once it has started.
connectors = ('the', 'of', 'no', 'to')

roguelike_relations = {}
other_relations = {}
for game, articles in game_articles.items():
    roguelike_relations[game] = []
    other_relations[game] = []
    for url, article in articles.items():
        things = []
        for paragraph in article:
            current = u''
            for token in paragraph.split():
                if title_token.match(token) or (current and token in connectors):
                    current += u'{} '.format(token)
                elif current:
                    things.append(current.strip())
                    current = u''
            # Flush a name that ends the paragraph. Previously it was carried
            # into the next paragraph (gluing unrelated words together) or
            # dropped at the end of the article.
            if current:
                things.append(current.strip())
        # NOTE(review): a candidate can still end in a connector word
        # (e.g. "Rogue to") and then miss the lookup - consider trimming.
        roguelike_relations[game].extend(
            roguelike_LUT[s] for s in things if s in roguelike_LUT)
        other_relations[game].extend(
            s for s in things
            if s in game_LUT
            and s not in not_games
            and s not in roguelike_LUT
            and len(s) > 1
            and not s.isdigit())

save_json(os.path.join(os.getcwd(), 'generated', 'roguelike-relations.json'), roguelike_relations)
save_json(os.path.join(os.getcwd(), 'generated', 'other-relations.json'), other_relations)

In [None]:
# Construct influence network
# For each roguelike keep its five most-mentioned other roguelikes plus those
# of its five most-mentioned other games that were mentioned more than once.

generated_dir = os.path.join(os.getcwd(), 'generated')
roguelike_relations = read_json(os.path.join(generated_dir, 'roguelike-relations.json'))
other_relations = read_json(os.path.join(generated_dir, 'other-relations.json'))
games_years = read_json(os.path.join(generated_dir, 'games-years.json'))

roguelike_influence = {}
for roguelike, other_roguelikes in roguelike_relations.items():
    # Count mentions, ignoring a game's mentions of itself.
    roguelike_counts = collections.Counter(
        name for name in other_roguelikes if name != roguelike)
    # .get keeps the loop alive if the two relation files ever diverge.
    other_counts = collections.Counter(
        name for name in other_relations.get(roguelike, []) if name != roguelike)

    influences = [name for name, _count in roguelike_counts.most_common(5)]
    influences.extend(
        name for name, count in other_counts.most_common(5) if count > 1)
    roguelike_influence[roguelike] = influences

# Chain the two value views instead of adding them: ``values() + values()``
# only works on Python 2, where values() returns a list.
games_set_small = set(itertools.chain.from_iterable(
    itertools.chain(roguelike_relations.values(), other_relations.values())))

# NOTE(review): only games that are *mentioned* somewhere get a year here; a
# roguelike that is never mentioned by another article is left out - confirm
# that the consumer of games-years-small.json expects this.
games_years_small = {game: int(year) for game, year in games_years.items() if game in games_set_small}

print(games_years_small)

save_json(os.path.join(generated_dir, 'relations.json'), roguelike_influence)
save_json(os.path.join(generated_dir, 'games-years-small.json'), games_years_small)