In [61]:
# coding:utf-8
# Textual analysis through the open web
import io
import os
import re
import bs4
import sys
import json
import time
import nltk
import urllib
import pprint
import random
import string
import requests
import wikipedia
import itertools
import collections

def scrape_wiki(title):
    """Return the Wikipedia summary for ``title``, or a placeholder string.

    Lookups are attempted in this order:

    1. ``wikipedia.summary(title)``.
    2. If that raised ``PageError``: the exact title with
       ``auto_suggest=False``.
    3. If a disambiguation was hit: ``"<title> (video game)"`` and then
       ``"<title> (Unix video game)"`` (spaces replaced by underscores),
       both with ``auto_suggest=False``.

    Every outcome is reported on stdout via the module's message helpers.

    Args:
        title: The title to look up.

    Returns:
        The summary text of the first lookup that succeeds, otherwise the
        string ``'No summary found on Wikipedia.'``.
    """
    not_found = 'No summary found on Wikipedia.'
    searchstring = title

    try:
        summary = wikipedia.summary(searchstring)
    except wikipedia.DisambiguationError:
        ambiguous = True
    except wikipedia.PageError:
        ambiguous = False
    else:
        print(__success(searchstring))
        return summary

    if not ambiguous:
        # Retry the exact title without auto-suggestion.
        try:
            summary = wikipedia.page(searchstring, auto_suggest=False).summary
        except wikipedia.PageError:
            print(__warning(
                u'Search term "{}" returned nothing'.format(searchstring)))
            return not_found
        except wikipedia.DisambiguationError:
            # Previously this escaped as an unhandled exception; treat it
            # like any other disambiguation and try the suffixed titles.
            pass
        else:
            print(__success(searchstring))
            return summary

    for suffix in (u'video game', u'Unix video game'):
        searchstring = u'{} ({})'.format(title, suffix).replace(u' ', u'_')
        try:
            summary = wikipedia.page(searchstring, auto_suggest=False).summary
        except wikipedia.DisambiguationError:
            # Still ambiguous: move on to the next, more specific suffix.
            continue
        except wikipedia.PageError:
            # A missing page ends the search (the original flow did not try
            # further suffixes after a PageError either).
            break
        else:
            print(__success(searchstring))
            return summary

    # Reached when a suffixed page is missing or when every suffix was still
    # ambiguous. The latter used to raise out of the function.
    print(__warning(u'Wikipedia cannot find "{}"'.format(searchstring)))
    return not_found

def scrape_duckduckgo(keywords):
    """Search DuckDuckGo's HTML endpoint and return the result links.

    Args:
        keywords: The search query; it is UTF-8 encoded and URL-quoted
            before being placed in the request URL.

    Returns:
        A list of ``href`` values, one per non-sponsored ``div.web-result``
        node that contains an ``a.large`` anchor with an ``href``. The list
        is empty when nothing usable was found.

    Raises:
        Whatever ``requests.get`` raises; request errors are not caught here.
    """
    query = urllib.quote(keywords.encode('utf-8'))
    response = requests.get(
        u'http://duckduckgo.com/html/?q={}'.format(query),
        timeout=(9.1, 12.1),
    )
    print(__message(u'DDG: {}'.format(keywords)))
    soup = bs4.BeautifulSoup(response.text)
    links = []
    for node in soup.select('div.web-result'):
        if 'web-result-sponsored' in node['class']:
            continue
        try:
            href = node.select('a.large')[0].get('href')
        except IndexError as e:
            # The result block has no 'a.large' anchor at all.
            print(__failure(e))
            continue
        if not href:
            # .get() yields None when the anchor has no href; storing that
            # would later break callers that treat every link as a string.
            print(__warning(u'DDG: result without href skipped'))
            continue
        links.append(href)
    if links:
        print(__success(u'DDG: {}'.format(keywords)))
    return links

def scrape(url):
    """Download ``url`` and return the response body as text.

    Any exception raised by the request is reported on stdout (a failure
    line followed by the exception itself) and ``None`` is returned instead
    of propagating the error.
    """
    try:
        page = requests.get(url, timeout=(10, 15))
    except Exception as err:
        notice = u'Failed to load {}'.format(url)
        print(__failure(notice))
        print(err)
        return None
    else:
        return page.text
    
def load_json(path):
    """Read the UTF-8 file at ``path`` and return its parsed JSON content.

    An empty file yields an empty dict. A message naming the loaded path is
    printed once the content has been read and parsed.
    """
    with io.open(path, 'r', encoding='utf-8') as handle:
        raw = handle.read()
        # An empty file is treated as "no data yet" rather than a parse error.
        parsed = json.loads(raw) if raw else {}
        print(__message(u'Loaded {}'.format(path)))
    return parsed
    
def save_json(path, data):
    """Serialise ``data`` as indented JSON and write it to ``path`` as UTF-8.

    Args:
        path: Destination file; it is created or overwritten.
        data: Any object ``json.dumps`` can serialise.

    The document is serialised before the file is opened, so a
    serialisation error does not truncate an existing file. A message
    naming the written path is printed afterwards.
    """
    output = json.dumps(data, indent=2, ensure_ascii=False)
    # The file is opened in text mode and therefore only accepts unicode
    # text. With ensure_ascii=False, Python 2's json.dumps may return a byte
    # string; decode it up front. (The previous fallback *encoded* the
    # output, which hands bytes to a text-mode file and can only fail.)
    if isinstance(output, bytes):
        output = output.decode('utf-8')
    with io.open(path, 'w', encoding='utf-8') as f:
        f.write(output)
    print(__message(u'Written to {}'.format(path)))
        
def __success(text):
    """Format ``text`` as a success line and return it UTF-8 encoded."""
    line = u'  (SUCC) {}'.format(text)
    return line.encode('utf-8')
    
def __failure(text):
    """Format ``text`` as a failure line and return it UTF-8 encoded."""
    line = u'!!FAIL!! {}'.format(text)
    return line.encode('utf-8')
    
def __warning(text):
    """Format ``text`` as a warning line and return it UTF-8 encoded."""
    line = u'??WARN?? {}'.format(text)
    return line.encode('utf-8')
    
def __message(text):
    """Format ``text`` as an informational line and return it UTF-8 encoded."""
    line = u'   |MSG| {}'.format(text)
    return line.encode('utf-8')

In [63]:
# Load the stored corpus (a mapping of URL -> extracted paragraph text) and
# register every new search hit with an empty placeholder.
corpus = load_json('corpus-made-because.json')
for link in scrape_duckduckgo('"I made this game because"'):
    if link not in corpus:
        corpus[link] = u""

# Drop every link whose URL ends in 'pdf'.
corpus = {k: v for k, v in corpus.items() if not k.endswith('pdf')}

# (Re)download every page and keep the text of its <p> elements.
for i, link in enumerate(corpus):
    print(u'{} {}'.format(i, link).encode('utf-8'))
    html = scrape(link)
    if html is None:
        # scrape() has already reported the failure. Keep whatever text is
        # stored for this link instead of handing None to BeautifulSoup.
        continue
    soup = bs4.BeautifulSoup(html)
    corpus[link] = u'\n'.join(u' '.join(s.stripped_strings)
                              for s in soup.select('p'))

save_json('corpus-made-because.json', corpus)

# Joining with a unicode separator guarantees unicode text, which is what the
# text-mode file requires. The old bare-except fallback wrote encoded bytes
# to the same text-mode handle and could only fail.
text = u'\n\n'.join(corpus.values())
with io.open('corpus-made-because.txt', 'w', encoding='utf-8') as f:
    f.write(text)

   |MSG| Loaded corpus-made-because.json
   |MSG| DDG: "I made this game because"
  (SUCC) DDG: "I made this game because"
0 http://www.newgrounds.com/portal/view/482263
1 http://www.kogama.com/games/profile/1819679/
2 http://de.roblox.com/PlaceItem.aspx?seoname=RMS-Titanic&id=116609246
3 http://www.crashtastic.com/?p=190
4 http://www.newgrounds.com/portal/search/games/lego
5 http://steamcommunity.com/sharedfiles/filedetails/?id=214144938
6 http://www.kogama.com/games/profile/817665/
7 http://gamejolt.com/games/arcade/just-survive/19780/
8 http://www.madcoatgames.com/2014/02/its-official-bad-blaster-for-android.html
9 https://timf12.wordpress.com/
10 http://www.youtube.com/watch?v=iB8Jgt9v0mg
11 http://www.sploder.com/games/members/mjduniverse/play/a-3-year-old-who-robbed-a-bank-3/
12 https://timf12.wordpress.com/2011/04/01/pong-on-scratch/
13 http://www.mfgg.net/?act=resdb&param=02&c=2&id=27783
14 http://www.sploder.com/games/members/samart/play/why-should-we-have-asploder-account/
15