In [1]:
import pandas as pd
import os

In [2]:
file_path = os.path.join(os.getcwd(), "input", "articles.csv")
articles = pd.read_csv(file_path, parse_dates=[2])

In [5]:
articles.sort_values("date_published", ascending=True).head()

Unnamed: 0,headline_intro,headline,date_published,content
5864,"Auswanderer, die zurückkamen",Heim in die Alte Welt,2007-09-20 17:11:00,"<p>Zwischen 1830 und 1974 verließen 7,2 Millio..."
5865,Katastrophen,"Die Todesfahrt der ""Gustloff""",2008-01-29 18:54:00,<p>Dem Furor der Roten Armee versuchen im Wint...
5866,"""Dschungel von Calais""",Französische Polizei räumt Flüchtlingslager am...,2009-09-22 15:16:27,<p>Calais - Als die Polizei im Morgengrauen an...
5847,Flüchtlinge in Frankreich,"Keine Heimat, nirgends",2009-09-26 13:00:19,<p>Vor Ort wirkt der Einsatz wie ein durchschl...
5848,Abkommen mit Pristina,Bundesregierung will Tausende Kosovaren abschi...,2009-10-14 09:42:34,<p>München - Gut zehn Jahre nach dem Ende des ...


In [4]:
def contiguous(xs):
	'given a sequence of numbers, xs, return a list of contiguous sub-sequences'
	xs = iter(xs)
	rv = []
	temp = ()
	prev = None
	for x in xs:
		if prev and x > prev + 1:
			rv.append(temp)
			temp = ()
		temp += (x,)
		prev = x
	if temp:
		rv.append(temp)
	return rv
				
# test it out
assert contiguous([1,2,3,7,8,10,12]) == [(1,2,3), (7,8), (10,), (12,)]

In [6]:
from itertools import tee, islice
nwise = lambda xs,n=2: zip(*(islice(xs,idx,None) for idx,xs in enumerate(tee(xs,n))))

assert list(nwise([1,2,3,4,5,6])) == [(1,2), (2,3), (3,4), (4,5), (5,6)]

In [8]:
def nwise(iterable, n=2):
	iterables = tee(iterable, n) # make n copies of the original iterable
	temp = []
	for idx, it in enumerate(iterables): # loop over each one
		it = islice(it, idx, None) # advance the iterable by idx places
		temp.append(it)	
	# we now have n copies of the iterable, where each iterable is skips the first 0..n-1 values
	# for itertools.count(10), n=3, this will look like:
	#   [0,1,2,3,4, ...]
	#   [1,2,3,4,5, ...]
	#   [2,3,4,5,6, ...]
	# therefore, if we zip them together, we'll get an n-wise window of values
	#   -> [(0,1,2), (1,2,3), (2,3,4), ...]
	# since we're using izip here, the zipping occurs lazily
	#   which means this approach will work on infinite-length iterables
	# also, since we've only stepped ahead up to n-1 values in any of the
	#   tee-d iterators, we can guarantee that we won't use more than O(n)
	#   memory in buffering cost
	return zip(*temp) 

In [14]:
import itertools
import spacy
nlp = spacy.load('de')

def gen_items():
    yield (0, 'Text 0')
    yield (1, 'Text 1')
    yield (2, 'Text 2')

gen1, gen2 = itertools.tee(gen_items())
ids = (id_ for (id_, text) in gen1)
texts = (text for (id_, text) in gen2)
docs = nlp.pipe(texts)
for id_, doc in zip(ids, docs):
    print(id_, doc.text)

0 Text 0
1 Text 1
2 Text 2


In [15]:
def gen_iter():
    yield(0, "Hallo 0")
    yield(1, "Hallo 1")
    yield(2, "Hallo 2")

for (id, test) in gen_items():
    print(id, test)

0 Text 0
1 Text 1
2 Text 2


In [46]:
import newspaper

In [47]:
spon = newspaper.build("http://www.spiegel.de/thema/fluechtlinge/archiv-2018196.html", language="de", memoize_articles=False, fetch_images=False) 

In [48]:
spon.size()

1796

In [37]:
def download_parse_articles(articles):
    parsed_articles = []
    for article in articles:
        try:
            article.download()
            article.parse()
            parsed_articles.append(article)
        except:
            continue
    return parsed_articles

In [38]:
parsed_articles = download_parse_articles(spon.articles[::5])

Article `download()` failed with 404 Client Error: Not Found for url: http://www.spiegel.tv/videos/1466719-love-and-sex-in-india?utm_source=sponhp on URL http://www.spiegel.tv/videos/1466719-love-and-sex-in-india?utm_source=sponhp
Article `download()` failed with 404 Client Error: Not Found for url: http://www.spiegel.de/video/netflix-film-beasts-of-no-nation-video-1616501.html on URL http://www.spiegel.de/video/netflix-film-beasts-of-no-nation-video-1616501.html
Article `download()` failed with 404 Client Error: Not Found for url: http://www.spiegel.de/wissenschaft/mensch/zahlenraetsel-sudoku-der-logik-klassiker-fuer-jeden-tag-a-994416.html on URL http://www.spiegel.de/wissenschaft/mensch/zahlenraetsel-sudoku-der-logik-klassiker-fuer-jeden-tag-a-994416.html


In [42]:
len(parsed_articles)

353

In [43]:
for article in parsed_articles:
    print(article.title)

Versicherung: Check für Haftpflicht, Hausrat, Berufsunfähigkeit
Sudoku online kostenlos spielen
Benzinpreise aktuell: Die günstigste Tankstelle in Ihrer Nähe
Österreich Nationalratswahl 2017: Alle Ergebnisse im Überblick
Flüchtlingspolitik in Zahlen: Gibt es Fortschritt?
Atomkraft: Diese interaktive Karte zeigt das Strahlenrisiko
Pharmazahlungen an Ärzte: Nur jeder vierte Mediziner veröffentlicht Einnahmen
G20-Gipfel im Newsblog: Joachim Sauer will Melania Trump ins Klimazentrum schleppen
Wahlergebnis NRW 2017: Die Ergebnisse aus Nordrhein-Westfalen im Überblick
Sonntagsfrage: Umfragen zu Bundestagswahl, Landtagswahl, Europawahl
Bundesliga-Taktiktafeln: Pässe und Formationen der Bundesligateams
Bundespräsidentenwahl 2017: So wird der Bundespräsident gewählt
Strategiesimulation: Werden Sie Kanzler
Tsunami 2004 in Südost-Asien: Die große Flut
IS Islamischer Staat: Kalif Abu Bakr al-Baghdadi und die IS-Spitze
113 Flüchtlinge stranden in der Sahara
Datengrafiken: Was die Muster der Dresdne

In [53]:
import os
import time
import threading
import multiprocessing
 
NUM_WORKERS = 4
 
def only_sleep():
    """ Do nothing, wait for a timer to expire """
    print("PID: %s, Process Name: %s, Thread Name: %s" % (
        os.getpid(),
        multiprocessing.current_process().name,
        threading.current_thread().name)
    )
    time.sleep(1)
 
 
def crunch_numbers():
    """ Do some computations """
    print("PID: %s, Process Name: %s, Thread Name: %s" % (
        os.getpid(),
        multiprocessing.current_process().name,
        threading.current_thread().name)
    )
    x = 0
    while x < 10000000:
        x += 1

In [54]:
## Run tasks serially
start_time = time.time()
for _ in range(NUM_WORKERS):
    only_sleep()
end_time = time.time()
 
print("Serial time=", end_time - start_time)
 
# Run tasks using threads
start_time = time.time()
threads = [threading.Thread(target=only_sleep) for _ in range(NUM_WORKERS)]
[thread.start() for thread in threads]
[thread.join() for thread in threads]
end_time = time.time()
 
print("Threads time=", end_time - start_time)
 
# Run tasks using processes
start_time = time.time()
processes = [multiprocessing.Process(target=only_sleep()) for _ in range(NUM_WORKERS)]
[process.start() for process in processes]
[process.join() for process in processes]
end_time = time.time()
 
print("Parallel time=", end_time - start_time)

PID: 11758, Process Name: MainProcess, Thread Name: MainThread
PID: 11758, Process Name: MainProcess, Thread Name: MainThread
PID: 11758, Process Name: MainProcess, Thread Name: MainThread
PID: 11758, Process Name: MainProcess, Thread Name: MainThread
Serial time= 4.006238222122192
PID: 11758, Process Name: MainProcess, Thread Name: Thread-185
PID: 11758, Process Name: MainProcess, Thread Name: Thread-186
PID: 11758, Process Name: MainProcess, Thread Name: Thread-187
PID: 11758, Process Name: MainProcess, Thread Name: Thread-188
Threads time= 1.0030782222747803
PID: 11758, Process Name: MainProcess, Thread Name: MainThread
PID: 11758, Process Name: MainProcess, Thread Name: MainThread
PID: 11758, Process Name: MainProcess, Thread Name: MainThread
PID: 11758, Process Name: MainProcess, Thread Name: MainThread
Parallel time= 4.101713418960571


In [56]:
article_url = '/politik/ausland/portugal-das-eu-land-das-mehr-fluechtlinge-aufnehmen-will-a-1219192.html'
"".join(["http://www.spiegel.de", article_url])

'http://www.spiegel.de/politik/ausland/portugal-das-eu-land-das-mehr-fluechtlinge-aufnehmen-will-a-1219192.html'