In [12]:
import dask.bag as db
import requests
def stream_from_url(url):
    with requests.get(url, stream=True) as response:
        for line in response.iter_lines():
            yield line.decode('utf-8')

url = "https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt"
bag = db.from_sequence(stream_from_url(url))

In [13]:
print(bag.take(10))

("LO! Death hath rear'd himself a throne", 'In a strange city, all alone,', 'Far down within the dim west')




In [14]:
import dask.bag as db
import dask
import requests

@dask.delayed
def load_url(url):
    with requests.get(url, stream=True) as r:
        return [line.decode('utf-8') for line in r.iter_lines() if line]

url = "https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt"
delayed_bag = load_url(url)
bag = db.from_delayed([delayed_bag])  # Convert the delayed object to a Dask Bag

# Now you can operate on the bag as you normally would
result = bag.filter(lambda x: "fair" in x).compute()

In [15]:
print(bag.take(10))
print(result)

("LO! Death hath rear'd himself a throne", 'In a strange city, all alone,', 'Far down within the dim west', 'Where the good, and the bad, and the worst, and the best,', 'Have gone to their eternal rest.', '\u2009', 'There shrines, and palaces, and towers', 'Are not like any thing of ours', 'Oh no! O no! ours never loom', 'To heaven with that ungodly gloom!')
['Fair isle, that from the fairest of all flowers,', 'How fairy-like a melody there floats ', 'Once fair and stately palace --', 'Over fabric half so fair.', 'Was the fair palace door,']


In [16]:
def stream_lines(url, lines_per_chunk=100):
    buffer = []
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        for line in r.iter_lines():
            decoded_line = line.decode('utf-8')
            buffer.append(decoded_line)
            if len(buffer) == lines_per_chunk:
                yield buffer
                buffer = []
        if buffer:  # handle any remaining lines
            yield buffer

In [21]:
bag_of_lines = db.from_sequence(stream_lines(url))

# Let's say you have a function to process the chunks
def process_chunk(chunk, label):
    # Do something with the chunk, e.g., return its length
    bc = BayesianCorpus(chunk, label)
    return len(chunk)

# Map the function over the bag
label = "poe"
result_bag = bag_of_lines.map(lambda chunk: process_chunk(chunk, label))

# Now, whenever you want to actually compute the results
results = result_bag.compute()
print(results)

[100, 100, 100, 100, 100, 100, 100, 97]


In [22]:
from bayesian_classifier.poems import BayesianCorpus, get_confusion_matrix, combine_vocabs