-
Notifications
You must be signed in to change notification settings - Fork 2.5k
Closed
Description
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random
from multiprocessing import Process
import os
import time
# Paths (e.g. '/wiki/Kevin_Bacon') of articles already scraped.
# NOTE: every multiprocessing.Process works on its own copy of this list, so
# it only tracks pages visited by the current process.
visited = []


def get_links(bs):
    """Return the article links on a page that have not been visited yet.

    Args:
        bs: Parsed page (BeautifulSoup object) to search. Only anchors inside
            the ``div`` with id ``bodyContent`` are considered, and only those
            whose ``href`` starts with ``/wiki/`` and contains no colon.

    Returns:
        A list of the matching anchor tags whose ``href`` is not in
        ``visited``.
    """
    print('Getting links in {}'.format(os.getpid()))
    body = bs.find('div', {'id': 'bodyContent'})
    links = body.find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))
    # ``visited`` holds path strings while ``links`` holds tag objects, so the
    # comparison has to be made on each tag's href; comparing the tag itself
    # against the strings never matches and would filter nothing out.
    seen = set(visited)
    return [link for link in links if link.attrs['href'] not in seen]
def scrape_article(path):
    """Random-walk Wikipedia starting from ``path``.

    For each page: record its path in ``visited``, download and parse it,
    pause three seconds, print its title together with the current process
    id, then continue with a randomly chosen link returned by ``get_links``.
    The walk ends when a page yields no links.

    Args:
        path: Site-relative article path such as ``'/wiki/Kevin_Bacon'``.
    """
    # Iterative rather than recursive: one stack frame per visited page would
    # eventually exceed Python's recursion limit on a long crawl.
    while True:
        visited.append(path)
        # The context manager closes the HTTP response once it has been parsed.
        with urlopen('http://en.wikipedia.org{}'.format(path)) as html:
            bs = BeautifulSoup(html, 'html.parser')
        time.sleep(3)  # throttle requests between pages
        title = bs.find('h1').get_text()
        print('Scraping {} in process {}'.format(title, os.getpid()))
        links = get_links(bs)
        if not links:
            return
        path = random.choice(links).attrs['href']
        print(path)
# Launch one crawler process per starting article.
# NOTE: this runs at import time with no `if __name__ == '__main__':` guard;
# this is the form that produced the RuntimeError quoted below.
processes = [
    Process(target=scrape_article, args=(start_path,))
    for start_path in ('/wiki/Kevin_Bacon', '/wiki/Monty_Python')
]
for p in processes:
    p.start()
The following error occurred while running it:
RuntimeError:
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.
This probably means that you are not using fork to start your
child processes and you have forgotten to use the proper idiom
in the main module:
    if __name__ == '__main__':
        freeze_support()
        ...
The "freeze_support()" line can be omitted if the program
is not going to be frozen to produce an executable.
I changed the code to this:
# Inserted `if __name__ == '__main__':` so the workers are only started when
# this file is run as the main script, not when it is imported.
if __name__ == '__main__':
    processes = [
        Process(target=scrape_article, args=(start_path,))
        for start_path in ('/wiki/Kevin_Bacon', '/wiki/Monty_Python')
    ]
    for p in processes:
        p.start()
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels