-
Notifications
You must be signed in to change notification settings - Fork 0
/
BDSspider_AllCodes.py
42 lines (33 loc) · 1.38 KB
/
BDSspider_AllCodes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
"""
Scrapy spider for extracting codes from all Birds of the World pages
by scraping URLs from the "Browse Taxonomy" page.
As written, it produces a text file of all the HTML lines containing URLs, which is great,
but it also includes the links for families (not just species).
Not sure how to fix that yet...
"""
import scrapy
from scrapy.http import FormRequest
from scrapy.http import Request
class AllCodesSpider(scrapy.Spider):
    """Log in through the Birds of the World proxy and dump taxonomy links.

    The crawl starts at the proxy login page, submits the login form with
    credentials typed at the console, then requests the ``/bow/species`` page
    and appends every matching ``<a>`` element (as raw HTML, one per line)
    to ``allurls.txt`` in the current working directory.
    """

    name = "allcodes"
    start_urls = [
        "https://login.proxy.birdsoftheworld.org/login",
    ]

    def parse(self, response):
        """Prompt for credentials and submit the login form found in *response*.

        Returns:
            A ``FormRequest`` built from the form in *response*; its response
            is handed to :meth:`after_login`.
        """
        # NOTE(review): input() echoes the password on the terminal. Kept as-is
        # so that credentials piped through stdin keep working.
        aosuser = input("Enter username: ")
        password = input("Enter password: ")
        return FormRequest.from_response(
            response,
            formdata={
                # The form receives the username twice: once with the "AOS-"
                # prefix ("user") and once as typed ("aosuser").
                "user": "AOS-" + aosuser,
                "url": "https://birdsoftheworld.org",
                "aosuser": aosuser,
                "pass": password,
            },
            callback=self.after_login,
        )

    def after_login(self, response):
        """Request the species listing page once the login response arrives.

        The login response itself is not inspected, so a failed login is not
        detected here.
        """
        yield Request(
            url="https://birdsoftheworld-org.proxy.birdsoftheworld.org/bow/species",
            callback=self.action,
        )

    def action(self, response):
        """Append every matching anchor element of *response* to ``allurls.txt``.

        An anchor matches when its ``href`` contains ``/bow/species/`` and its
        ``class`` attribute is exactly ``notranslate``. The full ``<a ...>``
        markup is written, not just the URL. The file is opened in append
        mode, so repeated runs accumulate lines.
        """
        anchors = response.xpath(
            "//a[contains(@href,'/bow/species/') and @class='notranslate']"
        ).extract()
        # Explicit UTF-8 so non-ASCII characters in the scraped markup do not
        # depend on (or fail under) the platform's default encoding.
        with open("allurls.txt", "a", encoding="utf-8") as out:
            out.writelines(anchor + "\n" for anchor in anchors)