-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
109 lines (101 loc) · 3.61 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import re
import time
import numpy
import threading
from spider import Spider
# Regex whose single group captures the file id from the "pdf" download link
# of each entry in a listing page's HTML.
S_PATTERN_PDF = r'<span class="list-identifier">.*?\[<a href="/pdf/(.*?)" title="Download PDF">pdf</a>'
# Regex whose single group captures the total number of entries reported by a
# listing page.
# NOTE: "EMTRIES" is a misspelling of "ENTRIES"; the name is kept because
# other code in this file refers to it.
S_PATTERN_EMTRIES = r'\[ total of (.*?) entries:'
# Base URLs of the mirrors to crawl; one is chosen at random for the initial
# Spider and again before each paged listing request and each file download.
BASE_URLs = (
    # The main site is deliberately left out of the rotation
    # (presumably to avoid loading it -- TODO confirm):
    # 'https://arxiv.org',
    'http://de.arxiv.org',
    'http://cn.arxiv.org',
    'http://lanl.arxiv.org',
    'http://xxx.itp.ac.cn'
)
# Directory handed to Spider as the place to save the crawled files.
SAVE_DIR = './pdfs/'
# File ids (file names without the ".pdf" suffix) still waiting to be
# downloaded. download() rebinds it for every listing page and the Worker
# threads pop entries from the front.
task_list = []
# Guards task_list (and the mirror selection done while taking a task).
task_list_lock = threading.Lock()
# Single Spider instance shared by the main thread and all workers; it starts
# on a randomly chosen mirror.
spider = Spider(BASE_URLs[numpy.random.randint(0, len(BASE_URLs))], SAVE_DIR)
# Controls how many Worker threads download() starts for each listing page.
WORKER_NUM = 10
# Two-digit year / month bounds of the monthly listings to crawl. The main
# script starts at END_YEAR/END_MONTH and walks backwards one month at a time
# towards BEGIN_YEAR/BEGIN_MONTH.
BEGIN_YEAR = 15
BEGIN_MONTH = 1
END_YEAR = 18
END_MONTH = 3
class Worker(threading.Thread):
    """Thread that drains the shared ``task_list`` and downloads each file.

    Each worker repeatedly takes the first file id from the module-level
    ``task_list`` and asks the shared ``spider`` to fetch ``/pdf/<file_id>``
    into ``<file_id>.pdf``. It exits once the list is empty.

    Args:
        thread_id: Identifier stored on the instance as ``threadID``.
        thread_name: Label stored on the instance as ``thread_name``.
    """

    def __init__(self, thread_id, thread_name):
        threading.Thread.__init__(self)
        self.threadID = thread_id
        self.thread_name = thread_name

    def run(self):
        """Download queued files until ``task_list`` is empty."""
        while True:
            # Use the lock as a context manager so it is always released,
            # even if something inside the critical section raises; a manual
            # acquire()/release() pair would leave the other workers blocked
            # forever in that case.
            with task_list_lock:
                if not task_list:
                    break
                file_id = task_list.pop(0)
                # Randomly select the mirror address to download from.
                spider.base_url = BASE_URLs[numpy.random.randint(0, len(BASE_URLs))]
                # Sleep 2 seconds before downloading. The lock is still held
                # here, so workers pick up tasks at most one every 2 seconds.
                time.sleep(2)
            file_name = "%s.pdf" % file_id
            file_url = '/pdf/%s' % file_id
            spider.get_file(file_name, file_url)
            # Sleep 2 seconds after downloading.
            time.sleep(2)
def download(_year, _month):
    """Download every PDF listed for one month of the ``cs`` archive.

    The listing ``/list/cs/YYMM`` is fetched once to read the total number of
    entries, then fetched again page by page (``show`` entries per page). For
    each page the file ids are extracted into the global ``task_list`` and a
    group of ``Worker`` threads is started and joined to download them.

    Args:
        _year: Two-digit year of the listing (formatted with ``%02d``).
        _month: Month of the listing (formatted with ``%02d``).

    Returns:
        True when all pages were processed; False when the total-entries
        pattern does not match the first page or when a page yields no file
        ids (the remaining pages of the month are then skipped).
    """
    global task_list
    # skip is the offset into the listing, show is the number of entries per page
    skip = 0
    show = 100
    # Compile both patterns once instead of once per page.
    pattern_total = re.compile(S_PATTERN_EMTRIES, re.S)
    pattern_pdf = re.compile(S_PATTERN_PDF, re.S)

    page_url = '/list/cs/%02d%02d' % (_year, _month)
    page = spider.get_page(page_url)
    total_obj = pattern_total.search(page)
    if not total_obj:
        print('total match failed:[%s]' % S_PATTERN_EMTRIES)
        return False
    total_num = int(total_obj.group(1))
    print("page_url:%s\ntotal_num:%d" % (page_url, total_num))

    while skip < total_num:
        # Randomly select the mirror address to download from.
        spider.base_url = BASE_URLs[numpy.random.randint(0, len(BASE_URLs))]
        # e.g.: /list/cs/1801?skip=0&show=100
        page_url = '/list/cs/%02d%02d?skip=%d&show=%d' % (_year, _month, skip, show)
        page = spider.get_page(page_url)
        # Collect all the file ids of this page; the workers consume this list.
        task_list = pattern_pdf.findall(page)
        # Remember the size now: the workers empty task_list while running.
        task_list_size = len(task_list)
        if not task_list:
            print('files match failed:[%s]' % S_PATTERN_PDF)
            return False

        workers = []
        # range(1, WORKER_NUM) would start only WORKER_NUM - 1 threads; the
        # inclusive upper bound starts exactly WORKER_NUM workers (ids 1..N).
        for index in range(1, WORKER_NUM + 1):
            worker = Worker(index, "Thread-%d" % index)
            worker.start()
            workers.append(worker)
        for worker in workers:
            worker.join()

        print("Ranging from %d to %d was downloaded!" % (skip + 1, skip + task_list_size))
        skip = skip + show
        # Pause between pages.
        time.sleep(10)
    return True
if __name__ == '__main__':
    # Walk backwards one month at a time from END_YEAR/END_MONTH down to
    # BEGIN_YEAR/BEGIN_MONTH. Both ends are inclusive: the previous strict
    # comparison stopped before the begin month and never downloaded it.
    _year = END_YEAR
    _month = END_MONTH
    while (_year, _month) >= (BEGIN_YEAR, BEGIN_MONTH):
        download(_year, _month)
        if _month == 1:
            # Step from January to December of the previous year.
            _month = 12
            _year = _year - 1
        else:
            _month = _month - 1
    print("All tasks finished! Exiting Main Thread")