-
Notifications
You must be signed in to change notification settings - Fork 0
/
spider_runner.py
executable file
·43 lines (31 loc) · 1.26 KB
/
spider_runner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import os
import time
import subprocess
from database.db_setup import check_database_connection
from oryx_scrape.spiders_settings import spider_russia, spider_ukraine
from oryx_scrape.log_maintenance import log_maintenance
# Absolute path of the directory containing this file, so the script works
# no matter what the caller's current working directory is.
project_root_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '.'))

# Last-run JSON snapshots (overwritten on every scrape; the database keeps
# the full history). Built with os.path.join rather than string '+'.
output_path_ukraine = os.path.join(project_root_path, 'scraped_data', 'losses_ukraine.json')
output_path_russia = os.path.join(project_root_path, 'scraped_data', 'losses_russia.json')
def run_spider(spider_name, output_file):
    """Run a single Scrapy spider as a subprocess.

    Since the data is being pipelined into a database we just keep the
    last scraped data in JSON files, overwriting them ('-O') at every
    new run.

    Args:
        spider_name: Name of the spider as registered with Scrapy.
        output_file: Path of the JSON file to overwrite with the results.
    """
    # Run from the spiders directory via the cwd= argument instead of
    # os.chdir(), so the parent process's working directory is untouched.
    subprocess.run(
        ['scrapy', 'crawl', spider_name, '-O', output_file],
        cwd=f'{project_root_path}/oryx_scrape/spiders',
    )
def run_spiders():
    """Run both Oryx spiders sequentially, pausing briefly between runs."""
    spiders = [
        (spider_ukraine, output_path_ukraine),
        (spider_russia, output_path_russia),
    ]
    delay_between_runs = 2  # seconds; be gentle between consecutive crawls
    for index, (spider, data_path) in enumerate(spiders):
        # Sleep only *between* runs — the original also slept after the
        # last spider, which served no purpose.
        if index:
            time.sleep(delay_between_runs)
        run_spider(spider, data_path)
def main():
    """Entry point: run log maintenance, check the DB, then run the spiders."""
    # BUG FIX: the original line was a bare name reference
    # (`log_maintenance`), which evaluates the function object and
    # discards it — the maintenance routine never actually ran.
    log_maintenance()
    check_database_connection()
    run_spiders()


if __name__ == '__main__':
    main()