In [1]:
import json
import os
import re

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [2]:
class CovidSpider(object):
    """Crawler for the COVID statistics page at ncov.dxy.cn.

    The crawl happens in four stages (see ``run``): two stages scrape a JSON
    array embedded in the home page and store it, and two stages read a stored
    array back, download the per-entry history referenced by each entry's
    ``statisticsData`` URL and store the merged result. All files are JSON
    and live under ``data_dir``.
    """

    def __init__(self, data_dir='data', timeout=30):
        """
        :param data_dir: directory the JSON files are written to / read from
        :param timeout: seconds to wait for each HTTP request before giving up
        """
        self.home_url = 'https://ncov.dxy.cn/ncovh5/view/pneumonia'
        self.data_dir = data_dir
        self.timeout = timeout

    def _data_path(self, filename):
        """Return the path of ``filename`` inside the configured data directory."""
        return os.path.join(self.data_dir, filename)

    def get_content_from_url(self, url):
        """
        Download ``url`` and return the response body as text.
        :param url: requested url
        :return: response body decoded with ``bytes.decode()`` (UTF-8)
        :raises requests.RequestException: on timeout, connection failure or
            an HTTP error status
        """
        # Without a timeout a single stalled connection blocks the crawl forever.
        response = requests.get(url, timeout=self.timeout)
        # Fail here rather than handing an error page to the JSON/HTML parsers.
        response.raise_for_status()
        return response.content.decode()

    def parse_home_page(self, home_page, tag_id):
        """
        Extract the JSON array embedded in one element of the home page.
        :param home_page: HTML text of the home page
        :param tag_id: ``id`` attribute of the element that holds the data
        :return: the decoded JSON array as Python data
        :raises ValueError: if the element is missing, holds no ``[...]``
            array, or the array is not valid JSON
        """
        soup = BeautifulSoup(home_page, 'html5lib')
        script = soup.find(id=tag_id)
        if script is None:
            raise ValueError('no element with id %r found in home page' % (tag_id,))

        # The element text wraps the array in other text; keep only the
        # outermost ``[...]`` span (greedy match, first occurrence).
        match = re.search(r'\[.+\]', script.text)
        if match is None:
            raise ValueError('no JSON array found in element with id %r' % (tag_id,))

        # json.JSONDecodeError is a ValueError subclass, so every parse
        # failure in this method surfaces as ValueError.
        return json.loads(match.group())

    def parse_corona_virus(self, latest_covid_China, desc):
        """
        Download and merge the history of every entry in ``latest_covid_China``.

        Each entry must provide ``statisticsData`` (URL of a JSON document
        with a ``data`` list) and ``provinceName``. Every record of that list
        is tagged with the entry's ``provinceName`` and, when the entry has a
        non-empty one, its ``countryShortCode``.
        :param latest_covid_China: iterable of entry dicts
        :param desc: label shown on the tqdm progress bar
        :return: flat list with the tagged records of all entries
        """
        corona_virus = []
        for country in tqdm(latest_covid_China, desc=desc):
            statistics_data_json_str = self.get_content_from_url(country['statisticsData'])
            statistics_data = json.loads(statistics_data_json_str)['data']
            for one_day in statistics_data:
                one_day['provinceName'] = country['provinceName']
                if country.get('countryShortCode'):
                    one_day['countryShortCode'] = country['countryShortCode']
            corona_virus.extend(statistics_data)
        return corona_virus

    def load(self, path):
        """
        Read a JSON file.
        :param path: file to read
        :return: the decoded Python data
        """
        # Explicit UTF-8 so the file written by ``save`` reads back correctly
        # regardless of the platform's default encoding.
        with open(path, encoding='utf-8') as fp:
            return json.load(fp)

    def save(self, data, path):
        """
        Write ``data`` to ``path`` as JSON, creating parent directories.
        :param data: JSON-serialisable Python data
        :param path: file to write
        """
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        # ensure_ascii=False emits non-ASCII characters verbatim, so the file
        # must be opened as UTF-8 instead of the locale's default encoding.
        with open(path, 'w', encoding='utf-8') as fp:
            json.dump(data, fp, ensure_ascii=False)

    def crawl_latest_covid(self):
        """
        Scrape the array stored under the home page element
        ``getListByCountryTypeService2true`` and save it as latest_covid.json.
        """
        # 1. download the home page
        home_page = self.get_content_from_url(self.home_url)
        # 2. extract the embedded array
        latest_covid = self.parse_home_page(home_page, tag_id='getListByCountryTypeService2true')
        # 3. save data as json file
        self.save(latest_covid, self._data_path('latest_covid.json'))

    def crawl_corona_virus(self):
        """
        Download the history of every entry in latest_covid.json and save the
        merged records as corona_virus.json.
        """
        # 1. load the entries saved by crawl_latest_covid
        latest_covid = self.load(self._data_path('latest_covid.json'))
        # 2. fetch each entry's history and merge the records
        corona_virus = self.parse_corona_virus(latest_covid, desc = 'Collect covid data starting from January 23rd in countries')
        # 3. save data as json file
        self.save(corona_virus, self._data_path('corona_virus.json'))

    def crawl_latest_covid_china(self):
        """
        Scrape the array stored under the home page element ``getAreaStat``
        and save it as latest_covid_China.json.
        """
        # 1. download the home page
        home_page = self.get_content_from_url(self.home_url)
        # 2. extract the embedded array
        latest_covid_China = self.parse_home_page(home_page,tag_id='getAreaStat')
        # 3. save data as json file
        self.save(latest_covid_China, self._data_path('latest_covid_China.json'))

    def crawl_corona_virus_of_china_province(self):
        """
        Download the history of every entry in latest_covid_China.json and
        save the merged records as covid_China_province.json.
        """
        # 1. load the entries saved by crawl_latest_covid_china
        latest_covid_China = self.load(self._data_path('latest_covid_China.json'))
        # 2. fetch each entry's history and merge the records
        corona_virus = self.parse_corona_virus(latest_covid_China, 'Collect covid data starting from January 22rd in China provinces')
        # 3. save data as json file
        self.save(corona_virus, self._data_path('covid_China_province.json'))

    def run(self):
        """Run all four crawl stages.

        Order matters: each history stage loads the file written by the
        stage immediately before it.
        """
        self.crawl_latest_covid()
        self.crawl_corona_virus()
        self.crawl_latest_covid_china()
        self.crawl_corona_virus_of_china_province()
# Entry point: build a spider and run every crawl stage. The instance is kept
# in the module-level name `spider` so it stays available after this runs.
if __name__ == '__main__':
    spider = CovidSpider()
    spider.run()

Collect covid data starting from January 23rd in countries: 100%|██████████| 215/215 [01:51<00:00,  1.93it/s]
Collect covid data starting from January 22rd in China provinces: 100%|██████████| 34/34 [00:11<00:00,  3.04it/s]
