In [1]:
from bs4 import BeautifulSoup
from requests import get

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import math 
import os

In [2]:
# Helper that strips thousands separators from scraped numbers
def removeCommas(string):
    """Return ``string`` with every comma removed, e.g. '1,234' -> '1234'."""
    return string.replace(',', '')

# Scrape data from Worldometers

In [81]:
# Check that worldometers can be reached.
# The request succeeded if the value displayed below is <Response [200]>.
headers = ({'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'})
worldometers = "https://www.worldometers.info/coronavirus/#countries"
# requests waits indefinitely when no timeout is given; bound the wait so the
# notebook fails with an exception instead of hanging if the site never answers.
response = get(worldometers, headers=headers, timeout=30)
response

<Response [200]>

In [82]:
# Parse the downloaded worldometers page
html_soup = BeautifulSoup(response.text, 'html.parser')
# Inspecting the page shows the figures live in 'tbody' and the column titles in 'thead'
table_contents = html_soup.find_all('tbody')
table_header = html_soup.find_all('thead')

# Column titles: the string form of each <th>'s child list, taken from the first <thead>
header = [str(head_title.contents) for head_title in table_header[0].find_all('th')]

# One list of <td> cells for every <tr> of the first <tbody>
body_rows = [row.find_all('td') for row in table_contents[0].find_all('tr')]


def _column(position):
    """Return the first child of the cell at ``position`` for every table row."""
    return [cells[position].contents[0] for cells in body_rows]


# One list per scraped column, in page order
CountryName = _column(0)
TotalCases = _column(1)
NewCases = _column(2)
TotalDeaths = _column(3)
NewDeaths = _column(4)
TotalRecovered = _column(5)
ActiveCases = _column(6)
SeriousCritical = _column(7)

# Pair the first eight header titles with their value lists
CaseTable = pd.DataFrame({
    header[position]: values
    for position, values in enumerate([CountryName,
                                       TotalCases,
                                       NewCases,
                                       TotalDeaths,
                                       NewDeaths,
                                       TotalRecovered,
                                       ActiveCases,
                                       SeriousCritical])
})

CaseTable.head(80)

Unnamed: 0,"['Country,', <br>Other</br>]","['Total', <br>Cases</br>]","['New', <br>Cases</br>]","['Total', <br>Deaths</br>]","['New', <br>Deaths</br>]","['Total', <br/>, 'Recovered']","['Active', <br/>, 'Cases']","['Serious,', <br/>, 'Critical']"
0,China,80701,+50,3097,+27,57320,20284,5264
1,S. Korea,7313,+272,50,+2,130,7133,36
2,Iran,6566,+743,194,+49,2134,4238,
3,Italy,5883,,233,,589,5061,567
4,France,949,,16,,12,921,45
...,...,...,...,...,...,...,...,...
75,Malta,3,,,,,3,
76,Slovakia,3,,,,,3,
77,South Africa,3,+1,,,,3,
78,Cambodia,2,,,,1,1,


In [83]:
# Keep only the country name and the three cumulative counters (columns 0, 1, 3, 5).
# Columns are picked by position because the scraped header labels are raw tag
# reprs. The last row (the totals line) is dropped. .copy() detaches the result
# from CaseTable so the column assignments below act on an independent frame.
caseTableSimple = CaseTable.iloc[:-1, [0, 1, 3, 5]].copy()
caseTableSimple.columns = ['Country/Region', 'Confirmed', 'Deaths', 'Recovered']
# Remove leading and trailing spaces from every element
caseTableSimple = caseTableSimple.apply(lambda col: col.str.strip())
# Remove thousands separators from every element
caseTableSimple = caseTableSimple.apply(lambda col: col.map(removeCommas))
# Replace empty strings with '0'. This also turns the empty name of the
# 'Diamond Princess' row into '0', which is how that row is recognised below.
caseTableSimple = caseTableSimple.replace('', '0')
# Convert every column to its proper type
caseTableSimple = caseTableSimple.astype({'Country/Region':'str',
                                          'Confirmed':'int',
                                          'Deaths':'int',
                                          'Recovered':'int',
                                         })
# Data for these countries come from other sources
removeRegion = ['China', 'Canada', 'Australia', 'USA']
caseTableSimple = caseTableSimple[~caseTableSimple['Country/Region'].isin(removeRegion)].copy()

# Naming conventions of the older data set:
#   scraped name -> (Province/State, Country/Region)
# '0' is the Diamond Princess row (its scraped name is empty, see above); the
# older data files it under Japan with 'Yokohama' as Province/State.
regionOverrides = {
    '0': ('Yokohama', 'Japan'),
    'Belgium': ('Brussels', 'Belgium'),
    'Macao': ('Macau', 'Macau'),
    'Hong Kong': ('Hong Kong', 'Hong Kong'),
    'Taiwan': ('Taiwan', 'Taiwan'),
    'S. Korea': ('', 'South Korea'),
    'UAE': ('', 'United Arab Emirates'),
}

# Add column 'Province/State' with empty value, then apply the overrides.
# Masks are computed from a snapshot of the scraped names so that one rename
# can never influence another. .loc with a boolean mask is used because .at
# only addresses a single scalar cell.
caseTableSimple['Province/State'] = ''
scrapedNames = caseTableSimple['Country/Region'].copy()
for scrapedName, (provinceState, countryRegion) in regionOverrides.items():
    rowMask = scrapedNames == scrapedName
    caseTableSimple.loc[rowMask, 'Province/State'] = provinceState
    caseTableSimple.loc[rowMask, 'Country/Region'] = countryRegion

# The older data used US time as Last Update time.
# NOTE(review): datetime.now() returns the local time of the machine running
# the notebook - confirm that matches the intended time zone.
currentTime = datetime.now()
lastUpdateTime = currentTime.strftime('%m/%d/%Y %H:%M')
# Drop only the zero padding of the month ('03/08/2020' -> '3/08/2020').
# Slicing off the first character would turn '10/…' into '0/…' in Oct-Dec.
caseTableSimple['Last Update'] = lastUpdateTime.lstrip('0')

# Reorder the columns to match the older data
columnList = ['Province/State', 'Country/Region', 'Last Update', 'Confirmed', 'Deaths', 'Recovered']
caseTableSimple = caseTableSimple[columnList]

# Scrape data for the US and Canada (US_CAN)

In [84]:
# Check that the 1point3acres US/Canada tracker can be reached.
# The request succeeded if the value displayed below is <Response [200]>.
US_Canada = "https://coronavirus.1point3acres.com/en"
# requests waits indefinitely when no timeout is given; bound the wait so the
# notebook fails with an exception instead of hanging if the site never answers.
response2 = get(US_Canada, headers=headers, timeout=30)
response2

<Response [200]>

In [85]:
# Parse the HTML of the US/Canada tracker page held in response2
html_soup2 = BeautifulSoup(response2.text, 'html.parser')

In [86]:
# The figures are read from the <span> elements carrying one specific CSS
# class, taken four at a time: location, confirmed, recovered, deaths.
# NOTE(review): 'jsx-2915694336' looks like a build-generated class name and
# may change when the site is redeployed - confirm before relying on it.
# The spans are looked up once; repeating find_all for every index re-scans
# the whole document each time.
statSpans = html_soup2.find_all('span', class_='jsx-2915694336')


def _firstContent(span):
    """Return the first child of ``span``, or 0 when the span has no children."""
    return span.contents[0] if len(span.contents) else 0


# Positions of the 1st, 2nd, 3rd and 4th span of every complete group of four
list1 = range(0, len(statSpans)-3, 4)
list2 = range(1, len(statSpans)-2, 4)
list3 = range(2, len(statSpans)-1, 4)
list4 = range(3, len(statSpans)-0, 4)

Locations = [_firstContent(statSpans[index]) for index in list1]
Confirmed = [_firstContent(statSpans[index]) for index in list2]
Recovered = [_firstContent(statSpans[index]) for index in list3]
Deaths = [_firstContent(statSpans[index]) for index in list4]

US_Can_data = pd.DataFrame({'Province/State':Locations,
                            'Confirmed':Confirmed,
                            'Deaths':Deaths,
                            'Recovered':Recovered,
                            })

# Remove rows that are not data: header rows carry the column title '死亡'
# in the deaths position. The original row labels are kept.
US_Can_data = US_Can_data[US_Can_data['Deaths'] != '死亡'].copy()

In [87]:
# Inspect the scraped table; the place names are still the Chinese strings
# taken from the page at this point.
US_Can_data

Unnamed: 0,Province/State,Confirmed,Deaths,Recovered
1,华盛顿州,110,16,1
2,加州,108,1,2
3,纽约,89,0,0
4,钻石公主号,42,0,0
5,德克萨斯,13,0,0
6,马萨诸塞,13,0,1
7,佛罗里达,12,2,0
8,科罗拉多,8,0,0
9,俄勒冈,7,0,0
10,乔治亚,7,0,0


In [88]:
# Lookup table used to translate the scraped Chinese place names.
# The merge cell that follows relies on the columns 'Chinese', 'English',
# 'Abbr.' and 'Country/Region' being present in this file.
nameList = pd.read_csv('./web_data/statesNameTranslation.csv')

In [89]:
# Attach the English name and country to every scraped row by matching the
# Chinese place name. With a left join, rows whose name is missing from
# nameList are kept but get empty 'Province/State' and 'Country/Region' values.
US_Can_data_EN = pd.merge(US_Can_data, nameList, how = 'left', left_on = 'Province/State', right_on = 'Chinese')
# Drop the Chinese name columns and the abbreviation; the English name becomes 'Province/State'
US_Can_data_EN = US_Can_data_EN.drop(['Chinese', 'Province/State', 'Abbr.'], axis=1)
US_Can_data_EN = US_Can_data_EN.rename(columns={'English':'Province/State'})
# Drop only the zero padding of the month ('03/08/2020' -> '3/08/2020').
# Slicing off the first character would turn '10/…' into '0/…' in Oct-Dec.
US_Can_data_EN['Last Update'] = lastUpdateTime.lstrip('0')
# Same column order as the older data
columnOrder = ['Province/State', 'Country/Region', 'Last Update','Confirmed', 'Deaths', 'Recovered']
US_Can_data_EN = US_Can_data_EN[columnOrder]
US_Can_data_EN

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,WA,US,3/08/2020 22:00,110,16,1
1,California,US,3/08/2020 22:00,108,1,2
2,New York,US,3/08/2020 22:00,89,0,0
3,From Diamond Princess cruise,US,3/08/2020 22:00,42,0,0
4,Texas,US,3/08/2020 22:00,13,0,0
5,Massachusetts,US,3/08/2020 22:00,13,0,1
6,Florida,US,3/08/2020 22:00,12,2,0
7,Colorado,US,3/08/2020 22:00,8,0,0
8,Oregon,US,3/08/2020 22:00,7,0,0
9,Georgia,US,3/08/2020 22:00,7,0,0


In [90]:
# Stack the US/Canada rows on top of the worldometers rows and renumber the
# index from 0 (ignore_index=True).
# NOTE(review): the counts in caseTableSimple were cast to int, while the
# counts in US_Can_data_EN are never cast in the visible cells - confirm that
# mixed types in these columns are acceptable downstream.
finalTable = pd.concat([US_Can_data_EN, caseTableSimple], ignore_index=True)
finalTable

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,WA,US,3/08/2020 22:00,110,16,1
1,California,US,3/08/2020 22:00,108,1,2
2,New York,US,3/08/2020 22:00,89,0,0
3,From Diamond Princess cruise,US,3/08/2020 22:00,42,0,0
4,Texas,US,3/08/2020 22:00,13,0,0
...,...,...,...,...,...,...
134,,Liechtenstein,3/08/2020 22:00,1,0,0
135,,Moldova,3/08/2020 22:00,1,0,0
136,,Paraguay,3/08/2020 22:00,1,0,0
137,,Serbia,3/08/2020 22:00,1,0,0


In [91]:
# Save the combined table under a name stamped with the scrape time
# (month_day_year_hour_minute), without the index column.
timeStampe = currentTime.strftime('%m_%d_%Y_%H_%M')
outputPath = './web_data/{}_webData.csv'.format(timeStampe)
finalTable.to_csv(outputPath, index=False)

# Scrape data for China

In [74]:
# Check that the DXY (ncov.dxy.cn) page for China can be reached.
# The request succeeded if the value displayed below is <Response [200]>.
CHN = "https://ncov.dxy.cn/ncovh5/view/pneumonia?scene=2&clicktime=1579582238&enterid=1579582238&from=singlemessage&isappinstalled=0"
# requests waits indefinitely when no timeout is given; bound the wait so the
# notebook fails with an exception instead of hanging if the site never answers.
response3 = get(CHN, headers=headers, timeout=30)
# Without this line the text comes out garbled; with it the page displays
# correctly. utf-8 is chosen to match the encoding declared in the page source.
response3.encoding='utf-8'
response3

<Response [200]>

In [75]:
# Parse the HTML of the DXY page held in response3
html_soup3 = BeautifulSoup(response3.text, 'html.parser')

In [76]:
# Debug aid: print the whole prettified page to find where the data lives.
# NOTE(review): this prints the entire document - consider clearing the output
# (or removing the cell) before sharing the notebook.
print(html_soup3.prettify())

<!DOCTYPE html>
<html lang="zh-cn" xmlns:layout="http://www.ultraq.net.nz/web/thymeleaf/layout">
 <head>
  <link href="//assets.dxycdn.com/gitrepo/ncov-mobile/dist/umi.bundle.css?t=1583497981741" rel="stylesheet"/>
  <meta charset="utf-8"/>
  <meta content="width=device-width,initial-scale=1,user-scalable=0,viewport-fit=cover" name="viewport"/>
  <meta content="#000000" name="theme-color"/>
  <title>
   全球新冠病毒最新实时疫情地图_丁香园
  </title>
  <script>
   window.routerBase = "/ncovh5/view";
  </script>
  <script charset="utf-8" src="//assets.dxycdn.com/gitrepo/ncov-mobile/dist/vendors~p__ECommerce~p__Pneumonia~p__Pneumonia__area~p__Pneumonia__recommend-list~p__Pneumonia__rumo~5e297593.async.13df3f6e.js">
  </script>
  <script charset="utf-8" src="//assets.dxycdn.com/gitrepo/ncov-mobile/dist/vendors~p__Pneumonia~p__Pneumonia__area~p__Pneumonia__rumor-list.async.9184546f.js">
  </script>
  <link href="//assets.dxycdn.com/gitrepo/ncov-mobile/dist/vendors~p__ECommerce~p__Pneumonia~p__Pneumonia__are

In [80]:
# The per-province figures are embedded in the page as a JavaScript assignment
# ("window.getAreaStat = [...]") inside the <script id="getAreaStat"> tag.
# Show the raw contents of the first such tag; [0] raises IndexError if the
# page contains no tag with that id.
html_soup3.find_all('script', id='getAreaStat')[0].contents

['try { window.getAreaStat = [{"provinceName":"湖北省","provinceShortName":"湖北","currentConfirmedCount":19568,"confirmedCount":67707,"suspectedCount":0,"curedCount":45153,"deadCount":2986,"comment":"","locationId":420000,"statisticsData":"https://file1.dxycdn.com/2020/0223/618/3398299751673487511-135.json","cities":[{"cityName":"武汉","currentConfirmedCount":17634,"confirmedCount":49912,"suspectedCount":0,"curedCount":29908,"deadCount":2370,"locationId":420100},{"cityName":"孝感","currentConfirmedCount":369,"confirmedCount":3518,"suspectedCount":0,"curedCount":3024,"deadCount":125,"locationId":420900},{"cityName":"鄂州","currentConfirmedCount":352,"confirmedCount":1394,"suspectedCount":0,"curedCount":988,"deadCount":54,"locationId":420700},{"cityName":"随州","currentConfirmedCount":187,"confirmedCount":1307,"suspectedCount":0,"curedCount":1077,"deadCount":43,"locationId":421300},{"cityName":"宜昌","currentConfirmedCount":170,"confirmedCount":931,"suspectedCount":0,"curedCount":727,"deadCount":34,"l

In [None]:
{"provinceName":"湖北省","provinceShortName":"湖北","currentConfirmedCount":19568,"confirmedCount":67707,"suspectedCount":0,"curedCount":45153,"deadCount":2986,"comment":"","locationId":420000,"statisticsData":"https://file1.dxycdn.com/2020/0223/618/3398299751673487511-135.json","cities":[{"cityName":"武汉","currentConfirmedCount":17634,"confirmedCount":49912,"suspectedCount":0,"curedCount":29908,"deadCount":2370,"locationId":420100},{"cityName":"孝感","currentConfirmedCount":369,"confirmedCount":3518,"suspectedCount":0,"curedCount":3024,"deadCount":125,"locationId":420900},{"cityName":"鄂州","currentConfirmedCount":352,"confirmedCount":1394,"suspectedCount":0,"curedCount":988,"deadCount":54,"locationId":420700},{"cityName":"随州","currentConfirmedCount":187,"confirmedCount":1307,"suspectedCount":0,"curedCount":1077,"deadCount":43,"locationId":421300},{"cityName":"宜昌","currentConfirmedCount":170,"confirmedCount":931,"suspectedCount":0,"curedCount":727,"deadCount":34,"locationId":420500},{"cityName":"荆州","currentConfirmedCount":155,"confirmedCount":1580,"suspectedCount":0,"curedCount":1376,"deadCount":49,"locationId":421000},{"cityName":"黄冈","currentConfirmedCount":151,"confirmedCount":2907,"suspectedCount":0,"curedCount":2631,"deadCount":125,"locationId":421100},{"cityName":"荆门","currentConfirmedCount":146,"confirmedCount":928,"suspectedCount":0,"curedCount":743,"deadCount":39,"locationId":420800},{"cityName":"黄石","currentConfirmedCount":95,"confirmedCount":1015,"suspectedCount":0,"curedCount":884,"deadCount":36,"locationId":420200},{"cityName":"十堰","currentConfirmedCount":93,"confirmedCount":672,"suspectedCount":0,"curedCount":571,"deadCount":8,"locationId":420300},{"cityName":"襄阳","currentConfirmedCount":82,"confirmedCount":1175,"suspectedCount":0,"curedCount":1055,"deadCount":38,"locationId":420600},{"cityName":"仙桃","currentConfirmedCount":53,"confirmedCount":575,"suspectedCount":0,"curedCount":501,"deadCount":21,"locationId":429004},{"cityName":"天门","currentConfirmedCount":2
4,"confirmedCount":496,"suspectedCount":0,"curedCount":457,"deadCount":15,"locationId":429006},{"cityName":"咸宁","currentConfirmedCount":21,"confirmedCount":836,"suspectedCount":0,"curedCount":801,"deadCount":14,"locationId":421200},{"cityName":"潜江","currentConfirmedCount":21,"confirmedCount":198,"suspectedCount":0,"curedCount":168,"deadCount":9,"locationId":429005},{"cityName":"恩施州","currentConfirmedCount":15,"confirmedCount":252,"suspectedCount":0,"curedCount":231,"deadCount":6,"locationId":422800},{"cityName":"神农架林区","currentConfirmedCount":0,"confirmedCount":11,"suspectedCount":0,"curedCount":11,"deadCount":0,"locationId":429021}]}