In [28]:
import subprocess
import os

import pandas as pd
pd.set_option('display.max_rows',500)

import requests
from bs4 import BeautifulSoup

import json


![CRISP_DM](Crisp_DM_Tasks.png)

# Business Understanding

We would like to track the spread of the coronavirus across countries, together with local information that is personally relevant to us.

General, global figures are of limited relevance to us, so we want to take a deep dive into the local development of the spread.

# Data Understanding

* RKI, web scraping  https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html
* Johns Hopkins (GitHub)  https://github.com/CSSEGISandData/COVID-19.git
* REST API services to retrieve data  https://npgeo-corona-npgeo-de.hub.arcgis.com/

# GITHUB csv data

git clone/pull https://github.com/CSSEGISandData/COVID-19.git

In [29]:
# Update the local clone of the Johns Hopkins COVID-19 repository.
# The command is passed as an argument list WITHOUT shell=True: when a list is
# combined with shell=True, POSIX systems execute only the first item ("git")
# and hand the remaining items to the shell itself, so "pull" would be dropped.
git_pull = subprocess.Popen(["git", "pull"],
                     cwd = os.path.dirname( 'C:/ProgramData/Anaconda3/eps_covid19/data/raw/COVID-19/' ),
                     stdout = subprocess.PIPE,
                     stderr = subprocess.PIPE )
# communicate() waits for git to finish and returns the captured streams as bytes
(out, error) = git_pull.communicate()


# git writes its fetch summary to stderr, so a non-empty "Error" line
# does not by itself mean that the pull failed.
print("Error : " + str(error))
print("out : " + str(out))

Error : b'From https://github.com/CSSEGISandData/COVID-19\n   29b85c49..80fa13de  master     -> origin/master\n   882e32f6..0e56c684  web-data   -> origin/web-data\n'
out : b'Updating 29b85c49..80fa13de\nFast-forward\n README.md                                                              | 1 +\n csse_covid_19_data/README.md                                           | 3 ++-\n .../csse_covid_19_time_series/time_series_covid19_confirmed_global.csv | 2 +-\n .../csse_covid_19_time_series/time_series_covid19_deaths_global.csv    | 2 +-\n .../csse_covid_19_time_series/time_series_covid19_recovered_global.csv | 2 +-\n 5 files changed, 6 insertions(+), 4 deletions(-)\n'


In [30]:
# Time series of confirmed cases inside the local clone of the Johns Hopkins repository
data_path=('C:/ProgramData/Anaconda3/eps_covid19/data/raw/COVID-19/'
           'csse_covid_19_data/csse_covid_19_time_series/'
           'time_series_covid19_confirmed_global.csv')

# Load the raw CSV and preview its first five rows
pd_raw=pd.read_csv(filepath_or_buffer=data_path)
pd_raw.head(5)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,9/4/20,9/5/20,9/6/20,9/7/20,9/8/20,9/9/20,9/10/20,9/11/20,9/12/20,9/13/20
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,38304,38324,38398,38494,38520,38544,38572,38606,38641,38716
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,9967,10102,10255,10406,10553,10704,10860,11021,11185,11353
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,45773,46071,46364,46653,46938,47216,47488,47752,48007,48254
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,1215,1215,1215,1261,1261,1301,1301,1344,1344,1344
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,2876,2935,2965,2981,3033,3092,3217,3279,3335,3388


In [31]:
#import sys
#sys.path

# Web Scraping

In [32]:
# import requests
# from bs4 import BeautifulSoup

In [33]:
# Download the RKI case-number page.
# The timeout keeps the notebook from hanging indefinitely on a stalled
# connection; raise_for_status() stops on an HTTP error instead of passing
# an error page on to the HTML parser.
page = requests.get("https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html", timeout=30)
page.raise_for_status()

In [34]:
# Parse the raw response bytes into a searchable document tree
soup = BeautifulSoup(markup=page.content, features='html.parser')

In [35]:
#soup.get_text()

In [36]:
# Take the first <table> element of the page
html_table=soup.find('table')
if html_table is None:
    # find() returns None when nothing matches; fail here with a clear message
    # instead of with an AttributeError when the rows are extracted later.
    raise ValueError("No <table> element found on the RKI page - the page layout may have changed")

In [37]:
# Collect every table row (<tr>) of the selected table
all_rows=html_table.find_all(name='tr')

In [38]:
# Start with an empty container for the extracted table rows
final_data_list=list()

In [39]:
# One entry per table row, each holding the stripped text of that row's <td> cells.
# Rows without any <td> cell (presumably the header rows) yield an empty list.
# The list is rebuilt from scratch, so re-running this cell does not append the
# same rows a second time.
final_data_list=[
    [each_col.get_text(strip=True) for each_col in rows.find_all('td')]
    for rows in all_rows
]

In [40]:
# Positional column index -> descriptive column name
rki_column_names={
    0:'State',
    1:'Cases',
    2:'Difference to the previous day',
    3:'Cases in the past 7 days',
    4:'7-day incidence',
    5:'Deaths',
}

# Build the frame from the scraped rows, drop rows with missing cells
# and label the remaining columns.
# NOTE: every value is still the raw cell text returned by get_text().
rki_table_raw=pd.DataFrame(final_data_list)
pd_daily_status=rki_table_raw.dropna().rename(columns=rki_column_names)

In [41]:
# Show the scraped table for a visual check
pd_daily_status

Unnamed: 0,State,Cases,Difference to the previous day,Cases in the past 7 days,7-day incidence,Deaths
2,Baden-Württem­berg,45.254,184,1.423,129,1.867
3,Bayern,61.974,269,2.328,178,2.645
4,Berlin,12.269,15,532.0,142,226.0
5,Branden­burg,4.006,1,65.0,26,169.0
6,Bremen,2.135,5,65.0,95,58.0
7,Hamburg,6.825,11,254.0,138,268.0
8,Hessen,16.877,69,696.0,111,539.0
9,Meck­lenburg-Vor­pommern*,1.055,0,24.0,15,20.0
10,Nieder­sachsen,17.911,67,557.0,70,667.0
11,Nord­rhein-West­falen,62.261,182,1.738,97,1.829


## REST API Calls

In [42]:
# Query all features (where=1=1, all output fields) of the federal-state layer as JSON.
# The timeout prevents the notebook from hanging on a stalled connection and
# raise_for_status() stops on an HTTP error before the body is parsed as JSON.
data= requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronaf%C3%A4lle_in_den_Bundesl%C3%A4ndern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json', timeout=30)
data.raise_for_status()

In [43]:
#data.content

In [44]:
# import json

In [45]:
# Decode the raw response body into Python objects
raw_payload=data.content
json_object=json.loads(raw_payload)

In [46]:
# Check which Python type the decoded response has
type(json_object)

dict

In [47]:
# List the top-level keys of the response to locate the payload
json_object.keys()

dict_keys(['objectIdFieldName', 'uniqueIdField', 'globalIdFieldName', 'geometryProperties', 'serverGens', 'geometryType', 'spatialReference', 'fields', 'features'])

In [48]:
# Each feature keeps its data fields under the 'attributes' key;
# collect one attribute dict per feature.
full_list=[each_dict['attributes'] for each_dict in json_object['features']]

In [49]:
# Turn the list of attribute dicts into a table: one row per dict, one column per key
pd_full_list=pd.DataFrame(data=full_list)
pd_full_list

Unnamed: 0,OBJECTID_1,LAN_ew_AGS,LAN_ew_GEN,LAN_ew_BEZ,LAN_ew_EWZ,OBJECTID,Fallzahl,Aktualisierung,AGS_TXT,GlobalID,faelle_100000_EW,Shape__Area,Shape__Length,Death
0,1,1,Schleswig-Holstein,Land,2896712,15,4231,1600034400000,1,fc5ba936-c95c-432c-8a33-9eb2f30b660f,146.062156,45737310000.0,2881496.0,161
1,2,2,Hamburg,Freie und Hansestadt,1841179,6,6825,1600034400000,2,0f3e860c-5181-4d3f-a421-1d51f50315ea,370.686392,2089396000.0,418800.2,268
2,3,3,Niedersachsen,Land,7982448,9,17911,1600034400000,3,3fd77024-c29b-4843-9be8-682ad48e60c9,224.379789,129983600000.0,4008988.0,667
3,4,4,Bremen,Freie Hansestadt,682986,5,2135,1600034400000,4,4132268b-54de-4327-ac1e-760e915112f1,312.597916,1119157000.0,335717.7,58
4,5,5,Nordrhein-Westfalen,Land,17932651,10,62261,1600034400000,5,561d658f-3ee5-46e3-bc95-3528c6558ab9,347.193508,87829360000.0,2648673.0,1829
5,6,6,Hessen,Land,6265809,7,16877,1600034400000,6,93277ac4-e8fc-48c7-8940-028dc2ed66af,269.350694,52359130000.0,2148244.0,539
6,7,7,Rheinland-Pfalz,Land,4084844,11,9764,1600034400000,7,e9b4296f-9be2-4e53-9a58-ccf1396cb03d,239.029936,47838770000.0,1774430.0,246
7,8,8,Baden-Württemberg,Land,11069533,1,45254,1600034400000,8,80394ddf-c6a4-4a6e-be8e-0259a81b22a9,408.815801,81517320000.0,2544320.0,1867
8,9,9,Bayern,Freistaat,13076721,2,61974,1600034400000,9,1ff920f4-62cd-4a4f-b8c9-f042f2a3e00a,473.926147,163485500000.0,3898618.0,2645
9,10,10,Saarland,Land,990509,12,3245,1600034400000,10,e3396a6f-8a30-4fdf-8df7-def77dd38bea,327.60934,6060692000.0,562678.9,175


In [50]:
#pd_full_list.to_csv('../data/raw/NPGEO/GER_state_data.csv',sep=';') #not working, ask someone

In [51]:
# Persist the state-level table as a semicolon-separated file
pd_full_list.to_csv(path_or_buf='C:/ProgramData/Anaconda3/eps_covid19/data/raw/NPGEO/GER_state_data.csv',
                    sep=';')

In [52]:
# Number of rows in the state-level table
len(pd_full_list)

16

## API access via REST service, e.g. USA data

Example of a REST-conformant interface (note: registration is mandatory):
www.smartable.ai
The website is down!!!

# Endpoint that serves the US statistics
url_endpoint='https://api.smartable.ai/coronavirus/stats/US'

# The subscription key is a personal credential and must not be stored in the
# notebook. It is read from the environment instead; a KeyError naming the
# variable is raised when it is not set.
# NOTE(review): a key that was ever committed in plain text should be treated
# as leaked and rotated.
headers = {
    'Cache-Control': 'no-cache',
    'Subscription-Key': os.environ['SMARTABLE_SUBSCRIPTION_KEY'],
}

# The timeout keeps the notebook from hanging if the service does not answer
response = requests.get(url_endpoint, headers=headers, timeout=30)
print(response)
# Stop on an HTTP error instead of trying to parse an error body as JSON
response.raise_for_status()

US_dict=json.loads(response.content) # decode the response body into Python objects
# Keep a pretty-printed copy of the response on disk
with open('C:/ProgramData/Anaconda3/eps_covid19/data/raw/SMARTABLE/US_data.txt', 'w') as outfile:
    json.dump(US_dict, outfile,indent=2)

print(json.dumps(US_dict,indent=2)) #string dump