In [25]:
# Importing modules
import json

import requests
from bs4 import BeautifulSoup

import subprocess
import os

import pandas as pd

# Allow up to 400 rows in a rendered DataFrame before pandas truncates the view.
pd.options.display.max_rows = 400



![CRISP_DM](../reports/figures/CRISP_DM.png)

# Data Understanding

* RKI (web scraping) https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html
* Johns Hopkins (GitHub) https://github.com/CSSEGISandData/COVID-19.git
* REST API services to retrieve data https://npgeo-corona-npgeo-de.hub.arcgis.com/

In [4]:
# Location of the OWID COVID-19 CSV on GitHub; pandas reads it directly
# from the URL into a DataFrame.
url = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv'
pd_raw = pd.read_csv(filepath_or_buffer=url)


In [48]:
# Display the raw frame; pandas abbreviates it to the first and last rows.
pd_raw

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197479,ZWE,Africa,Zimbabwe,2022-06-24,255309.0,173.0,79.429,5549.0,7.0,2.286,...,1.6,30.7,36.791,1.7,61.49,0.571,,,,
197480,ZWE,Africa,Zimbabwe,2022-06-25,255309.0,0.0,79.429,5549.0,0.0,2.286,...,1.6,30.7,36.791,1.7,61.49,0.571,,,,
197481,ZWE,Africa,Zimbabwe,2022-06-26,255355.0,46.0,79.143,5549.0,0.0,2.143,...,1.6,30.7,36.791,1.7,61.49,0.571,,,,
197482,ZWE,Africa,Zimbabwe,2022-06-27,255383.0,28.0,78.857,5549.0,0.0,2.143,...,1.6,30.7,36.791,1.7,61.49,0.571,,,,


# Web scraping

In [29]:
# Download the RKI case-number page.
# The timeout keeps the cell from hanging indefinitely on an unresponsive
# server, and raise_for_status() turns an HTTP error response into an
# exception instead of letting an error page be parsed further down.
page = requests.get(
    "https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html",
    timeout=30,
)
page.raise_for_status()

In [30]:
# Parse the downloaded HTML bytes with Python's built-in 'html.parser' backend.
soup = BeautifulSoup(page.content, 'html.parser')

In [31]:
# Take the first <table> element of the page. This only yields the case table
# as long as it is the first (or only) table in the document.
html_table=soup.find('table')
if html_table is None:
    # find() returns None when nothing matches; fail here with a clear message
    # instead of an AttributeError when the rows are extracted.
    raise ValueError("No <table> element found on the RKI page; the page layout may have changed.")

In [32]:
# Collect every row (<tr>) found inside the table.
all_rows=html_table.find_all('tr')

In [13]:
# Start from an empty list; it holds one list of cell texts per table row.
final_data_list=[]

In [33]:
# Build the list from scratch inside this cell so that re-running the cell
# cannot append the same rows a second time.
# Each row becomes the list of its stripped <td> texts; a row without any
# <td> cell becomes an empty list.
final_data_list=[
    [each_col.get_text(strip=True) for each_col in rows.find_all('td')] #td for data element
    for rows in all_rows
]

In [38]:
# Turn the scraped rows into a DataFrame, drop every row that has a missing
# cell, and replace the positional column indices 0-5 with readable labels.
# NOTE(review): 'cases_per_200k' looks like a typo for a per-100k figure, and
# the labels may not match the column layout the page serves today - confirm
# against the scraped table before relying on them.
pd_daily_status=(
    pd.DataFrame(final_data_list)
    .dropna()
    .rename(columns={
        0:'state',
        1:'cases',
        2:'changes',
        3:'cases_per_200k',
        4:'fatal',
        5:'comment',
    })
)

In [39]:
# Preview the first rows of the scraped table.
pd_daily_status.head()

Unnamed: 0,state,cases,changes,cases_per_200k,fatal,comment
2,Baden-Württem­berg,3.845.449,14.598,57.643,5192,16.323
3,Bayern,5.138.327,19.63,75.649,5757,24.379
4,Berlin,1.099.906,3.194,15.739,4295,4.641
5,Branden­burg,821.701,2.609,10.358,4092,5.716
6,Bremen,216.094,1.02,5.303,7797,790.0


# REST API calls

In [40]:
## data request for Germany
# Query the ArcGIS feature service "Coronafälle in den Bundesländern" for all
# records (where=1=1) with all fields (outFields=*) as JSON.
# outSR is the well-known ID of the output spatial reference: 4326 is WGS84,
# whereas the previous value 432 is not that ID.
# The timeout and raise_for_status() make network and HTTP failures surface
# in this cell rather than as a parsing error later on.
data=requests.get(
    'https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronaf%C3%A4lle_in_den_Bundesl%C3%A4ndern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json',
    timeout=30,
)
data.raise_for_status()

In [41]:
# Decode the response body into plain Python objects.
json_object=json.loads(data.content)
# ArcGIS REST services can answer a failed query with a JSON body that carries
# an "error" member instead of "features"; stop here with the server's message
# rather than hitting a KeyError when the features are read.
if 'error' in json_object:
    raise RuntimeError(f"ArcGIS query failed: {json_object['error']}")

In [42]:
# Check which Python type the decoded JSON document has.
type(json_object)

dict

In [43]:
# List the top-level keys of the decoded response.
json_object.keys()

dict_keys(['objectIdFieldName', 'uniqueIdField', 'globalIdFieldName', 'geometryProperties', 'geometryType', 'spatialReference', 'fields', 'features'])

In [44]:
# Keep only the 'attributes' dict of every feature in the response.
# A comprehension replaces the append loop; the enumerate index was never
# used and the [:] slice only made a needless copy of the list.
full_list=[each_dict['attributes'] for each_dict in json_object['features']]
    

In [45]:
# Build a DataFrame from the list of attribute dicts and preview the first rows.
pd_full_list=pd.DataFrame(full_list)
pd_full_list.head()

Unnamed: 0,OBJECTID_1,LAN_ew_AGS,LAN_ew_GEN,LAN_ew_BEZ,LAN_ew_EWZ,OBJECTID,Fallzahl,Aktualisierung,AGS_TXT,GlobalID,faelle_100000_EW,Shape__Area,Shape__Length,Death,cases7_bl_per_100k,cases7_bl,death7_bl,cases7_bl_per_100k_txt,AdmUnitId
0,1,1,Schleswig-Holstein,Land,2910875,15,830254,1656453600000,1,fc5ba936-c95c-432c-8a33-9eb2f30b660f,28522.488942,45737310000.0,2881496.0,2606,989.118392,28792,4,9891,1
1,2,2,Hamburg,Freie und Hansestadt,1852478,6,630954,1656453600000,2,0f3e860c-5181-4d3f-a421-1d51f50315ea,34059.999633,2089396000.0,418800.2,2723,687.295612,12732,3,6873,2
2,3,3,Niedersachsen,Land,8003421,9,2613059,1656453600000,3,3fd77024-c29b-4843-9be8-682ad48e60c9,32649.275853,129983600000.0,4008988.0,9678,917.770039,73453,6,9178,3
3,4,4,Bremen,Freie Hansestadt,680130,5,216094,1656453600000,4,4132268b-54de-4327-ac1e-760e915112f1,31772.455266,1119157000.0,335717.7,790,779.70388,5303,2,7797,4
4,5,5,Nordrhein-Westfalen,Land,17925570,10,5715015,1656453600000,5,561d658f-3ee5-46e3-bc95-3528c6558ab9,31881.915052,87829360000.0,2648673.0,25737,774.039542,138751,12,7740,5
