# 2020-nCoV Global Cases by Wooil Jeong

- **Wooil Jeong**  
[Dashboard by WooilJeong](https://plot.ly/dashboard/coronavirus:34/present#/)  
[Blog](https://wooiljeong.github.io/etc/corona_dash/)  
[Github Repository](https://github.com/WooilJeong/novel_coronavirus)  


- **Novel Coronavirus (2019-nCoV) Cases, provided by JHU CSSE**  
[Dashboard by JHU CSSE](https://gisanddata.maps.arcgis.com/apps/opsdashboard/index.html#/bda7594740fd40299423467b48e9ecf6)  
[Old Data Sheets](https://docs.google.com/spreadsheets/d/1yZv9w9zRKwrGTaR-YzmAqMefw4wMlaXocejdxZaTs6w/htmlview?usp=sharing&sle=true#)  
[New Google Sheet link (supports comments)](https://docs.google.com/spreadsheets/d/1wQVypefm946ch4XDp37uZ-wartW4V7ILdg-qYiDXUHM/edit?usp=sharing)  
[Time series Google Sheet](https://docs.google.com/spreadsheets/d/1UF2pSkFTURko2OvfHWWlFpDFAr1UxCBA4JLwlSP6KFo/edit?usp=sharing)  


- **Contact**  
Email: wooil@kakao.com  

## Dataset Pipeline

In [1]:
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# --- Configuration -----------------------------------------------------------
# OAuth scopes requested for the service account.
scope = [
    'https://spreadsheets.google.com/feeds',
    'https://www.googleapis.com/auth/drive',
]
# Service-account key file, resolved relative to the working directory.
json_file_name = 'gspread-266617-7512230df225.json'
# The spreadsheet opened below (same document as the "Time series google sheet" link).
spreadsheet_url = 'https://docs.google.com/spreadsheets/d/1UF2pSkFTURko2OvfHWWlFpDFAr1UxCBA4JLwlSP6KFo/htmlview?usp=sharing&sle=true#'

# --- Authorise and open the document -----------------------------------------
credentials = ServiceAccountCredentials.from_json_keyfile_name(
    json_file_name,
    scope,
)
gc = gspread.authorize(credentials)
doc = gc.open_by_url(spreadsheet_url)

In [3]:
# Collect every worksheet of the opened document and remember the titles;
# the titles drive the loading and reshaping loops further down.
sheet_list = doc.worksheets()
sheet_nm = [worksheet.title for worksheet in sheet_list]

print('sheets number :', len(sheet_list))
print(sheet_nm)

sheets number : 3
['Confirmed', 'Recovered', 'Death']


In [4]:
# Load each worksheet into a DataFrame. Every frame is published as a global
# variable named after its sheet title (later cells look it up via globals())
# and is also appended to df_list.
df_list = []
for sheet_title in sheet_nm:
    print(sheet_title)
    data = doc.worksheet(sheet_title).get_all_values()

    # The first row holds the column headers; the remaining rows are the records.
    frame = pd.DataFrame(data[1:], columns=data[0])

    # Drop the third column (positional index 2).
    # NOTE(review): the column's name is not visible here; confirm it is not needed downstream.
    del frame[frame.columns[2]]

    globals()[sheet_title] = frame
    df_list.append(frame)

Confirmed
Recovered
Death


## Reshape Dataset

In [5]:
import pandas as pd

# Columns that identify a location; every other column is treated as one
# timestamped observation and is unpivoted into rows.
id_vars = ['Province/State', 'Country/Region', 'Lat', 'Long']

for sheet_title in sheet_nm:
    # Wide -> long: one row per (location, 'Last Update') pair, with the value
    # stored in a column named after the sheet.
    long_frame = pd.melt(
        globals()[sheet_title],
        id_vars=id_vars,
        var_name='Last Update',
        value_name=sheet_title,
    )
    # 'Last Update' still holds the raw header strings here, so this sort is
    # lexicographic rather than chronological.
    long_frame = long_frame.sort_values('Last Update', ascending=True)
    long_frame = long_frame.reset_index(drop=True)
    globals()['df_' + sheet_title] = long_frame

# Left-join deaths and recoveries onto the confirmed cases. No `on=` is given,
# so pandas joins on every column the two frames have in common.
df = pd.merge(df_Confirmed, df_Death, how='left')
df = pd.merge(df, df_Recovered, how='left')
df

Unnamed: 0,Province/State,Country/Region,Lat,Long,Last Update,Confirmed,Death,Recovered
0,Anhui,Mainland China,31.82571,117.2264,1/21/2020 10:00 PM,,,
1,Washington,US,47.7511,-120.74,1/21/2020 10:00 PM,1,,
2,Illinois,US,40.6331,-89.3985,1/21/2020 10:00 PM,,,
3,California,US,36.7783,-119.418,1/21/2020 10:00 PM,,,
4,Arizona,US,34.0489,-111.094,1/21/2020 10:00 PM,,,
...,...,...,...,...,...,...,...,...
1555,Tibet,Mainland China,30.1534,88.7879,2/2/2020 9:00 PM,1,,
1556,Xinjiang,Mainland China,41.11981,85.17822,2/2/2020 9:00 PM,24,,
1557,Yunnan,Mainland China,24.97411,101.4868,2/2/2020 9:00 PM,105,,3
1558,Inner Mongolia,Mainland China,44.09448,113.9456,2/2/2020 9:00 PM,27,,1


## Pre-Processing

### Date type

In [6]:
import datetime

# Rewrite every 'Last Update' string as "YYYY-MM-DD HH:MM".
# Stamps carrying an AM/PM marker are parsed as 12-hour clock values; all
# others are parsed as 24-hour clock values.
# Note: the output format matches neither input format, so running this cell
# a second time on already-converted values raises ValueError.
date_list = []
for raw_stamp in df['Last Update']:
    has_meridiem = 'AM' in raw_stamp or 'PM' in raw_stamp
    if has_meridiem:
        input_format = "%m/%d/%Y %I:%M %p"
    else:
        input_format = "%m/%d/%Y %H:%M"

    parsed = datetime.datetime.strptime(raw_stamp, input_format)
    date_list.append(parsed.strftime("%Y-%m-%d %H:%M"))

df['Last Update'] = date_list

In [7]:
# Preview the first rows to check the rewritten 'Last Update' values.
df.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,Last Update,Confirmed,Death,Recovered
0,Anhui,Mainland China,31.82571,117.2264,2020-01-21 22:00,,,
1,Washington,US,47.7511,-120.74,2020-01-21 22:00,1.0,,
2,Illinois,US,40.6331,-89.3985,2020-01-21 22:00,,,
3,California,US,36.7783,-119.418,2020-01-21 22:00,,,
4,Arizona,US,34.0489,-111.094,2020-01-21 22:00,,,


In [8]:
# Replace empty strings: a blank province becomes the literal label 'None',
# and blank counts become 0.
df['Province/State'] = df['Province/State'].apply(lambda x: 'None' if x == "" else x)
count_cols = ['Confirmed', 'Death', 'Recovered']
for col in count_cols:
    df[col] = df[col].apply(lambda x: 0 if x == "" else x)

# ETC: a 'Confirmed' cell holding only a backtick is treated as 0.
df['Confirmed'] = df['Confirmed'].apply(lambda x: 0 if x == "`" else x)

# Data type conversion
for col in ['Lat', 'Long'] + count_cols:
    df[col] = pd.to_numeric(df[col])
df['Last Update'] = pd.to_datetime(df['Last Update'])

# Feature Engineering: deaths and recoveries as a percentage of confirmed cases.
# A zero denominator would produce inf (or NaN for 0/0); fillna() below only
# removes NaN, so rows with zero confirmed cases are set to 0 explicitly to
# keep inf out of the saved dataset.
nonzero_confirmed = df['Confirmed'] != 0
df['D/C'] = ((df['Death'] / df['Confirmed']) * 100).where(nonzero_confirmed, 0)
df['R/C'] = ((df['Recovered'] / df['Confirmed']) * 100).where(nonzero_confirmed, 0)

# Fill Na with Zeros (covers counts left missing by the left merges)
df = df.fillna(0)

## Save Dataset

In [9]:
import os

# Make sure the output directory exists before the dataset is written.
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the gap between a separate existence check and the creation call.
os.makedirs('Data', exist_ok=True)

In [10]:
# Write the cleaned frame to Data/Dataset.csv as UTF-8, without the index column.
df.to_csv('Data/Dataset.csv',index=False,encoding='utf-8')