In [73]:
# Import lib
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

1.Load data

In [74]:
df_data= pd.read_csv('covid_jpn_total.csv')
df_data.head()

Unnamed: 0,Date,Location,Positive,Tested,Symptomatic,Asymptomatic,Sym-unknown,Hosp_require,Hosp_mild,Hosp_severe,Hosp_unknown,Hosp_waiting,Discharged,Fatal,Vaccinated_1st,Vaccinated_2nd
0,2020-02-06,Domestic,16,132.0,16.0,0.0,0.0,,9.0,0,3.0,0.0,4,0,,
1,2020-02-06,Returnee,9,566.0,5.0,4.0,0.0,,3.0,0,2.0,0.0,0,0,,
2,2020-02-07,Domestic,16,151.0,16.0,0.0,0.0,,12.0,0,0.0,0.0,4,0,,
3,2020-02-07,Returnee,9,566.0,6.0,3.0,0.0,,4.0,0,2.0,0.0,0,0,,
4,2020-02-10,Domestic,16,174.0,16.0,0.0,0.0,,7.0,0,0.0,0.0,9,0,,


2. Data cleaning

In [75]:
# copy original data
df_clean= df_data.copy()

In [76]:
# remove rows which have all values are null or missing
df_clean.dropna(how= 'all', inplace= True)

In [77]:
# remove all features which have missing values > 50%
df_clean.drop(['Symptomatic', 'Asymptomatic', 'Sym-unknown', 'Hosp_mild', 'Hosp_unknown', 'Hosp_waiting', 'Vaccinated_1st', 'Vaccinated_2nd'], axis= 1, inplace= True)
df_clean.head()

Unnamed: 0,Date,Location,Positive,Tested,Hosp_require,Hosp_severe,Discharged,Fatal
0,2020-02-06,Domestic,16,132.0,,0,4,0
1,2020-02-06,Returnee,9,566.0,,0,0,0
2,2020-02-07,Domestic,16,151.0,,0,4,0
3,2020-02-07,Returnee,9,566.0,,0,0,0
4,2020-02-10,Domestic,16,174.0,,0,9,0


In [78]:
# rename some features for meaningful
df_clean= df_clean.rename(columns= {'Fatal': 'Deaths', 'Hosp_require': 'Hospitalization', 'Hosp_severe': 'Severe'})
df_clean.head()

Unnamed: 0,Date,Location,Positive,Tested,Hospitalization,Severe,Discharged,Deaths
0,2020-02-06,Domestic,16,132.0,,0,4,0
1,2020-02-06,Returnee,9,566.0,,0,0,0
2,2020-02-07,Domestic,16,151.0,,0,4,0
3,2020-02-07,Returnee,9,566.0,,0,0,0
4,2020-02-10,Domestic,16,174.0,,0,9,0


In [79]:
# convert date column to datetime type
df_clean['Date']= pd.to_datetime(df_clean['Date'])
df_clean.head()

Unnamed: 0,Date,Location,Positive,Tested,Hospitalization,Severe,Discharged,Deaths
0,2020-02-06,Domestic,16,132.0,,0,4,0
1,2020-02-06,Returnee,9,566.0,,0,0,0
2,2020-02-07,Domestic,16,151.0,,0,4,0
3,2020-02-07,Returnee,9,566.0,,0,0,0
4,2020-02-10,Domestic,16,174.0,,0,9,0


In [80]:
# resample data to date level (add date if it's miss)
df_clean= df_clean.groupby('Location').apply(
    lambda x: x.set_index('Date').resample('D').interpolate('linear')
)
df_clean

Unnamed: 0_level_0,Unnamed: 1_level_0,Location,Positive,Tested,Hospitalization,Severe,Discharged,Deaths
Location,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Airport,2020-03-05,Airport,1.0,,,0.0,0.0,0.0
Airport,2020-03-06,Airport,1.0,,,0.0,0.0,0.0
Airport,2020-03-07,Airport,1.0,,,0.0,0.0,0.0
Airport,2020-03-08,Airport,1.0,,,0.0,0.0,0.0
Airport,2020-03-09,Airport,1.0,,,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
Returnee,2021-08-18,Returnee,15.0,829.0,0.0,0.0,15.0,0.0
Returnee,2021-08-19,Returnee,15.0,829.0,0.0,0.0,15.0,0.0
Returnee,2021-08-20,Returnee,15.0,829.0,0.0,0.0,15.0,0.0
Returnee,2021-08-21,Returnee,15.0,829.0,0.0,0.0,15.0,0.0


In [81]:
# drop location column
df_clean = df_clean.drop('Location', axis=1).reset_index()
df_clean.head()

Unnamed: 0,Location,Date,Positive,Tested,Hospitalization,Severe,Discharged,Deaths
0,Airport,2020-03-05,1.0,,,0.0,0.0,0.0
1,Airport,2020-03-06,1.0,,,0.0,0.0,0.0
2,Airport,2020-03-07,1.0,,,0.0,0.0,0.0
3,Airport,2020-03-08,1.0,,,0.0,0.0,0.0
4,Airport,2020-03-09,1.0,,,0.0,0.0,0.0


In [82]:
# sort data by date
df_clean= df_clean.sort_values('Date', ascending= False).reset_index(drop= True)
df_clean.head()

Unnamed: 0,Location,Date,Positive,Tested,Hospitalization,Severe,Discharged,Deaths
0,Returnee,2021-08-22,15.0,829.0,0.0,0.0,15.0,0.0
1,Domestic,2021-08-22,1273652.0,19651418.0,203540.0,1891.0,1048617.0,15589.0
2,Airport,2021-08-22,3772.0,955051.0,176.0,0.0,3589.0,7.0
3,Returnee,2021-08-21,15.0,829.0,0.0,0.0,15.0,0.0
4,Domestic,2021-08-21,1248539.0,19538125.0,193355.0,1888.0,1033914.0,15556.0


In [83]:
# impute missing data by using interpolation method
unmis_cols= df_clean.columns.isin(['Location', 'Date'])
df_clean.loc[:, ~unmis_cols] = df_clean.loc[:, ~unmis_cols].interpolate('linear').astype(np.int64)
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1664 entries, 0 to 1663
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Location         1664 non-null   object        
 1   Date             1664 non-null   datetime64[ns]
 2   Positive         1664 non-null   int64         
 3   Tested           1664 non-null   int64         
 4   Hospitalization  1664 non-null   int64         
 5   Severe           1664 non-null   int64         
 6   Discharged       1664 non-null   int64         
 7   Deaths           1664 non-null   int64         
dtypes: datetime64[ns](1), int64(6), object(1)
memory usage: 104.1+ KB


3. Detail Analytics for Domestic Location

In [84]:
# select data
df_dom= df_clean[df_clean['Location'] == 'Domestic'].drop(['Location'], axis= 1)
df_dom.head()

Unnamed: 0,Date,Positive,Tested,Hospitalization,Severe,Discharged,Deaths
1,2021-08-22,1273652,19651418,203540,1891,1048617,15589
4,2021-08-21,1248539,19538125,193355,1888,1033914,15556
8,2021-08-20,1223602,19508897,181106,1816,1020747,15527
10,2021-08-19,1198283,19276531,168699,1765,1008309,15494
14,2021-08-18,1175455,19145040,161699,1716,993758,15460


In [85]:
# reset index
df_dom.reset_index(drop= True, inplace= True)
df_dom.set_index('Date', inplace= True)
df_dom.head(10)

Unnamed: 0_level_0,Positive,Tested,Hospitalization,Severe,Discharged,Deaths
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-08-22,1273652,19651418,203540,1891,1048617,15589
2021-08-21,1248539,19538125,193355,1888,1033914,15556
2021-08-20,1223602,19508897,181106,1816,1020747,15527
2021-08-19,1198283,19276531,168699,1765,1008309,15494
2021-08-18,1175455,19145040,161699,1716,993758,15460
2021-08-17,1156228,19026968,157758,1646,979128,15424
2021-08-16,1140857,18922794,156406,1603,964911,15401
2021-08-15,1124700,18859952,153626,1563,951787,15393
2021-08-14,1104601,18778497,145674,1521,939796,15376
2021-08-13,10884120,18643957,137246,1478,927344,15251


In [86]:
# remove 
df_dom.drop(['2021-08-13'], inplace=True)

In [87]:
# Draw bar chart for Tested column
fig= px.bar(
    df_dom,
    y= 'Tested',
    color= 'Tested',
    title= 'Number of Tested people',
    labels= {'Tested': 'Number of Tested people'}
)
fig.show()

In [88]:
# get data in lastest day
df_latest= df_dom.head(1)
df_latest

Unnamed: 0_level_0,Positive,Tested,Hospitalization,Severe,Discharged,Deaths
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-08-22,1273652,19651418,203540,1891,1048617,15589


In [89]:
# Draw pie with pull-out function
fig= go.Figure(
    data= [
        go.Pie(
            labels= df_latest.columns.to_list(),
            values= np.array(df_latest)[0],
            pull= [0.3,0],
            title= 'Current situation in JP 2021-08-22'
        )
    ]
)
fig.show()

In [90]:
# reset index and transpose data
df_latest.drop(['Tested'], axis=1, inplace= True)
df_latest= df_latest.reset_index(drop= True)
df_latest= pd.DataFrame(
    df_latest.T.values, 
    columns= ['Cases'],
    index= ['Positive', 'Hospitalization', 'Severe', 'Deaths', 'Discharged']
    )
df_latest.sort_values(['Cases'], ascending= False, inplace= True)
df_latest.style.background_gradient(cmap= 'plasma_r')



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Cases
Positive,1273652
Deaths,1048617
Hospitalization,203540
Discharged,15589
Severe,1891


In [91]:
# draw bar chart for df_latest data
fig= px.bar(
    df_latest,
    y= 'Cases',
    color= 'Cases',
    title= 'Infomation in latest day',
    labels= {'Cases': 'Number of people'}
)
fig.show()