In [35]:
# Import lib
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go


### 1. Load data

In [36]:
# load data
df_data= pd.read_csv('../dataset/covid_jpn_total.csv')
df_data.head()

Unnamed: 0,Date,Location,Positive,Tested,Symptomatic,Asymptomatic,Sym-unknown,Hosp_require,Hosp_mild,Hosp_severe,Hosp_unknown,Hosp_waiting,Discharged,Fatal,Vaccinated_1st,Vaccinated_2nd
0,2020-02-06,Domestic,16,132.0,16.0,0.0,0.0,,9.0,0,3.0,0.0,4,0,,
1,2020-02-06,Returnee,9,566.0,5.0,4.0,0.0,,3.0,0,2.0,0.0,0,0,,
2,2020-02-07,Domestic,16,151.0,16.0,0.0,0.0,,12.0,0,0.0,0.0,4,0,,
3,2020-02-07,Returnee,9,566.0,6.0,3.0,0.0,,4.0,0,2.0,0.0,0,0,,
4,2020-02-10,Domestic,16,174.0,16.0,0.0,0.0,,7.0,0,0.0,0.0,9,0,,


In [37]:
# check info
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1654 entries, 0 to 1653
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Date            1654 non-null   object 
 1   Location        1654 non-null   object 
 2   Positive        1654 non-null   int64  
 3   Tested          1638 non-null   float64
 4   Symptomatic     241 non-null    float64
 5   Asymptomatic    241 non-null    float64
 6   Sym-unknown     241 non-null    float64
 7   Hosp_require    1413 non-null   float64
 8   Hosp_mild       241 non-null    float64
 9   Hosp_severe     1654 non-null   int64  
 10  Hosp_unknown    241 non-null    float64
 11  Hosp_waiting    241 non-null    float64
 12  Discharged      1654 non-null   int64  
 13  Fatal           1654 non-null   int64  
 14  Vaccinated_1st  561 non-null    float64
 15  Vaccinated_2nd  561 non-null    float64
dtypes: float64(10), int64(4), object(2)
memory usage: 206.9+ KB


### 2. Data cleaning

In [38]:
# copy original data
df_clean= df_data.copy()

In [39]:
# remove rows which have all values are null or missing
df_clean.dropna(how= 'all', inplace= True)

In [40]:
# remove all features which have missing values > 50%
df_clean.drop(['Symptomatic', 'Asymptomatic', 'Sym-unknown', 'Hosp_mild', 'Hosp_unknown', 'Hosp_waiting', 'Vaccinated_1st', 'Vaccinated_2nd'], axis= 1, inplace= True)
df_clean.head()

Unnamed: 0,Date,Location,Positive,Tested,Hosp_require,Hosp_severe,Discharged,Fatal
0,2020-02-06,Domestic,16,132.0,,0,4,0
1,2020-02-06,Returnee,9,566.0,,0,0,0
2,2020-02-07,Domestic,16,151.0,,0,4,0
3,2020-02-07,Returnee,9,566.0,,0,0,0
4,2020-02-10,Domestic,16,174.0,,0,9,0


In [41]:
# rename some features for meaningful
df_clean= df_clean.rename(columns= {'Fatal': 'Deaths', 'Hosp_require': 'Hospitalization', 'Hosp_severe': 'Severe'})
df_clean.head()

Unnamed: 0,Date,Location,Positive,Tested,Hospitalization,Severe,Discharged,Deaths
0,2020-02-06,Domestic,16,132.0,,0,4,0
1,2020-02-06,Returnee,9,566.0,,0,0,0
2,2020-02-07,Domestic,16,151.0,,0,4,0
3,2020-02-07,Returnee,9,566.0,,0,0,0
4,2020-02-10,Domestic,16,174.0,,0,9,0


In [42]:
# convert date column to datetime type
df_clean['Date']= pd.to_datetime(df_clean['Date'])
df_clean.head()

Unnamed: 0,Date,Location,Positive,Tested,Hospitalization,Severe,Discharged,Deaths
0,2020-02-06,Domestic,16,132.0,,0,4,0
1,2020-02-06,Returnee,9,566.0,,0,0,0
2,2020-02-07,Domestic,16,151.0,,0,4,0
3,2020-02-07,Returnee,9,566.0,,0,0,0
4,2020-02-10,Domestic,16,174.0,,0,9,0


In [43]:
# resample data to date level (add date if it's miss)
df_clean= df_clean.groupby('Location').apply(
    lambda x: x.set_index('Date').resample('D').interpolate('linear')
)
df_clean

Unnamed: 0_level_0,Unnamed: 1_level_0,Location,Positive,Tested,Hospitalization,Severe,Discharged,Deaths
Location,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Airport,2020-03-05,Airport,1.0,,,0.0,0.0,0.0
Airport,2020-03-06,Airport,1.0,,,0.0,0.0,0.0
Airport,2020-03-07,Airport,1.0,,,0.0,0.0,0.0
Airport,2020-03-08,Airport,1.0,,,0.0,0.0,0.0
Airport,2020-03-09,Airport,1.0,,,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
Returnee,2021-08-18,Returnee,15.0,829.0,0.0,0.0,15.0,0.0
Returnee,2021-08-19,Returnee,15.0,829.0,0.0,0.0,15.0,0.0
Returnee,2021-08-20,Returnee,15.0,829.0,0.0,0.0,15.0,0.0
Returnee,2021-08-21,Returnee,15.0,829.0,0.0,0.0,15.0,0.0


In [44]:
# drop location column
df_clean = df_clean.drop('Location', axis=1).reset_index()
df_clean.head()

Unnamed: 0,Location,Date,Positive,Tested,Hospitalization,Severe,Discharged,Deaths
0,Airport,2020-03-05,1.0,,,0.0,0.0,0.0
1,Airport,2020-03-06,1.0,,,0.0,0.0,0.0
2,Airport,2020-03-07,1.0,,,0.0,0.0,0.0
3,Airport,2020-03-08,1.0,,,0.0,0.0,0.0
4,Airport,2020-03-09,1.0,,,0.0,0.0,0.0


In [45]:
# sort data by date
df_clean= df_clean.sort_values('Date', ascending= False).reset_index(drop= True)
df_clean.head()

Unnamed: 0,Location,Date,Positive,Tested,Hospitalization,Severe,Discharged,Deaths
0,Returnee,2021-08-22,15.0,829.0,0.0,0.0,15.0,0.0
1,Domestic,2021-08-22,1273652.0,19651418.0,203540.0,1891.0,1048617.0,15589.0
2,Airport,2021-08-22,3772.0,955051.0,176.0,0.0,3589.0,7.0
3,Returnee,2021-08-21,15.0,829.0,0.0,0.0,15.0,0.0
4,Domestic,2021-08-21,1248539.0,19538125.0,193355.0,1888.0,1033914.0,15556.0


In [46]:
# impute missing data by using interpolation method
unmis_cols= df_clean.columns.isin(['Location', 'Date'])
df_clean.loc[:, ~unmis_cols] = df_clean.loc[:, ~unmis_cols].interpolate('linear').astype(np.int64)
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1664 entries, 0 to 1663
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Location         1664 non-null   object        
 1   Date             1664 non-null   datetime64[ns]
 2   Positive         1664 non-null   int64         
 3   Tested           1664 non-null   int64         
 4   Hospitalization  1664 non-null   int64         
 5   Severe           1664 non-null   int64         
 6   Discharged       1664 non-null   int64         
 7   Deaths           1664 non-null   int64         
dtypes: datetime64[ns](1), int64(6), object(1)
memory usage: 104.1+ KB


In [47]:
# create a new feature by ratio between positive and tested
df_clean['Pos_Tes_Ratio']= df_clean['Positive']/ df_clean['Tested']

# remove test feature
df_clean.drop(['Tested'], axis=1, inplace= True)
df_clean.head()

Unnamed: 0,Location,Date,Positive,Hospitalization,Severe,Discharged,Deaths,Pos_Tes_Ratio
0,Returnee,2021-08-22,15,0,0,15,0,0.018094
1,Domestic,2021-08-22,1273652,203540,1891,1048617,15589,0.064812
2,Airport,2021-08-22,3772,176,0,3589,7,0.00395
3,Returnee,2021-08-21,15,0,0,15,0,0.018094
4,Domestic,2021-08-21,1248539,193355,1888,1033914,15556,0.063903


### 3.Exploratory Data Analysis(EDA)

#### 3.1 Compare the change about positive case among 3 locations

In [48]:
df_pos= df_clean.pivot_table(
    index= 'Date', columns= 'Location', values= 'Positive', aggfunc= 'last'
)
df_pos.head()

Location,Airport,Domestic,Returnee
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-02-06,,16.0,9.0
2020-02-07,,16.0,9.0
2020-02-08,,16.0,9.0
2020-02-09,,16.0,9.0
2020-02-10,,16.0,10.0


In [49]:
# draw line chart 
fig= px.line(
    df_pos,
    y= ['Airport', 'Domestic', 'Returnee']
)
fig.show()

In [50]:
# remove info in 13/08/2021
df_pos.drop(['2021-08-13'], inplace= True)

# draw line chart 
fig= px.line(
    df_pos,
    y= ['Airport', 'Domestic', 'Returnee'],
    labels= {'value': 'Number of positive people', 'variable': 'Locations'}
)
fig.show()

In [51]:
# draw line chart for only Airport and returnee
fig= px.line(
    df_pos,
    y= ['Airport', 'Returnee'],
    labels= {'value': 'Number of positive people', 'variable': 'Locations'}
)
fig.show()

#### 3.2 Compare the change about number of death people in 3 locations

In [52]:
df_death= df_clean.pivot_table(
    index= 'Date', columns= 'Location', values= 'Deaths', aggfunc= 'last'
)
df_death.head()

Location,Airport,Domestic,Returnee
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-02-06,,0.0,0.0
2020-02-07,,0.0,0.0
2020-02-08,,0.0,0.0
2020-02-09,,0.0,0.0
2020-02-10,,0.0,0.0


In [53]:
# draw line chart 
fig= px.line(
    df_death,
    y= ['Airport', 'Domestic', 'Returnee'],
    labels= {'value': 'Number of positive people', 'variable': 'Locations'}
)
fig.show()

#### 3.3 Detail Analytics for domestic location

In [68]:
# select data
df_dom= df_clean[df_clean['Location'] == 'Domestic'].drop(['Location'], axis= 1)
df_dom.head()

Unnamed: 0,Date,Positive,Hospitalization,Severe,Discharged,Deaths,Pos_Tes_Ratio
1,2021-08-22,1273652,203540,1891,1048617,15589,0.064812
4,2021-08-21,1248539,193355,1888,1033914,15556,0.063903
8,2021-08-20,1223602,181106,1816,1020747,15527,0.06272
10,2021-08-19,1198283,168699,1765,1008309,15494,0.062163
14,2021-08-18,1175455,161699,1716,993758,15460,0.061397


In [69]:
# reset index
df_dom.reset_index(drop= True, inplace= True)
df_dom.set_index('Date', inplace= True)
df_dom.head()

Unnamed: 0_level_0,Positive,Hospitalization,Severe,Discharged,Deaths,Pos_Tes_Ratio
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-08-22,1273652,203540,1891,1048617,15589,0.064812
2021-08-21,1248539,193355,1888,1033914,15556,0.063903
2021-08-20,1223602,181106,1816,1020747,15527,0.06272
2021-08-19,1198283,168699,1765,1008309,15494,0.062163
2021-08-18,1175455,161699,1716,993758,15460,0.061397


In [70]:
# remove 
df_dom.drop(['2021-08-13'], inplace=True)

In [57]:
# compare all infomation 
fig= px.line(
    df_dom,
    y= df_dom.columns.to_list(),
    labels= {'value': 'Number of people', 'variable': ''}
)
fig.show()

In [71]:
# !pip install bar_chart_race
# draw chart in animation mode
import bar_chart_race as bcr

# sort value by date
df_dom.sort_values(['Date'], ascending= True)
df_dom.head()

Unnamed: 0_level_0,Positive,Hospitalization,Severe,Discharged,Deaths,Pos_Tes_Ratio
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-08-22,1273652,203540,1891,1048617,15589,0.064812
2021-08-21,1248539,193355,1888,1033914,15556,0.063903
2021-08-20,1223602,181106,1816,1020747,15527,0.06272
2021-08-19,1198283,168699,1765,1008309,15494,0.062163
2021-08-18,1175455,161699,1716,993758,15460,0.061397


In [59]:
# !pip install ffmpeg

In [61]:
# draw bar chart for death people
fig= px.bar(
    df_dom,
    y= 'Deaths',
    color= 'Deaths',
    title= 'Number of Death people',
    labels= {'Death': 'Number of death people'}
)
fig.show()

In [62]:
# Draw bar chart for Test column
# create a tested column
df_dom['Tested']= df_dom['Positive'] / df_dom['Pos_Tes_Ratio']

fig= px.bar(
    df_dom,
    y= 'Tested',
    color= 'Tested',
    title= 'Number of Tested people',
    labels= {'Tested': 'Number of Tested people'}
)
fig.show()

In [80]:
# get data in lastest day
df_latest= df_dom.head(1)
df_latest.drop(['Pos_Tes_Ratio'], axis=1, inplace= True)
df_latest



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0_level_0,Positive,Hospitalization,Severe,Discharged,Deaths
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-08-22,1273652,203540,1891,1048617,15589


In [81]:
# compare percentage of positive, Hospitalization, discharge, severe, death people
fig= px.pie(
    df_latest,
    names= df_latest.columns.to_list(),
    values= np.array(df_latest)[0],
    title= 'Current situation in JP 2021-08-22'
)
fig.show()

In [83]:
# Draw pie with pull-out function
fig= go.Figure(
    data= [
        go.Pie(
            labels= df_latest.columns.to_list(),
            values= np.array(df_latest)[0],
            pull= [0, 0.5, 0, 0, 0],
            title= 'Current situation in JP 2021-08-22'
        )
    ]
)
fig.show()

In [73]:
# reset index and transpose data
df_latest= df_latest.reset_index(drop= True)
df_latest= pd.DataFrame(
    df_latest.T.values, 
    columns= ['Cases'],
    index= ['Positive', 'Hospitalization', 'Severe', 'Deaths', 'Discharged']
    )
df_latest.sort_values(['Cases'], ascending= False, inplace= True)
df_latest.style.background_gradient(cmap= 'plasma_r')


Unnamed: 0,Cases
Positive,1273652
Deaths,1048617
Hospitalization,203540
Discharged,15589
Severe,1891


In [78]:
# draw bar chart for df_latest data
fig= px.bar(
    df_latest,
    y= 'Cases',
    color= 'Cases',
    title= 'Infomation in latest day',
    labels= {'Cases': 'Number of people'}
)
fig.show()
