In [55]:
# Python >= 3.5 is required
import re
import sys
assert sys.version_info >= (3, 5)
# scikit-learn >= 0.20 is required.
# Compare the numeric (major, minor) pair rather than the raw version string:
# string comparison is lexicographic, so e.g. '0.9' >= '0.20' would wrongly pass.
import sklearn
_sk_major_minor = tuple(map(int, re.match(r'(\d+)\.(\d+)', sklearn.__version__).groups()))
assert _sk_major_minor >= (0, 20)

# Common imports
import numpy as np
import os

# Seed NumPy's global RNG so this notebook's output is stable across runs
np.random.seed(42)

# Plot defaults: label sizes for the axes and for the x/y ticks
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Where figures are saved: <ROOT>/images/<CHAPTER_ID> (created if missing)
ROOT = '.'
CHAPTER_ID = 'svm'
IMAGES_PATH = os.path.join(ROOT, 'images', CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension='png', resolution=300):
    """Save the current matplotlib figure into IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        Base file name (without extension); also echoed in the progress message.
    tight_layout : bool
        When true, call ``plt.tight_layout()`` before saving.
    fig_extension : str
        File extension, also passed to ``plt.savefig`` as the output format.
    resolution : int
        Passed to ``plt.savefig`` as ``dpi``.

    Note: this saves whatever pyplot considers the current figure; figures
    produced by other plotting libraries are not captured by it.
    """
    filename = '.'.join((fig_id, fig_extension))
    target = os.path.join(IMAGES_PATH, filename)
    print('saving images figure', fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

In [56]:
# Load the raw and the transformed CSV extracts
# (paths are relative to the notebook's working directory)
import pandas as pd
df_raw = pd.read_csv('Data/raw_data.csv')
df_transformed = pd.read_csv('Data/transformed_data.csv')

In [57]:
# Display the raw frame to check that it loaded (the rendering below is truncated)
df_raw

Unnamed: 0,iso_code,location,date,total_cases,total_deaths,stringency_index,population,gdp_per_capita,human_development_index,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13
0,AFG,Afghanistan,2019-12-31,0.0,0.0,0.00,38928341,1803.987,0.498,#NUM!,#NUM!,#NUM!,17.477233,7.497754494
1,AFG,Afghanistan,2020-01-01,0.0,0.0,0.00,38928341,1803.987,0.498,#NUM!,#NUM!,#NUM!,17.477233,7.497754494
2,AFG,Afghanistan,2020-01-02,0.0,0.0,0.00,38928341,1803.987,0.498,#NUM!,#NUM!,#NUM!,17.477233,7.497754494
3,AFG,Afghanistan,2020-01-03,0.0,0.0,0.00,38928341,1803.987,0.498,#NUM!,#NUM!,#NUM!,17.477233,7.497754494
4,AFG,Afghanistan,2020-01-04,0.0,0.0,0.00,38928341,1803.987,0.498,#NUM!,#NUM!,#NUM!,17.477233,7.497754494
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50413,ZWE,Zimbabwe,2020-10-15,8055.0,231.0,76.85,14862927,1899.775,0.535,8.994048296,5.442417711,4.34185547,16.514381,7.549490737
50414,ZWE,Zimbabwe,2020-10-16,8075.0,231.0,76.85,14862927,1899.775,0.535,8.996528148,5.442417711,4.34185547,16.514381,7.549490737
50415,ZWE,Zimbabwe,2020-10-17,8099.0,231.0,76.85,14862927,1899.775,0.535,8.999495876,5.442417711,4.34185547,16.514381,7.549490737
50416,ZWE,Zimbabwe,2020-10-18,8110.0,231.0,76.85,14862927,1899.775,0.535,9.000853147,5.442417711,4.34185547,16.514381,7.549490737


In [58]:
# Display the transformed frame to check that it loaded (the rendering below is truncated)
df_transformed

Unnamed: 0,CODE,COUNTRY,DATE,HDI,TC,TD,STI,POP,GDPCAP
0,AFG,Afghanistan,2019-12-31,0.498,0.000000,0.000000,0.000000,17.477233,7.497754
1,AFG,Afghanistan,2020-01-01,0.498,0.000000,0.000000,0.000000,17.477233,7.497754
2,AFG,Afghanistan,2020-01-02,0.498,0.000000,0.000000,0.000000,17.477233,7.497754
3,AFG,Afghanistan,2020-01-03,0.498,0.000000,0.000000,0.000000,17.477233,7.497754
4,AFG,Afghanistan,2020-01-04,0.498,0.000000,0.000000,0.000000,17.477233,7.497754
...,...,...,...,...,...,...,...,...,...
50413,ZWE,Zimbabwe,2020-10-15,0.535,8.994048,5.442418,4.341855,16.514381,7.549491
50414,ZWE,Zimbabwe,2020-10-16,0.535,8.996528,5.442418,4.341855,16.514381,7.549491
50415,ZWE,Zimbabwe,2020-10-17,0.535,8.999496,5.442418,4.341855,16.514381,7.549491
50416,ZWE,Zimbabwe,2020-10-18,0.535,9.000853,5.442418,4.341855,16.514381,7.549491


In [59]:
# Most frequent per-country row count: the mode of the value counts
(
    df_transformed['COUNTRY']
    .value_counts()
    .mode()
)

0    294
Name: COUNTRY, dtype: int64

In [60]:
# Number of rows recorded for each country
(
    df_transformed['COUNTRY']
    .value_counts()
)

Afghanistan        294
Indonesia          294
Macedonia          294
Luxembourg         294
Lithuania          294
                  ... 
Tajikistan         172
Comoros            171
Lesotho            158
Hong Kong           51
Solomon Islands      4
Name: COUNTRY, Length: 210, dtype: int64

Dapat dilihat bahwa jumlah baris yang paling sering muncul untuk satu negara adalah 294; jadi jika jumlah baris suatu negara kurang dari itu, berarti ada data yang hilang (null value).

In [61]:
# Feature engineering: per-country containers used to build the aggregate table.
# Codes, country names and population start from the unique values found in
# df_transformed; the remaining lists start empty.
code = df_transformed['CODE'].unique().tolist()
country = df_transformed['COUNTRY'].unique().tolist()
population = df_transformed['POP'].unique().tolist()
hdi, tc, td, sti, gdp = [], [], [], [], []

In [62]:
for i in country:  # one pass per country
    # Row masks for this country in each frame, computed once and reused below.
    rows_transformed = df_transformed['COUNTRY'] == i
    rows_raw = df_raw['location'] == i

    # HDI and STI: column sum divided by 294 (the most common per-country row count).
    # NOTE(review): countries with fewer than 294 rows get a value lower than
    # their true mean with this fixed divisor -- confirm this is intended.
    hdi.append(df_transformed.loc[rows_transformed, 'HDI'].sum() / 294)
    sti.append(df_transformed.loc[rows_transformed, 'STI'].sum() / 294)

    # Cases and deaths: plain sum over the country's rows in the raw frame.
    tc.append(df_raw.loc[rows_raw, 'total_cases'].sum())
    td.append(df_raw.loc[rows_raw, 'total_deaths'].sum())

    # Fix: select this country's rows via 'location'. The previous filter compared
    # the numeric 'population' column with the country name, so it never matched
    # and always appended 0.0.
    # NOTE(review): `population` already holds the unique POP values when this
    # loop starts, so these values are appended after them -- confirm which set
    # of values the aggregate table is meant to use.
    population.append(df_raw.loc[rows_raw, 'population'].sum() / 294)
    

In [63]:
# One row per country. zip() stops at the shortest input list, so any surplus
# entries in a longer list are ignored.
aggregate_data = pd.DataFrame(
    list(zip(code, country, hdi, tc, td, sti, population)),
    columns=[
        "Country Code",
        "Country",
        "HDI",
        "Total Cases",
        "Total Deaths",
        "Stringency Index",
        "Population",
    ],
)

In [64]:
# Display the per-country aggregate table (the rendering below is truncated)
aggregate_data

Unnamed: 0,Country Code,Country,HDI,Total Cases,Total Deaths,Stringency Index,Population
0,AFG,Afghanistan,0.498000,5126433.0,165875.0,3.049673,17.477233
1,ALB,Albania,0.600765,1071951.0,31056.0,3.005624,14.872537
2,DZA,Algeria,0.754000,4893999.0,206429.0,3.195168,17.596309
3,AND,Andorra,0.659551,223576.0,9850.0,2.677654,11.254996
4,AGO,Angola,0.418952,304005.0,11820.0,2.965560,17.307957
...,...,...,...,...,...,...,...
205,VEN,Venezuela,0.566867,4839834.0,40840.0,3.235752,17.163165
206,VNM,Vietnam,0.694000,122618.0,2318.0,3.710868,18.393706
207,YEM,Yemen,0.296721,228925.0,64304.0,2.369568,17.210890
208,ZMB,Zambia,0.430000,1129913.0,26475.0,2.645104,16.726989


GDP per kapita belum dimasukkan karena fitur pada dataset yang menyatakan GDP per kapita belum dapat ditentukan.
Take the 10 countries with the highest number of Covid cases.

In [74]:
# Order countries from the highest to the lowest "Total Cases"
data = aggregate_data.sort_values('Total Cases', ascending=False)

In [75]:
# Preview the first rows of the sorted table
data.head()

Unnamed: 0,Country Code,Country,HDI,Total Cases,Total Deaths,Stringency Index,Population
200,USA,United States,0.924,746014098.0,26477574.0,3.350949,19.617637
27,BRA,Brazil,0.759,425704517.0,14340567.0,3.136028,19.174732
90,IND,India,0.64,407771615.0,7247327.0,3.610552,21.045353
157,RUS,Russia,0.816,132888951.0,2131571.0,3.380088,18.798668
150,PER,Peru,0.59949,74882695.0,3020038.0,3.430126,17.311165


In [80]:
# Distinct GDPCAP values recorded for the USA in the transformed frame
df_transformed.loc[df_transformed['CODE'] == 'USA', 'GDPCAP'].unique()

array([10.90090556])

In [81]:
# Distinct gdp_per_capita values recorded for the USA in the raw frame
df_raw.loc[df_raw['iso_code'] == 'USA', 'gdp_per_capita'].unique()

array([54225.446])

In [83]:
# Keep only the first ten rows of the sorted table, then display them
data = data.iloc[:10]
data

Unnamed: 0,Country Code,Country,HDI,Total Cases,Total Deaths,Stringency Index,Population
200,USA,United States,0.924,746014098.0,26477574.0,3.350949,19.617637
27,BRA,Brazil,0.759,425704517.0,14340567.0,3.136028,19.174732
90,IND,India,0.64,407771615.0,7247327.0,3.610552,21.045353
157,RUS,Russia,0.816,132888951.0,2131571.0,3.380088,18.798668
150,PER,Peru,0.59949,74882695.0,3020038.0,3.430126,17.311165
125,MEX,Mexico,0.774,74347548.0,7295850.0,3.019289,18.674802
178,ESP,Spain,0.887969,73717676.0,5510624.0,3.393922,17.660427
175,ZAF,South Africa,0.608653,63027659.0,1357682.0,3.364333,17.898266
42,COL,Colombia,0.581847,60543682.0,1936134.0,3.357923,17.745037
199,GBR,United Kingdom,0.922,59475032.0,7249573.0,3.353883,18.03334


In [84]:
# GDP figures carried over from the "feature engineering liquiditas" step
# (per the original author's note).
# They are keyed by country code instead of being listed in row order, so each
# value stays attached to its country even if the ordering of `data` changes.
# Countries without an entry get NaN.
gdp_before_covid = {
    'USA': 65279.53, 'BRA': 8897.49, 'IND': 2100.75,
    'RUS': 11497.65, 'PER': 7027.61, 'MEX': 9946.03,
    'ESP': 29564.74, 'ZAF': 6001.40, 'COL': 6424.98,
    'GBR': 42354.41,
}
gdp_during_covid = {
    'USA': 63543.58, 'BRA': 6796.84, 'IND': 1900.71,
    'RUS': 10126.72, 'PER': 6126.87, 'MEX': 8346.70,
    'ESP': 27057.16, 'ZAF': 5090.72, 'COL': 5332.77,
    'GBR': 40284.64,
}
# assign() returns a new frame, so re-running this cell gives the same result.
data = data.assign(**{
    "GDP Before Covid": data["Country Code"].map(gdp_before_covid),
    "GDP During Covid": data["Country Code"].map(gdp_during_covid),
})
data

Unnamed: 0,Country Code,Country,HDI,Total Cases,Total Deaths,Stringency Index,Population,GDP Before Covid,GDP During Covid
200,USA,United States,0.924,746014098.0,26477574.0,3.350949,19.617637,65279.53,63543.58
27,BRA,Brazil,0.759,425704517.0,14340567.0,3.136028,19.174732,8897.49,6796.84
90,IND,India,0.64,407771615.0,7247327.0,3.610552,21.045353,2100.75,1900.71
157,RUS,Russia,0.816,132888951.0,2131571.0,3.380088,18.798668,11497.65,10126.72
150,PER,Peru,0.59949,74882695.0,3020038.0,3.430126,17.311165,7027.61,6126.87
125,MEX,Mexico,0.774,74347548.0,7295850.0,3.019289,18.674802,9946.03,8346.7
178,ESP,Spain,0.887969,73717676.0,5510624.0,3.393922,17.660427,29564.74,27057.16
175,ZAF,South Africa,0.608653,63027659.0,1357682.0,3.364333,17.898266,6001.4,5090.72
42,COL,Colombia,0.581847,60543682.0,1936134.0,3.357923,17.745037,6424.98,5332.77
199,GBR,United Kingdom,0.922,59475032.0,7249573.0,3.353883,18.03334,42354.41,40284.64


In [23]:
import plotly.express as px
import plotly.graph_objects as go

# Bar chart of "Total Cases" for each country in `data`
figure = px.bar(
    data,
    x='Country',
    y='Total Cases',
    title="Countries with Highest Covid Cases",
)
figure.show()

In [24]:
# Bar chart of "Total Deaths" for each country in `data`
figure = px.bar(
    data,
    x='Country',
    y='Total Deaths',
    title='Countries with Highest Covid Deaths',
)
figure.show()
# Export the Plotly figure itself. save_fig() writes matplotlib's current
# figure, which is empty here because the chart is drawn by Plotly, so it
# produced a blank image.
# NOTE(review): write_image needs Plotly's static-image backend (e.g. the
# `kaleido` package) to be installed in the kernel environment.
print('saving images figure', 'Countries with Highest Covid Deaths')
figure.write_image(os.path.join(IMAGES_PATH, 'Countries with Highest Covid Deaths.png'))

saving images figure Countries with Highest Covid Deaths


<Figure size 432x288 with 0 Axes>