# Extract Part

In [1]:
#Import all dependencies
import pandas as pd
from sqlalchemy import create_engine

In [2]:
#Locate all three CSV files/data sets
confirmed_csv_file = "./Raw Data/time_series_covid19_confirmed_global.csv"
recovered_csv_file = "./Raw Data/time_series_covid19_recovered_global.csv"
deaths_csv_file = "./Raw Data/time_series_covid19_deaths_global.csv"

In [3]:
#Read data set/csv file on confirmed cases and display first 5 rows
confirmed_data_df = pd.read_csv(confirmed_csv_file)
confirmed_data_df.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,3/18/20,3/19/20,3/20/20,3/21/20,3/22/20,3/23/20,3/24/20,3/25/20,3/26/20,3/27/20
0,,Afghanistan,33.0,65.0,0,0,0,0,0,0,...,22,22,24,24,40,40,74,84,94,110
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,59,64,70,76,89,104,123,146,174,186
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,74,87,90,139,201,230,264,302,367,409
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,39,53,75,88,113,133,164,188,224,267
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,0,0,1,2,2,3,3,3,4,4


In [4]:
#Read data set/csv file on recovered cases and display first 5 rows
recovered_data_df = pd.read_csv(recovered_csv_file)
recovered_data_df.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,3/18/20,3/19/20,3/20/20,3/21/20,3/22/20,3/23/20,3/24/20,3/25/20,3/26/20,3/27/20
0,,Afghanistan,33.0,65.0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,2,2,2
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,0,0,0,2,2,2,10,17,17,31
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,12,32,32,32,65,65,24,65,29,29
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
#Read data set/csv file on deaths and display first 5 rows
deaths_data_df = pd.read_csv(deaths_csv_file)
deaths_data_df.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,3/18/20,3/19/20,3/20/20,3/21/20,3/22/20,3/23/20,3/24/20,3/25/20,3/26/20,3/27/20
0,,Afghanistan,33.0,65.0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,2,4,4
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,2,2,2,2,2,4,5,5,6,8
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,7,9,11,15,17,17,19,21,25,26
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,0,0,0,0,1,1,1,1,3,3
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Transform Part

In [6]:
#Import module 
import numpy as np

In [7]:
#Confirm the data are the right data types
confirmed_data_df.dtypes
recovered_data_df.dtypes
deaths_data_df.dtypes

Province/State     object
Country/Region     object
Lat               float64
Long              float64
1/22/20             int64
                   ...   
3/23/20             int64
3/24/20             int64
3/25/20             int64
3/26/20             int64
3/27/20             int64
Length: 70, dtype: object

In [8]:
#Create a function to clean the datasets

def clean(df):
    #Drop the "Province/State","Lat" & "Long" columns
    #Select columns of index and "Country/Region", and only dates in March
    #Group by "Country/Region" and caculate the sum value
    #Reset the index of the dataframe
    clean_df = df.drop(columns=['Province/State','Lat','Long']).\
                    iloc[:,np.r_[0:1,40:67]].\
                    groupby('Country/Region').sum().\
                    reset_index()
    return clean_df

In [9]:
#Clean the "confirmed_data_df" dataset
confirmed_clean_df = clean(confirmed_data_df)
confirmed_clean_df

Unnamed: 0,Country/Region,3/1/20,3/2/20,3/3/20,3/4/20,3/5/20,3/6/20,3/7/20,3/8/20,3/9/20,...,3/18/20,3/19/20,3/20/20,3/21/20,3/22/20,3/23/20,3/24/20,3/25/20,3/26/20,3/27/20
0,Afghanistan,1,1,1,1,1,1,1,4,4,...,22,22,24,24,40,40,74,84,94,110
1,Albania,0,0,0,0,0,0,0,0,2,...,59,64,70,76,89,104,123,146,174,186
2,Algeria,1,3,5,12,12,17,17,19,20,...,74,87,90,139,201,230,264,302,367,409
3,Andorra,0,1,1,1,1,1,1,1,1,...,39,53,75,88,113,133,164,188,224,267
4,Angola,0,0,0,0,0,0,0,0,0,...,0,0,1,2,2,3,3,3,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171,Venezuela,0,0,0,0,0,0,0,0,0,...,36,42,42,70,70,77,84,91,107,107
172,Vietnam,16,16,16,16,16,16,18,30,30,...,75,85,91,94,113,123,134,141,153,163
173,West Bank and Gaza,0,0,0,0,4,7,16,16,19,...,41,44,47,48,52,59,59,59,84,91
174,Zambia,0,0,0,0,0,0,0,0,0,...,2,2,2,2,3,3,3,12,16,22


In [10]:
#Clean the "recovered_data_df" dataset
recovered_clean_df = clean(recovered_data_df)
recovered_clean_df

Unnamed: 0,Country/Region,3/1/20,3/2/20,3/3/20,3/4/20,3/5/20,3/6/20,3/7/20,3/8/20,3/9/20,...,3/18/20,3/19/20,3/20/20,3/21/20,3/22/20,3/23/20,3/24/20,3/25/20,3/26/20,3/27/20
0,Afghanistan,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,2,2,2
1,Albania,0,0,0,0,0,0,0,0,0,...,0,0,0,2,2,2,10,17,17,31
2,Algeria,0,0,0,0,0,0,0,0,0,...,12,32,32,32,65,65,24,65,29,29
3,Andorra,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
4,Angola,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171,Venezuela,0,0,0,0,0,0,0,0,0,...,0,0,0,0,15,15,15,15,15,31
172,Vietnam,16,16,16,16,16,16,16,16,16,...,16,16,16,17,17,17,17,17,20,20
173,West Bank and Gaza,0,0,0,0,0,0,0,0,0,...,0,0,17,17,17,17,17,17,17,17
174,Zambia,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
#Clean the "deaths_data_df" dataset
deaths_clean_df = clean(deaths_data_df)
deaths_clean_df

Unnamed: 0,Country/Region,3/1/20,3/2/20,3/3/20,3/4/20,3/5/20,3/6/20,3/7/20,3/8/20,3/9/20,...,3/18/20,3/19/20,3/20/20,3/21/20,3/22/20,3/23/20,3/24/20,3/25/20,3/26/20,3/27/20
0,Afghanistan,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,2,4,4
1,Albania,0,0,0,0,0,0,0,0,0,...,2,2,2,2,2,4,5,5,6,8
2,Algeria,0,0,0,0,0,0,0,0,0,...,7,9,11,15,17,17,19,21,25,26
3,Andorra,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,1,3,3
4,Angola,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171,Venezuela,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
172,Vietnam,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
173,West Bank and Gaza,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
174,Zambia,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
#Confirm the content of "Country/Region" column are the same in those dataframes
confirmed_clean_df["Country/Region"].equals(deaths_clean_df["Country/Region"])

True

In [13]:
#Same as above
recovered_clean_df["Country/Region"].equals(deaths_clean_df["Country/Region"])

True

In [14]:
recovered_clean_df

Unnamed: 0,Country/Region,3/1/20,3/2/20,3/3/20,3/4/20,3/5/20,3/6/20,3/7/20,3/8/20,3/9/20,...,3/18/20,3/19/20,3/20/20,3/21/20,3/22/20,3/23/20,3/24/20,3/25/20,3/26/20,3/27/20
0,Afghanistan,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,2,2,2
1,Albania,0,0,0,0,0,0,0,0,0,...,0,0,0,2,2,2,10,17,17,31
2,Algeria,0,0,0,0,0,0,0,0,0,...,12,32,32,32,65,65,24,65,29,29
3,Andorra,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
4,Angola,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171,Venezuela,0,0,0,0,0,0,0,0,0,...,0,0,0,0,15,15,15,15,15,31
172,Vietnam,16,16,16,16,16,16,16,16,16,...,16,16,16,17,17,17,17,17,20,20
173,West Bank and Gaza,0,0,0,0,0,0,0,0,0,...,0,0,17,17,17,17,17,17,17,17
174,Zambia,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
