In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
%matplotlib inline


In [None]:
# CSV endpoint of the COVID-19 case distribution on opendata.ecdc.europa.eu.
url = "https://opendata.ecdc.europa.eu/covid19/casedistribution/csv"
# NOTE(review): pandas reads the literal string "NA" as a missing value by
# default; if any code column legitimately contains "NA", it will show up as
# null below - TODO confirm against the raw file.
df = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Peek at the first five rows (head() defaults to n=5).
df.head()

In [None]:
# Parse the day/month/year strings in dateRep into pandas datetimes.
df["dateRep"] = pd.to_datetime(arg=df["dateRep"], format="%d/%m/%Y")

In [None]:
# Rows per country/territory, most frequent first (sorting is the default).
df['countriesAndTerritories'].value_counts()

In [None]:
# Missing values per column.
df.isna().sum()

In [None]:
# How many distinct countries/territories have rows with a missing geoId?
df.loc[df['geoId'].isna(), 'countriesAndTerritories'].nunique()

In [None]:
# Join the three identifier columns into a single underscore-separated key so
# the uniqueness of the combination can be checked. Missing values are
# stringified (as 'nan') rather than dropped.
cols = ['countryterritoryCode', 'countriesAndTerritories', 'geoId']
df['location'] = [
    '_'.join(map(str, parts))
    for parts in df[cols].itertuples(index=False, name=None)
]

In [None]:
# Count the distinct combined identifiers.
df['location'].nunique()

We notice that the fields `geoId`, `countryterritoryCode`, and `countriesAndTerritories` have a 1:1 relationship, so two of them are redundant. We keep `countriesAndTerritories`, because it has no null values, and drop the other two features.


In [None]:
# Drop the two identifier columns that duplicate countriesAndTerritories.
# Rebinding df (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run once the columns are already gone.
df = df.drop(columns=['geoId', 'countryterritoryCode'], errors='ignore')

In [None]:
# Number of rows per continent.
df['continentExp'].value_counts()

In [None]:
# Summary statistics of the three numeric columns of interest.
df.loc[:, ['cases', 'deaths', 'popData2019']].describe()

In [None]:
 # Per-continent max/min/mean/std of cases, deaths and population,
 # ordered by the largest single deaths value, highest first.
 (df.groupby('continentExp')[['cases', 'deaths', 'popData2019']]
    .agg(['max', 'min', 'mean', 'std'])
    .sort_values([('deaths', 'max')], ascending=False))

**Add new features for the death rate per million and the cases reported per million. Before doing this, ensure there are no null values for population.**




In [None]:
# Countries/territories that have rows without a population figure.
df.loc[df['popData2019'].isna(), 'countriesAndTerritories'].unique()

In [None]:
# Rows lacking a population figure, then the row(s) among them with the
# largest `cases` value.
IntJapan = df.loc[df['popData2019'].isna()]
IntJapan.loc[IntJapan['cases'].eq(IntJapan['cases'].max())]

In [None]:
# Keep only the rows that have a population figure; rows without one cannot
# be turned into per-population rates and can be revisited separately later.
# .copy() makes the result an independent frame, so adding columns to it
# afterwards does not raise SettingWithCopyWarning.
df = df[df['popData2019'].notna()].copy()

In [None]:
# Case and death rates per million inhabitants. The plots further down are
# titled "Per Million", so the per-capita ratio is scaled by 1e6 here.
# assign() returns a new frame, which also avoids writing into a filtered
# slice of the original data.
df = df.assign(
    casesPerX=df['cases'] / df['popData2019'] * 1e6,
    deathsPerX=df['deaths'] / df['popData2019'] * 1e6,
)

**EDA - Visualization**



In [None]:
# Colour-blind friendly style sheet. Matplotlib 3.6 renamed the bundled
# seaborn styles to 'seaborn-v0_8-*' and later releases removed the old
# names, so prefer the new name and fall back to the old one on older
# installations.
style_name = 'seaborn-v0_8-colorblind'
if style_name not in plt.style.available:
    style_name = 'seaborn-colorblind'
plt.style.use(style_name)

In [None]:
# Deaths against cases, with one fitted regression line per continent.
sns.lmplot(data=df, x='cases', y='deaths', hue='continentExp', fit_reg=True);


In [None]:
# Pairwise correlations between the numeric columns. `month` still holds
# month numbers at this point, so it takes part in the correlation.
cols = ['month', 'cases', 'deaths', 'casesPerX', 'deathsPerX', 'popData2019']
df[cols].corr().loc[:, cols]

In [None]:
# Same correlation matrix as a heatmap, with the coefficients annotated.
sns.heatmap(data=df[cols].corr()[cols], annot=True);

In [None]:
# Turn month numbers into abbreviated names for plotting. Per the original
# note there is one record for Dec 31st, so December is plotted first.
# Values that are not month numbers (for example names produced by an
# earlier run of this cell) are passed through unchanged, so re-running the
# cell does not wipe the column to NaN.
month_names = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
               7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
df['month'] = df['month'].map(lambda m: month_names.get(m, m))


In [None]:
# Continents to draw, in legend order. seaborn only plots the hue levels
# listed here, so a misspelt name silently removes that continent.
hue_order = ['Africa', 'America', 'Asia', 'Europe', 'Oceania']
# Months on the x axis: reverse of the order in which they first appear in df.
order = list(reversed(df['month'].unique()))
# Tall canvas that the four stacked bar charts below are drawn on.
plt.figure(figsize=(15,20))
def plt_barchart(pos,title,yaxis):
    """Draw one grouped bar chart in slot `pos` of the current 4x1 figure.

    pos   -- subplot index passed to plt.subplot(4, 1, pos)
    title -- text shown above the panel
    yaxis -- name of the df column plotted on the y axis

    Bars are grouped by month on the x axis and coloured by continent. The
    module-level `df`, `order` and `hue_order` supply the data and orderings.
    """
    panel = plt.subplot(4, 1, pos)
    sns.barplot(data=df, x='month', y=yaxis, hue='continentExp',
                order=order, hue_order=hue_order, palette='muted', ax=panel)
    panel.set_title(title)
    # The panel title already says what is plotted, so drop the axis labels.
    panel.set(xlabel=None, ylabel=None)
    panel.legend(title='Continent', loc='upper left')

# One panel per metric: (subplot slot, panel title, df column).
for panel_args in (
    (1, 'Cases by Month', 'cases'),
    (2, 'Case Rate Per Million', 'casesPerX'),
    (3, 'Deaths by Month', 'deaths'),
    (4, 'Death Rate Per Million', 'deathsPerX'),
):
    plt_barchart(*panel_args)



In [None]:
# Follow-up query on the charts above: how many June rows does each continent
# contribute? (This counts rows, it does not sum the `cases` column.)

df.loc[df['month'] == 'Jun', 'continentExp'].value_counts()


In [None]:
# Rows per country/territory within the American continent.
df.loc[df['continentExp'] == 'America', 'countriesAndTerritories'].value_counts()

In [None]:
import matplotlib.dates as mdates

# Rows for the American continent (not used further in this cell).
df_america = df.loc[df['continentExp'] == 'America']
# Months to chart, one subplot each.
months_list = 'Jan Feb Mar Apr May Jun'.split()

# Shared canvas for the 3x2 grid of panels that plt_relplot fills in.
fig = plt.figure(figsize=(20, 18))
# Running subplot position; incremented before each plt_relplot call.
lv_index = 0
def plt_relplot(import_df,title,pos,col): 
    """Draw one line chart of `col` over time in slot `pos` of the 3x2 grid.

    import_df -- frame to plot; must contain 'dateRep',
                 'countriesAndTerritories' and the column named by `col`
    title     -- text shown above the panel
    pos       -- subplot index passed to fig.add_subplot(3, 2, pos)
    col       -- name of the column plotted on the y axis

    One line is drawn per country/territory; values sharing a date are
    summed. Relies on the module-level `fig` and `mdates`.
    """
    ax1 = fig.add_subplot(3, 2, pos)
    # x/y are passed by keyword: newer seaborn releases no longer accept them
    # positionally.
    sns.lineplot(x='dateRep', y=col, data=import_df, estimator=sum, ci=None,
                 hue='countriesAndTerritories', legend=False, ax=ax1)
    ax1.set_title(title, fontsize='small')
    ax1.set_xlabel('')
    ax1.set_ylabel('')
    # One major tick per week, labelled like "Mar 17".
    ax1.xaxis.set_major_locator(mdates.WeekdayLocator())
    ax1.xaxis.set_major_formatter(mdates.DateFormatter('%b %d'))
    # Style the tick labels without replacing their text, so both axes keep
    # showing the real dates and values chosen by the locators.
    ax1.tick_params(axis='both', labelsize=12)
    ax1.tick_params(axis='x', labelrotation=45)
    for tick_label in ax1.get_xticklabels():
        tick_label.set(horizontalalignment='right', fontweight='light')
    for tick_label in ax1.get_yticklabels():
        tick_label.set(horizontalalignment='right', fontweight='light')
    

# One panel per month, numbered from 1 in the order of months_list.
# NOTE(review): the panels are built from the full `df`; the `df_america`
# subset defined earlier in this cell is not used - confirm which was intended.
for lv_index, month in enumerate(months_list, start=1):
    df_list = df[df['month'] == month]
    title = 'Case Count For : ' + month
    plt_relplot(df_list, title, lv_index, 'cases')


**Feature Scaling** 
<p>Apply feature scaling to the population data and to the case and death rates.</p>

In [None]:
# Min-max normalisation: each source column is rescaled on its own and stored
# next to the original under a 'MinMax' suffix.
minmax_scaling = MinMaxScaler()
for raw_col in ('popData2019', 'casesPerX', 'deathsPerX'):
    # The scaler is re-fitted for every column, so each gets its own min/max.
    df[raw_col + 'MinMax'] = minmax_scaling.fit_transform(df[[raw_col]])

In [None]:
# Standardisation (z-score): each source column is centred and scaled on its
# own and stored under the paired target name.
standard_scaler = StandardScaler()

# NOTE(review): 'popData201Standard' looks like a typo for
# 'popData2019Standard'; the name is kept as-is in case later cells use it.
standard_targets = (
    ('popData2019', 'popData201Standard'),
    ('casesPerX', 'casesPerXStandard'),
    ('deathsPerX', 'deathsPerXStandard'),
)
for raw_col, scaled_col in standard_targets:
    df[scaled_col] = standard_scaler.fit_transform(df[[raw_col]])

In [None]:
# Death rate against case rate per continent, once for each scaling:
# the z-scored columns first, then the min-max columns.
for scaled_suffix in ('Standard', 'MinMax'):
    sns.lmplot(data=df,
               x='casesPerX' + scaled_suffix,
               y='deathsPerX' + scaled_suffix,
               hue='continentExp',
               palette='viridis');

In [None]:
# Find the most affected country/territory in America: sort the American rows
# by death rate (ties broken by case rate) and show the five highest rows.
df_america = df.loc[df['continentExp'] == 'America']

df_sort = df_america.sort_values(by=['deathsPerX', 'casesPerX'], ascending=True)
df_sort.tail()