In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'
plt.style.use('ggplot')

import seaborn as sns
from datetime import datetime

In [None]:
df = pd.read_csv('/kaggle/input/novel-corona-virus-2019-dataset/covid_19_data.csv')

In [None]:
df.head()

In [None]:
df.info()

In [None]:
countries = df.groupby(['ObservationDate','Country/Region'])[['Confirmed','Deaths','Recovered']].sum()

In [None]:
countries.reset_index(inplace=True)

In [None]:
countries['Country/Region'].value_counts()

In [None]:
provinces = df.groupby(['ObservationDate','Country/Region', 'Province/State'])[['Confirmed','Deaths','Recovered']].sum()

In [None]:
provinces.reset_index(inplace=True)

In [None]:
countries.head()

In [None]:
def feature_eng(dataset):
    """Add active-case and proportion columns to ``dataset`` and return it.

    The frame is modified in place (the returned object is the same frame) and
    must contain ``Confirmed``, ``Deaths`` and ``Recovered`` columns.

    Columns written:
        Active_Cases       -- Confirmed - Recovered - Deaths
        Death_Proportion   -- Deaths as a percentage of Confirmed
        Recover_Proportion -- Recovered as a percentage of Confirmed

    A proportion is 0 whenever either of its two counts is 0, which also
    avoids dividing by zero.

    The index is finally reset in place, so the previous index is kept as a new
    leading column (named ``index`` when the index had no name).
    """
    confirmed = dataset['Confirmed']
    deaths = dataset['Deaths']
    recovered = dataset['Recovered']

    def _percent_of_confirmed(part):
        # Column arithmetic instead of a Python-level loop over rows; rows where
        # either count is zero are reported as 0 rather than inf/NaN.
        usable = (confirmed != 0) & (part != 0)
        return ((part / confirmed) * 100).where(usable, 0)

    dataset['Active_Cases'] = confirmed - recovered - deaths
    dataset['Death_Proportion'] = _percent_of_confirmed(deaths)
    dataset['Recover_Proportion'] = _percent_of_confirmed(recovered)

    # Kept as-is: a later cell drops the 'index' column this call creates.
    dataset.reset_index(inplace=True)

    return dataset



In [None]:
countries = feature_eng(countries)

In [None]:
countries.head()

In [None]:
countries.tail()

### Adjusting the ObservationDate field so that every value has the same pattern

In [None]:
# Some dates follow a different pattern, so first record the length of each date string; the next step uses it to fix them
countries['dt_len'] = countries.ObservationDate.str.len()

In [None]:
# Dates shorter than 10 characters get '20' appended; full-length dates are kept unchanged
needs_suffix = countries.dt_len < 10
countries['Date'] = (countries.ObservationDate + '20').where(needs_suffix, countries.ObservationDate)

### Creating the Top 10 Rankings 

In [None]:
ranking = countries.groupby('Country/Region')[['Deaths','Confirmed','Recovered','Active_Cases']].max()

In [None]:
def _plot_ranking_grid(dataset, ascending, keep_rows, titles, colors):
    """Draw one 2x2 grid of horizontal bar charts, one panel per measure.

    dataset   -- frame with Confirmed, Deaths, Recovered and Active_Cases columns
    ascending -- sort direction applied before the first 10 rows are taken
    keep_rows -- None, or a callable (frame, measure) -> boolean row mask
    titles    -- four panel titles, in measure order
    colors    -- four bar colours, in measure order; None uses the default colour
    """
    measures = ['Confirmed', 'Deaths', 'Recovered', 'Active_Cases']

    fig, axes = plt.subplots(2, 2, figsize=(20, 8))

    # axes.ravel() walks the grid row by row: [0][0], [0][1], [1][0], [1][1].
    for measure, panel, title, color in zip(measures, axes.ravel(), titles, colors):
        subset = dataset if keep_rows is None else dataset[keep_rows(dataset, measure)]
        style = {} if color is None else {'color': color}
        subset.sort_values(by=measure, ascending=ascending)[measure][:10].plot(kind='barh', ax=panel, **style)
        panel.title.set_text(title)

    plt.show()


def rankings(dataset, nivel='top'):
    '''
    Plot 10-row rankings of Confirmed, Deaths, Recovered and Active_Cases.

    dataset -> frame indexed by country name with those four columns
    nivel (str) -> 'top' or 'bottom'

    'top' draws two grids of the largest values: one with every row and one
    without the row whose index is 'Mainland China'.

    Any other value draws two grids of the smallest values: one restricted to
    rows at or above a per-measure minimum (50 / 20 / 50 / 30) and one
    restricted to non-zero rows.
    '''
    case_titles = ['Confirmed Cases', 'Deaths Cases', 'Recovered Cases', 'Active Cases']

    if nivel=='top':
        colors = ['blue', 'red', 'green', 'gold']

        print('Top 10 Ranking With Mainland China')
        _plot_ranking_grid(dataset, False, None, case_titles, colors)

        print('Top 10 Ranking Without Mainland China')
        _plot_ranking_grid(dataset, False,
                           lambda frame, measure: frame.index != 'Mainland China',
                           case_titles, colors)

    else:
        # The Confirmed panel uses the default colour in the bottom rankings.
        colors = [None, 'red', 'green', 'gold']
        minimums = {'Confirmed': 50, 'Deaths': 20, 'Recovered': 50, 'Active_Cases': 30}
        minimum_titles = ['Confirmed More than 50 Cases', 'More than 20 Deaths Cases',
                          'More Than 50 Recovered Cases', 'More Than 30 Active Cases']

        print('Rankings')
        _plot_ranking_grid(dataset, True,
                           lambda frame, measure: frame[measure] >= minimums[measure],
                           minimum_titles, colors)

        print('Bottom 10 Ranking Different of 0')
        _plot_ranking_grid(dataset, True,
                           lambda frame, measure: frame[measure] != 0,
                           case_titles, colors)

    

In [None]:
rankings(ranking)

In [None]:
rankings(ranking, nivel='bottom')

### Ranking By proportion (Proportion calculated by total cases in the country)

In [None]:
ranking.reset_index(inplace=True)

In [None]:
proportion_rank = feature_eng(ranking)
proportion_rank.drop(columns='index', inplace=True)

In [None]:
proportion_rank.sort_values(by=['Death_Proportion','Recover_Proportion'],ascending=False)[['Country/Region','Death_Proportion','Recover_Proportion']][:50]\
.plot(x='Country/Region',kind='bar', figsize=(20,5))
plt.title('Death x Recover Proportion look in each country')
plt.show()

### View By Country

In [None]:
def country_view(dataset, country):
    """Plot the evolution of one country in a 2x2 grid.

    dataset -- frame with 'Country/Region', 'Date', 'Confirmed', 'Active_Cases',
               'Recover_Proportion' and 'Death_Proportion' columns
    country -- value of 'Country/Region' to select

    Prints the smallest and largest 'Date' value of the selection, then draws
    one line chart per measure against 'Date'.

    Raises ValueError when no row matches ``country``.
    """
    ds = dataset[dataset['Country/Region']== country]

    # Without rows the date range below cannot be built; fail with a clear message.
    if ds.empty:
        raise ValueError("No rows found for country " + repr(country))

    print(country + " Behavior from: " + np.min(ds.Date) + ' to: '+ np.max(ds.Date))

    # (column, line colour or None for the default, panel title)
    panels = [
        ('Confirmed', None, 'Confirmed Cases'),
        ('Active_Cases', 'gold', 'Active Cases'),
        ('Recover_Proportion', 'green', '\nRecovered Proportion Cases'),
        ('Death_Proportion', 'red', 'Death Proportion Cases'),
    ]

    fig, axes = plt.subplots(2, 2, figsize=(20, 8))

    # axes.ravel() walks the grid row by row: [0][0], [0][1], [1][0], [1][1].
    for (measure, color, title), panel in zip(panels, axes.ravel()):
        style = {} if color is None else {'color': color}
        ds[['Date', measure]].plot(x='Date', ax=panel, **style)
        panel.title.set_text(title)

    plt.show()
    
    

In [None]:
country_view(countries, 'Mainland China')

In [None]:
country_view(countries, 'Italy')

In [None]:
country_view(countries, 'US')

In [None]:
country_view(countries, 'Canada')

In [None]:
country_view(countries, 'Brazil')

In [None]:
country_view(countries, 'Australia')

In [None]:
country_view(countries, 'Portugal')

In [None]:
country_view(countries, 'France')

In [None]:
country_view(countries, 'Spain')

### Now let's look at how each country compares with the whole world

In [None]:
# Calculating the total of deaths, confirmed, recovered, active cases in the world
# I tranform it in list to can interate over and calculate the proportion for each country 

total_deaths = [ranking['Deaths'].sum()] * len(ranking)
total_cases = [ranking['Confirmed'].sum()]  * len(ranking)
total_recovers = [ranking['Recovered'].sum() ]  * len(ranking)
total_active_cases = [ranking['Active_Cases'].sum()]  * len(ranking)

In [None]:
print('Corona Death rate in the hole World: {0:.3f} %'.format(((total_deaths[0] / total_cases[0]) * 100)) )

In [None]:
print('Corona Recovered rate in the hole World: {0:.3f} %'.format(((total_recovers[0] / total_cases[0]) * 100)) )

In [None]:
print('Corona Active cases rate in the hole World: {0:.3f} %'.format(((total_active_cases[0] / total_cases[0]) * 100)) )

In [None]:
# Creating the data frame using the country/region column
world_proportion = pd.DataFrame(ranking['Country/Region'], columns=['Country/Region']) 

In [None]:
# Calculating the proportion for each country, comparing his total values with the hole world   

world_proportion['Active_Cases'] = [ (a/t)*100 for a,t in zip(ranking.Active_Cases, total_active_cases )]

world_proportion['Death_Proportion'] = [ (d/t)*100 if d != 0 and t !=0 else 0 for d,t in zip(ranking.Deaths, total_deaths)]

world_proportion['Recover_Proportion'] = [ (r/t)*100 if t != 0 and r !=0 else 0 for r,t in zip(ranking.Recovered, total_recovers)]

world_proportion['Confirmed_Proportion'] = [ (c/t)*100 if c != 0 and t !=0 else 0 for c,t in zip(ranking.Confirmed, total_cases)]


In [None]:
world_proportion.sort_values('Death_Proportion', ascending=False)[:30].plot(x='Country/Region',kind='bar', figsize=(20,5))
plt.title('Ranking of Death Proportion (With China)')
plt.show()

In [None]:
world_proportion[world_proportion['Country/Region'] != 'Mainland China'].sort_values('Death_Proportion', ascending=False)[:30].plot(x='Country/Region',kind='bar', figsize=(20,5))
plt.title('Ranking of Death Proportion (Without China)')
plt.show()

### Making Projections Using Forecast

In [None]:
# The package was renamed from ``fbprophet`` to ``prophet``; try the current name
# first and fall back to the old one so the notebook runs with either installed.
try:
    from prophet import Prophet
    from prophet.plot import add_changepoints_to_plot
except ImportError:
    from fbprophet import Prophet
    from fbprophet.plot import add_changepoints_to_plot

In [None]:
world = countries.groupby('Date')[['Confirmed','Deaths','Recovered','Active_Cases']].sum()

In [None]:
world.reset_index(inplace=True)

In [None]:
china_ds = countries[countries['Country/Region'] == 'Mainland China']
italy_ds = countries[countries['Country/Region'] == 'Italy']
france_ds = countries[countries['Country/Region'] == 'France']
brazil_ds = countries[countries['Country/Region'] == 'Brazil']
canada_ds = countries[countries['Country/Region'] == 'Canada']
spain_ds = countries[countries['Country/Region'] == 'Spain']
portugal_ds = countries[countries['Country/Region'] == 'Portugal']
us_ds = countries[countries['Country/Region'] == 'US']

In [None]:
brazil_ds.head()

In [None]:
#np.diff()

In [None]:
# Copy the selection so that adding 'T' does not write into a slice of ``countries``.
italy = italy_ds[['Date','Confirmed','Deaths']].copy()
# 'T' numbers the rows 0, 1, 2, ... and is the key later used to line the countries up.
italy['T'] = np.arange(len(italy))

In [None]:
# Copy the selection so that adding 'T' does not write into a slice of ``countries``.
brazil = brazil_ds[['Date','Confirmed','Deaths']].copy()
# 'T' numbers the rows 0, 1, 2, ... and is the key later used to line the countries up.
brazil['T'] = np.arange(len(brazil))

In [None]:
# Copy the selection so that adding 'T' does not write into a slice of ``countries``.
us = us_ds[['Date','Confirmed','Deaths']].copy()
# 'T' numbers the rows 0, 1, 2, ... and is the key later used to line the countries up.
us['T'] = np.arange(len(us))

In [None]:
compare_itl_brz = italy.merge(brazil, left_on='T' , right_on='T', how='left', suffixes=('_Italy', '_Brazil'))

In [None]:
compare_itl_brz.head()

In [None]:
compare_itl_brz = compare_itl_brz.merge(us,left_on='T' , right_on='T', how='left')


In [None]:
compare_itl_brz.rename(columns={'Confirmed':'Confirmed_US', 'Date':'Date_US','Deaths':'Deaths_US'}, inplace=True)

In [None]:
# Copy the selection so that adding 'T' does not write into a slice of ``countries``.
china = china_ds[['Date','Confirmed','Deaths']].copy()
# 'T' numbers the rows 0, 1, 2, ... and is the key later used to line the countries up.
china['T'] = np.arange(len(china))

In [None]:
compare_itl_brz = china.merge(compare_itl_brz,left_on='T' , right_on='T', how='left')


In [None]:
compare_itl_brz.rename(columns={'Confirmed':'Confirmed_China','Date':'Date_China','Deaths':'Deaths_China'}, inplace=True)

In [None]:
compare_itl_brz[:5]

In [None]:
lck_down_china = '01/23/2020'
compare_itl_brz['China_lck_Down'] = [1 if  i == lck_down_china else 0  for i in compare_itl_brz.Date_China]
day_lck_china = compare_itl_brz[compare_itl_brz['China_lck_Down'] == 1]['T']

In [None]:
lck_down_italy = '03/09/2020'
compare_itl_brz['Italy_lck_Down'] = [1 if  i == lck_down_italy else 0  for i in compare_itl_brz.Date_Italy]
day_lck_italy = compare_itl_brz[compare_itl_brz['Italy_lck_Down'] == 1]['T']

In [None]:
lck_down_brz = '03/27/2020'
compare_itl_brz['Brazil_lck_Down'] = [1 if  i == lck_down_brz else 0  for i in compare_itl_brz.Date_Brazil]
day_lck_brz = compare_itl_brz[compare_itl_brz['Brazil_lck_Down'] == 1]['T']

In [None]:
day_lck_brz

In [None]:
plt.style.use('ggplot')

# One curve per country; Italy comes first because its plot call creates the axes.
ax = None
for country, color in [('Italy', 'g'), ('Brazil', 'red'), ('US', 'blue'), ('China', 'orange')]:
    ax = compare_itl_brz.plot(x='T', y=['Confirmed_' + country], figsize=(20,8), color=color, ax=ax)

# Dashed cross-hairs at each flagged lockdown day. axvline/axhline take a single
# number, so the values are passed one at a time; a country with no flagged row
# simply gets no marker instead of raising.
for country, color in [('China', 'orange'), ('Italy', 'g'), ('Brazil', 'red')]:
    lock_rows = compare_itl_brz[compare_itl_brz[country + '_lck_Down'] == 1]
    for day, level in zip(lock_rows['T'], lock_rows['Confirmed_' + country]):
        ax.axvline(day, ls='--', color=color)
        ax.axhline(level, ls='--', color=color)

plt.title("Comparing the brute values curve of China x Brazil x US x Italy curve")
ax.set_xlabel('Day Count')
ax.set_ylabel('Confirmed cases')
plt.show()

In [None]:
compare_itl_brz.head()

In [None]:
compare_itl_brz['Diff_China'] = compare_itl_brz['Confirmed_China'].diff()
compare_itl_brz['Diff_Italy'] = compare_itl_brz['Confirmed_Italy'].diff()
compare_itl_brz['Diff_Brazil'] = compare_itl_brz['Confirmed_Brazil'].diff()
compare_itl_brz['Diff_US'] = compare_itl_brz['Confirmed_US'].diff()

In [None]:
# diff() leaves the first row empty; seed it with the first cumulative value.
# .loc writes into the frame itself, whereas chained ``frame[col][0] = ...`` may
# write to a temporary copy and leave the frame unchanged.
for country in ('China', 'Italy', 'Brazil', 'US'):
    compare_itl_brz.loc[0, 'Diff_' + country] = compare_itl_brz.loc[0, 'Confirmed_' + country]

In [None]:
compare_itl_brz['Diff_Death_China'] = compare_itl_brz['Deaths_China'].diff()
compare_itl_brz['Diff_Death_Italy'] = compare_itl_brz['Deaths_Italy'].diff()
compare_itl_brz['Diff_Death_Brazil'] = compare_itl_brz['Deaths_Brazil'].diff()
compare_itl_brz['Diff_Death_US'] = compare_itl_brz['Deaths_US'].diff()

In [None]:
# diff() leaves the first row empty; seed it with the first cumulative value.
# .loc writes into the frame itself, whereas chained ``frame[col][0] = ...`` may
# write to a temporary copy and leave the frame unchanged.
for country in ('China', 'Italy', 'Brazil', 'US'):
    compare_itl_brz.loc[0, 'Diff_Death_' + country] = compare_itl_brz.loc[0, 'Deaths_' + country]

In [None]:
# Day-over-day change of log(deaths + 1). The +1 keeps days with zero deaths
# finite (log(0) is -inf, which turns the differences into inf/NaN) and matches
# the log(deaths + 1) value that the following cell stores in the first row.
for country in ('China', 'Italy', 'Brazil', 'US'):
    compare_itl_brz['Log_Diff_Death_' + country] = np.log1p(compare_itl_brz['Deaths_' + country]).diff()

In [None]:
# diff() leaves the first row empty; seed it with log(first death count + 1).
# .loc writes into the frame itself, whereas chained ``frame[col][0] = ...`` may
# write to a temporary copy and leave the frame unchanged.
for country in ('China', 'Italy', 'Brazil', 'US'):
    compare_itl_brz.loc[0, 'Log_Diff_Death_' + country] = np.log(compare_itl_brz.loc[0, 'Deaths_' + country] + 1)

In [None]:
us_ds.tail()

In [None]:
compare_itl_brz[compare_itl_brz.Italy_lck_Down == 1].head()

In [None]:
compare_itl_brz[compare_itl_brz.Italy_lck_Down == 1]['Confirmed_Italy'].values, compare_itl_brz[compare_itl_brz.China_lck_Down == 1]['Confirmed_China'].values

In [None]:
compare_itl_brz[compare_itl_brz.Italy_lck_Down == 1]['Deaths_Italy'].values, compare_itl_brz[compare_itl_brz.China_lck_Down == 1]['Deaths_China'].values

In [None]:
# Independent copy whose four Confirmed_* columns hold natural logarithms.
log_compare = compare_itl_brz.copy()
# Brazil alone is shifted by one before the logarithm; the other three are logged as they are.
log_compare['Confirmed_Brazil'] = np.log(compare_itl_brz['Confirmed_Brazil'] + 1)
for country in ('Italy', 'China', 'US'):
    column = 'Confirmed_' + country
    log_compare[column] = np.log(compare_itl_brz[column])

In [None]:
plt.style.use('ggplot')

# One curve per country; Italy comes first because its plot call creates the axes.
ax = None
for country, color in [('Italy', 'g'), ('Brazil', 'red'), ('US', 'blue'), ('China', 'orange')]:
    ax = log_compare.plot(x='T', y=['Confirmed_' + country], figsize=(20,8), color=color, ax=ax)

# Dashed cross-hairs at each flagged lockdown day. axvline/axhline take a single
# number, so the values are passed one at a time; a country with no flagged row
# simply gets no marker instead of raising.
for country, color in [('China', 'orange'), ('Italy', 'g'), ('Brazil', 'red')]:
    lock_rows = log_compare[log_compare[country + '_lck_Down'] == 1]
    for day, level in zip(lock_rows['T'], lock_rows['Confirmed_' + country]):
        ax.axvline(day, ls='--', color=color)
        ax.axhline(level, ls='--', color=color)

plt.title("Comparing the log values curve of China x Brazil x US x Italy curve")
ax.set_xlabel('Day Count')
ax.set_ylabel('Log of confirmed cases')
plt.show()

In [None]:
plt.style.use('ggplot')

# One curve per country; Italy comes first because its plot call creates the axes.
ax = None
for country, color in [('Italy', 'g'), ('Brazil', 'red'), ('US', 'blue'), ('China', 'orange')]:
    ax = log_compare.plot(x='T', y=['Deaths_' + country], figsize=(20,8), color=color, ax=ax)

# Only the Confirmed_* columns of log_compare were log-transformed; the Deaths_*
# columns still hold raw counts. A logarithmic y-axis makes the figure show what
# its title announces.
ax.set_yscale('log')

# Dashed cross-hairs at each flagged lockdown day. axvline/axhline take a single
# number, so the values are passed one at a time; a country with no flagged row
# simply gets no marker instead of raising.
for country, color in [('China', 'orange'), ('Italy', 'g'), ('Brazil', 'red')]:
    lock_rows = log_compare[log_compare[country + '_lck_Down'] == 1]
    for day, level in zip(lock_rows['T'], lock_rows['Deaths_' + country]):
        ax.axvline(day, ls='--', color=color)
        ax.axhline(level, ls='--', color=color)

plt.title("Comparing the deaths curve (log scale) of China x Brazil x US x Italy")
ax.set_xlabel('Day Count')
ax.set_ylabel('Deaths cases (log scale)')
plt.show()

In [None]:
plt.style.use('ggplot')

# One curve per country; Italy comes first because its plot call creates the axes.
ax = None
for country, color in [('Italy', 'g'), ('Brazil', 'red'), ('US', 'blue'), ('China', 'orange')]:
    ax = compare_itl_brz.plot(x='T', y=['Diff_Death_' + country], figsize=(20,8), color=color, ax=ax)

# Dashed cross-hairs at each flagged lockdown day. axvline/axhline take a single
# number, so the values are passed one at a time; a country with no flagged row
# simply gets no marker instead of raising.
for country, color in [('China', 'orange'), ('Italy', 'g'), ('Brazil', 'red')]:
    lock_rows = compare_itl_brz[compare_itl_brz[country + '_lck_Down'] == 1]
    for day, level in zip(lock_rows['T'], lock_rows['Diff_Death_' + country]):
        ax.axvline(day, ls='--', color=color)
        ax.axhline(level, ls='--', color=color)

plt.title("Comparing the Difference of death values curve of China x Brazil x US x Italy curve")
ax.set_xlabel('Day Count')
ax.set_ylabel('Difference of Death cases')
plt.show()

In [None]:
plt.style.use('ggplot')

# One curve per country; Italy comes first because its plot call creates the axes.
ax = None
for country, color in [('Italy', 'g'), ('Brazil', 'red'), ('US', 'blue'), ('China', 'orange')]:
    ax = compare_itl_brz.plot(x='T', y=['Log_Diff_Death_' + country], figsize=(20,8), color=color, ax=ax)

# Dashed cross-hairs at each flagged lockdown day. axvline/axhline take a single
# number, so the values are passed one at a time; a country with no flagged row
# simply gets no marker instead of raising.
for country, color in [('China', 'orange'), ('Italy', 'g'), ('Brazil', 'red')]:
    lock_rows = compare_itl_brz[compare_itl_brz[country + '_lck_Down'] == 1]
    for day, level in zip(lock_rows['T'], lock_rows['Log_Diff_Death_' + country]):
        ax.axvline(day, ls='--', color=color)
        ax.axhline(level, ls='--', color=color)

# The plotted columns are differences of log deaths, so the labels say so.
plt.title("Comparing the log-difference of death values curve of China x Brazil x US x Italy curve")
ax.set_xlabel('Day Count')
ax.set_ylabel('Log-difference of Death cases')
plt.show()

In [None]:
plt.style.use('ggplot')

# One curve per country; Italy comes first because its plot call creates the axes.
ax = None
for country, color in [('Italy', 'g'), ('Brazil', 'red'), ('US', 'blue'), ('China', 'orange')]:
    ax = compare_itl_brz.plot(x='T', y=['Diff_' + country], figsize=(20,8), color=color, ax=ax)

# Dashed cross-hairs at each flagged lockdown day. axvline/axhline take a single
# number, so the values are passed one at a time; a country with no flagged row
# simply gets no marker instead of raising.
# NOTE(review): this figure marks China and Italy only, while the other
# comparison figures also mark Brazil — confirm whether that is intended.
for country, color in [('China', 'orange'), ('Italy', 'g')]:
    lock_rows = compare_itl_brz[compare_itl_brz[country + '_lck_Down'] == 1]
    for day, level in zip(lock_rows['T'], lock_rows['Diff_' + country]):
        ax.axvline(day, ls='--', color=color)
        ax.axhline(level, ls='--', color=color)

plt.title("Comparing the Difference of Confirmed cases values curve of China x Brazil x US x Italy curve")
ax.set_xlabel('Day Count')
ax.set_ylabel('Difference of confirmed cases')
plt.show()

In [None]:
# Copy of the comparison frame whose four Confirmed_* columns are replaced by
# their natural logarithm. No +1 shift is applied here, so a count of 0 becomes -inf.
# NOTE(review): density_compare is not referenced again in the visible part of
# the notebook — confirm whether this cell is still needed.
density_compare = compare_itl_brz.copy()
density_compare['Confirmed_Brazil'] = [np.log(i) for i in compare_itl_brz['Confirmed_Brazil']]
density_compare['Confirmed_Italy'] = [np.log(i) for i in compare_itl_brz['Confirmed_Italy']]
density_compare['Confirmed_China'] = [np.log(i) for i in compare_itl_brz['Confirmed_China']]
density_compare['Confirmed_US'] = [np.log(i) for i in compare_itl_brz['Confirmed_US']]

In [None]:
len(brazil_ds), len(italy_ds)

##### Create the columns that Prophet expects (`ds` and `y`)
    - First I'll try applying the number of deaths to get the projection for the next days

In [None]:
def prophet_predictions(dataset,country_name, measure = 'Deaths', periods=60 ):
    """Fit a Prophet model on one measure and plot its forecast.

    dataset      -- frame with a 'Date' column and a ``measure`` column
    country_name -- label used in the printed headings
    measure      -- name of the column to forecast (default 'Deaths')
    periods      -- number of future periods passed to make_future_dataframe

    Prints the smallest and largest date, then draws the forecast (with
    changepoints) and the component plots. Returns None.

    ``dataset`` is left untouched: the 'ds'/'y' training frame Prophet needs is
    built separately, so passing a slice such as ``frame[1:]`` is safe.
    Warnings are silenced only while this function runs.
    """
    import warnings

    # Prophet's input convention: 'ds' holds the dates and 'y' the values.
    history = pd.DataFrame({'ds': list(dataset['Date']), 'y': list(dataset[measure])})

    print("Period from: " + np.min(history['ds']) + ' to: ' + np.max(history['ds']) )

    # Scoped suppression, so warnings raised by later cells stay visible.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        # Instantiating the model and training
        m = Prophet()
        m.fit(history)

        # Creating the range of next dates to predict
        future = m.make_future_dataframe(periods=periods)

        # Making the prediction
        forecast = m.predict(future)

        # Plotting the predict results
        print(country_name + " forecasting results")
        fig1 = m.plot(forecast)
        add_changepoints_to_plot(fig1.gca(), m, forecast)

        # Plotting the components
        print(country_name + " forecasting components")
        m.plot_components(forecast)
    
 

### Creating Exponential model

In [None]:
def exponential_results(dataset, column_list, days_to_predict=30):
    """Fit an exponential curve to one column and evaluate it day by day.

    dataset         -- source frame
    column_list     -- columns to keep; the second entry is the column to model
    days_to_predict -- last time step (inclusive) at which the curve is evaluated

    The logarithm of the column (0 is kept as 0) is regressed on the row number
    with ordinary least squares, giving value = a * b ** time.

    Prints the first rows of the working frame, the regression summary and the
    estimate at ``days_to_predict`` (the message is in Portuguese: "The
    estimated value of <column> is: ...").

    Returns the list of fitted values for time steps 0..days_to_predict, where
    step 0 is the first row of ``dataset``.
    """
    import statsmodels.api as sm

    t = days_to_predict
    column = column_list[1]

    # reset_index()/rename() return new frames, so the caller's data (often a
    # slice of a bigger frame) is never modified.
    br_reg_data = dataset[column_list].reset_index().rename(columns={'index':'Orgiginal_index'})

    br_reg_data['Time'] = np.arange(len(br_reg_data))
    br_reg_data['Log_Confirmed'] = [np.log(i) if i!= 0 else 0 for i in br_reg_data[column]]

    print(br_reg_data.head())

    X = sm.add_constant(br_reg_data.Time)
    y = br_reg_data.Log_Confirmed

    result = sm.OLS(y, X).fit()
    print(result.summary())

    # params is indexed by name; .iloc picks intercept and slope by position
    # without relying on the deprecated positional fallback of Series[...].
    a = np.exp(result.params.iloc[0])
    b = np.exp(result.params.iloc[1])

    lista = [a * b ** i for i in np.arange(t+1)]

    estimate = a * (b ** t)

    print('\nO valor de ' + column + ' estimado é de: '+ str(int(estimate)))

    return lista
    
    
    

#### World Forecast

In [None]:
prophet_predictions(world, 'World')

In [None]:
prophet_predictions(world, 'World','Confirmed')

In [None]:
prophet_predictions(world, 'World','Recovered')

In [None]:
prophet_predictions(world, 'World','Active_Cases')

#### Main China Number of Deaths Forecast 

In [None]:
prophet_predictions(china_ds, 'China')

#### Main China Number of Confirmed Cases Forecast 

In [None]:
prophet_predictions(china_ds, 'China','Confirmed')

In [None]:
prophet_predictions(china_ds, 'China','Recovered')

#### Italy Number of Deaths Forecast 

In [None]:
exp_predicao = exponential_results(italy_ds, column_list=['Date','Deaths'], days_to_predict=50)

In [None]:
plt.figure(figsize=(20,8))
#plt.plot(italy_ds.Deaths)
plt.plot(exp_predicao)

In [None]:
prophet_predictions(italy_ds, 'Italy')

#### Number of Confirmed Cases Forecast in Italy 

In [None]:
exponential_results(italy_ds, column_list=['Date','Confirmed'], days_to_predict=40)

In [None]:
prophet_predictions(italy_ds, 'Italy','Confirmed')

In [None]:
prophet_predictions(italy_ds, 'Italy','Recovered')

#### France Forecast

In [None]:
prophet_predictions(france_ds, 'France','Confirmed')

In [None]:
prophet_predictions(france_ds, 'France','Deaths')

In [None]:
prophet_predictions(france_ds, 'France','Recovered')

#### Spain

In [None]:
prophet_predictions(spain_ds, 'Spain','Confirmed')

In [None]:
prophet_predictions(spain_ds, 'Spain','Deaths')

In [None]:
prophet_predictions(spain_ds, 'Spain','Recovered')

#### Portugal Forecast

In [None]:
prophet_predictions(portugal_ds, 'Portugal','Confirmed')

In [None]:
prophet_predictions(portugal_ds, 'Portugal','Deaths')

In [None]:
prophet_predictions(portugal_ds, 'Portugal','Recovered')

#### Number of Confirmed Cases Forecast in Brazil

### Trying exponential model to predict 

In [None]:
brazil_ds.head()

In [None]:
exponential_results(brazil_ds, column_list=['Date','Confirmed'], days_to_predict=30)

In [None]:
prophet_predictions(brazil_ds[1:], 'Brazil','Confirmed')

In [None]:
prophet_predictions(brazil_ds[1:], 'Brazil','Deaths')

In [None]:
prophet_predictions(brazil_ds[1:], 'Brazil','Recovered')

#### Number of Death Cases Forecast in Canada 

In [None]:
prophet_predictions(canada_ds, 'Canada','Deaths')

#### Number of Confirmed Cases Forecast in Canada 

In [None]:
prophet_predictions(canada_ds, 'Canada','Confirmed')


#### Canada recovered cases

In [None]:
prophet_predictions(canada_ds, 'Canada','Recovered')

#### US

In [None]:
prophet_predictions(us_ds, 'US','Deaths')

In [None]:
prophet_predictions(us_ds, 'US','Confirmed')

In [None]:
prophet_predictions(us_ds, 'US','Recovered')

In [None]:
provinces.head()

In [None]:
us_provinces = provinces[provinces['Country/Region'] == 'US']

In [None]:
# us_provinces has one row per date and state. The counts are treated as running
# totals elsewhere in this notebook (the country ranking takes .max()), so adding
# them up across dates would count the same cases many times; take the maximum instead.
sum_us_provinces = us_provinces.groupby('Province/State')[['Confirmed','Deaths','Recovered']].max()

In [None]:
sum_us_provinces.reset_index(inplace=True)

In [None]:
sum_us_provinces.sort_values('Confirmed', ascending=False)[['Province/State','Confirmed','Deaths','Recovered']]

In [None]:
provinces = sum_us_provinces.sort_values('Confirmed', ascending=False)[['Province/State','Confirmed','Deaths','Recovered']]

In [None]:
provinces [provinces['Province/State'] == 'Maryland']

In [None]:
sum_us_provinces.sort_values('Deaths', ascending=False)[:18][['Province/State','Confirmed','Deaths','Recovered']]

In [None]:
sum_us_provinces.sort_values('Recovered', ascending=False)[:18][['Province/State','Confirmed','Deaths','Recovered']]

### Let's start using statistical thinking to test (or reject) some hypotheses

In [None]:
#Install the dc-stat-think library
!pip install dc-stat-think

In [None]:
import dc_stat_think as dcst

### Comparing results by the total population of each country

##### China population: [China population](https://www.worldometers.info/world-population/china-population/)
##### Italy population: [Italy population](https://www.worldometers.info/world-population/italy-population/)



In [None]:
italy_population =  60487480
china_population =  1437710279

italy_demographic_density = italy_population / 301339
china_demographic_density = china_population / 9596960

In [None]:
italy_confirmed_mean = (np.mean(italy_ds.Confirmed) /  italy_population) * 100
china_confirmed_mean = (np.mean(china_ds.Confirmed[:len(italy_ds)]) / china_population) * 100

print('Populational Proportion of the Mean confirmed cases in Italy: ' + str(italy_confirmed_mean) + '\nPopulational Proportion of the Mean confirmed cases in China (equal period of Italy): ' + str(china_confirmed_mean) )

##### The proportion in Italy is alarming. Of course China has a much larger population than Italy, but comparing the proportion of each population over the same period, Italy's situation is worse than China's

In [None]:
italy_confirmed_std = (np.std(italy_ds.Confirmed) /  italy_population) * 100 
china_confirmed_std = (np.std(china_ds.Confirmed[:len(italy_ds)]) / china_population) * 100

In [None]:
# The values come from np.std, so they are standard deviations, not variances.
print('Confirmed standard deviation in Italy: ' + str(italy_confirmed_std) + '\nConfirmed standard deviation in China (equal period of Italy): ' + str(china_confirmed_std) )

In [None]:
# Mean death count as a percentage of the population. Stored under *_death_mean
# so that the confirmed-case means computed earlier are not overwritten.
italy_death_mean = (np.mean(italy_ds.Deaths) /  italy_population) * 100
# China is limited to as many rows as Italy has, to compare equal periods.
china_death_mean = (np.mean(china_ds.Deaths[:len(italy_ds)]) / china_population) * 100

print('Populational Proportion of the Mean deaths cases in Italy: ' + str(italy_death_mean) + '\nPopulational Proportion of the Mean deaths cases in China (equal period of Italy): ' + str(china_death_mean) )

#### It seems that the growth rate in Italy is higher than in China (over an equal period); the best way to check this is to compare the confidence intervals of the slope

### Comparing Brazil with China in equal period
[Brazil territory area](https://en.wikipedia.org/wiki/Geography_of_Brazil)

In [None]:
brazil_population =   212119183

brazil_demographic_density = brazil_population / 8514215

In [None]:
brazil_confirmed_mean = (np.mean(brazil_ds.Confirmed) / brazil_population ) * 100 
china_confirmed_mean = (np.mean(china_ds.Confirmed[:len(brazil_ds)]) / china_population) * 100

print('Mean of confirmed cases in Brazil: ' + str(brazil_confirmed_mean) + '\nMean of confirmed cases in China (in equal period of Brazil): ' + str(china_confirmed_mean) )

In [None]:
# NOTE(review): bare hard-coded number, not tied to any variable — presumably pasted from an earlier output; confirm or remove
0.000012728693189432095

In [None]:
brazil_confirmed_std = (np.std(brazil_ds.Confirmed) / brazil_population ) * 100 
china_confirmed_std = (np.std(china_ds.Confirmed[:len(brazil_ds)]) / china_population) * 100

print('Variation of confirmed cases in Brazil: ' + str(brazil_confirmed_std) + '\nVariation of confirmed cases in China (in equal period of Brazil): ' + str(china_confirmed_std) )

In [None]:
# NOTE(review): compares two hard-coded numbers rather than the variables computed above — presumably pasted from an earlier output; confirm they are still current
0.0000211590020223276 < 0.0009185691144710906

In [None]:
brazil_confirmed_mean = (np.mean(brazil_ds.Confirmed) / brazil_population ) * 100 
italy_confirmed_mean = (np.mean(italy_ds.Confirmed[:len(brazil_ds)]) / italy_population) * 100

print('Mean of confirmed cases in Brazil: ' + str(brazil_confirmed_mean) + '\nMean of confirmed cases in Italy (in equal period of Brazil): ' + str(italy_confirmed_mean) )

In [None]:
brazil_confirmed_mean > italy_confirmed_mean

In [None]:
# Mean death count as a percentage of the population.
brazil_death_mean = (np.mean(brazil_ds.Deaths) / brazil_population ) * 100 
# Italy is limited to as many rows as Brazil has, to compare equal periods.
italy_death_mean = (np.mean(italy_ds.Deaths[:len(brazil_ds)]) / italy_population) * 100

# The Brazil label used to say "confirmed" although the value is a death mean.
print('Mean of death cases in Brazil: ' + str(brazil_death_mean) + '\nMean of death cases in Italy (in equal period of Brazil): ' + str(italy_death_mean) )

In [None]:
brazil_death_mean > italy_death_mean

In [None]:
brazil_demographic_density, italy_demographic_density

In [None]:
# Difference of demographic density between Brazil and Italy
italy_demographic_density - brazil_demographic_density

In [None]:
italy_demographic_density / brazil_demographic_density

In [None]:
# Mean death count divided by demographic density (x100). Stored under new names
# so that the population-based death means computed earlier are not overwritten.
# NOTE(review): 175.8 looks like the Italy-minus-Brazil density gap, which would
# rate Brazil at Italy's density — confirm.
brazil_death_per_density = (np.mean(brazil_ds.Deaths) / (brazil_demographic_density + 175.8) ) * 100 
italy_death_per_density = (np.mean(italy_ds.Deaths[:len(brazil_ds)]) / (italy_demographic_density)  ) * 100

# The Brazil label used to say "confirmed" although the value is based on deaths.
print('Mean of death cases in Brazil: ' + str(brazil_death_per_density) + '\nMean of death cases in Italy (in equal period of Brazil): ' + str(italy_death_per_density) )

### List Data Set Analyses

In [None]:
df_list = pd.read_csv('/kaggle/input/novel-corona-virus-2019-dataset/COVID19_line_list_data.csv')

In [None]:
df_list.info()

In [None]:
# Drop the 'Unnamed: N' columns that come with the CSV.
# errors='ignore' skips any that are absent, so the cell can be re-run and does
# not fail if the file's column layout shifts (an earlier revision of this cell
# dropped 'Unnamed: 33' .. 'Unnamed: 44' instead).
df_list.drop(columns=['Unnamed: 21','Unnamed: 22','Unnamed: 23','Unnamed: 24','Unnamed: 25','Unnamed: 26'],
             errors='ignore', inplace=True)

In [None]:
# Ten-year age buckets. pd.cut intervals are closed on the right, so 10 falls in
# '0-10', 20 in '11-20', and so on; everything above 80 is '81 and more'.
age_labels = ['0-10', '11-20', '21-30', '31-40', '41-50', '51-60', '61-70', '71-80', '81 and more']
age_edges = [-np.inf, 10, 20, 30, 40, 50, 60, 70, 80, np.inf]
age_buckets = pd.cut(df_list.age, bins=age_edges, labels=age_labels)

# A missing age fails every comparison, so the previous if/elif chain counted it
# as '81 and more'. Missing ages now get their own 'unknown' bucket.
age_range = age_buckets.astype(object).where(df_list.age.notna(), 'unknown').tolist()
        

In [None]:
df_list['Age_Range'] = age_range

In [None]:
# Mark missing 'death' values with -1. Assigning the result back updates df_list
# itself; an in-place fillna on a selected column may only change a temporary copy.
df_list['death'] = df_list['death'].fillna(-1)

In [None]:
# Keep the rows whose 'death' value is not the -1 marker. The copy makes
# ages_fillup independent, so columns can be added to it without writing into a
# slice of df_list.
ages_fillup = df_list[df_list['death'] != -1].copy()

In [None]:
# Values that are exactly '0' or '1' are kept; every other value becomes '1'
# (presumably those hold a date of death — TODO confirm).
death_raw = ages_fillup.death
is_plain_flag = (death_raw == '0') | (death_raw == '1')
ages_fillup['DEATHS'] = death_raw.where(is_plain_flag, '1')

In [None]:
ages_fillup['DEATHS'] = ages_fillup['DEATHS'].astype('int64')

In [None]:
age_freq = pd.DataFrame(ages_fillup.groupby('Age_Range')['DEATHS'].sum())

In [None]:
total_deaths = np.sum(age_freq.DEATHS)
print(total_deaths)


In [None]:
# Share of all counted deaths that falls in each age range, in percent.
age_freq['DEATH_PROPORTION'] = (age_freq.DEATHS / total_deaths) * 100

In [None]:
age_freq['DEATH_PROPORTION']

In [None]:
gender_freq = pd.DataFrame(ages_fillup.groupby('gender')['DEATHS'].sum())

In [None]:
(gender_freq / total_deaths) * 100

In [None]:
gender_freq = pd.DataFrame(ages_fillup.groupby(['gender','Age_Range'])['DEATHS'].sum())

In [None]:
gender_freq

### Exploring the summary column
    - Which kinds of words are most frequent?
    - Could they show us the main symptoms?

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 2))

In [None]:
def exclude_numbs(dataset, column):
    """Placeholder with no implementation: it does nothing and returns None.

    NOTE(review): the name suggests it was meant to strip numbers from
    ``column`` of ``dataset`` before vectorizing — confirm the intent, then
    implement or remove it.
    """
    pass

In [None]:
X = vectorizer.fit_transform(df_list.summary.fillna('no_values'))

In [None]:
# Newer scikit-learn releases replaced get_feature_names() with
# get_feature_names_out(); use whichever the installed version provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

In [None]:
X.toarray()

In [None]:
df_list.head()

In [None]:
# df_open_list = pd.read_csv('/kaggle/input/novel-corona-virus-2019-dataset/COVID19_open_line_list.csv')

In [None]:
# df_open_list.drop(columns=['Unnamed: 33','Unnamed: 34','Unnamed: 35','Unnamed: 36','Unnamed: 37','Unnamed: 38','Unnamed: 39','Unnamed: 40',\
#                         'Unnamed: 41','Unnamed: 42','Unnamed: 43','Unnamed: 44'], inplace=True)