# World contamination

Creation of 3 pickle files :

- `df_world.p` : number of contaminated/saved/dead people through the world with location information
- `df_world_fr.p` : adaptation of the France contamination file to have the same structure as the `df_world` dataframe
- `country_position.p` : saving the location of each country in case of further needs

# Imports and settings

In [None]:
import json
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from plotly import graph_objs as go
from plotly.offline import init_notebook_mode, plot, iplot

# Project helpers live outside the notebook folder; extend the path first
sys.path.append('../scripts/')
import utils_covid as f

In [None]:
# Pandas display / warning settings used throughout the notebook
for option_name, option_value in (
    ('chained_assignment', None),
    ('display.max_columns', 500),
):
    pd.set_option(option_name, option_value)

# Plotly offline mode for inline rendering, plus the Mapbox token needed by
# the Scattermapbox figures below
init_notebook_mode(connected=True)
mapbox_access_token = f.load_mapbox_token()
# Alternative kept for reference: px.set_mapbox_access_token(token)

In [None]:
# Load IPython's autoreload extension and select mode 2
# (see the IPython autoreload documentation for the exact reload policy)
%load_ext autoreload
%autoreload 2

# Register the project helper module with autoreload
%aimport utils_covid

# Data Cleaning

__Reusing the France contamination file__

In [None]:
# Load the French contamination data and keep only the three counters that
# also exist in the world dataset.
df_fr = f.load_pickle('df_contamination_fr.p')

# Select the columns in the same order as the world DataFrame
# (confirmed, deaths, recovered) so the positional rename below is correct:
# "total_deces" is the death count and "total_retour_a_domicile" (returned
# home) is the recovered count.
df_fr = df_fr[['cas_confirme', 'total_deces', 'total_retour_a_domicile']]
df_fr.columns = ['confirmed', 'deaths', 'recovered']
df_fr.head()

__Loading and preparing the world pandemic dataset__

In [None]:
# Read the raw worldwide export (semicolon-separated CSV)
path = f.OPENDATA_PATH + 'covid-19-pandemic-worldwide-data.csv'
df = pd.read_csv(path, sep=';')

# A row's subzone is its "Sub Zone" when one is given, otherwise its "Zone"
df['subzone'] = [
    zone if pd.isnull(sub_zone) else sub_zone
    for zone, sub_zone in zip(df['Zone'], df['Sub Zone'])
]

# Lower-case every column name
df.columns = list(map(str.lower, df.columns))

# "location" is a "lat,lon" string: split it into its two parts
# (both parts are kept as strings)
df['lat'] = df['location'].apply(lambda location: location.split(',')[0])
df['lon'] = df['location'].apply(lambda location: location.split(',')[1])

# Keep one (lat, lon) pair per subzone for later map plots
positions = df[['subzone', 'lat', 'lon']].drop_duplicates("subzone")
country_position = positions.set_index('subzone').to_dict(orient='index')
print('France:', country_position['France'])

# Drop the columns that are no longer needed
df = df.drop(['zone', 'sub zone', 'location'], axis=1)

# One row per (date, subzone), one column per category
df = df.pivot_table(values='count', index=['date', 'subzone'], columns=['category'])
# NOTE(review): positional rename; assumes the pivoted categories come out
# in the order confirmed / deaths / recovered - confirm against the data
df.columns = ['confirmed', 'deaths', 'recovered']

df.head()

__Saving__ : both DataFrames have the exact same structure

In [None]:
# Persist the two contamination DataFrames and the subzone -> position mapping
for payload, filename in (
    (df, 'df_world.p'),
    (df_fr, 'df_world_fr.p'),
    (country_position, 'country_position.p'),
):
    f.save_pickle(payload, filename)

# Reload data

In [None]:
# Reload everything saved above, plus the region information produced by the
# contamination part ('region_info_by_id.p')
df, df_fr, country_position, region_info = (
    f.load_pickle(filename)
    for filename in (
        'df_world.p',
        'df_world_fr.p',
        'country_position.p',
        'region_info_by_id.p',
    )
)

In [None]:
# Preview the first rows of the reloaded France DataFrame
df_fr.head()

In [None]:
# Preview the first rows of the reloaded world DataFrame
df.head()

# World map

__For one given day__

(For animation see : https://plotly.com/~empet/14825/scattermapbox-animation-forum-question/#/)

In [None]:
# Day ("jour") and category displayed on the world map
jour, category = '2020-03-27', 'confirmed'

In [None]:
# Keep one day and one category; the index is reset so that "subzone"
# becomes a regular column usable for the position lookup
tmp = df.xs(jour)[[category]].reset_index()

# Attach each subzone's latitude and longitude (stored as strings, hence float)
for coord in ('lat', 'lon'):
    tmp[coord] = tmp['subzone'].apply(
        lambda subzone, coord=coord: country_position[subzone][coord]
    ).apply(float)

# Missing counts are treated as zero
tmp[category] = tmp[category].fillna(0)

tmp.head()

__Map plot__

In [None]:
# Bubble map of the selected category for the selected day.
# `tmp` only contains the column named by `category`, so the marker size must
# be read through `category` and not through a hard-coded "confirmed".
fig = go.Figure(go.Scattermapbox(
        lat=tmp['lat'].values,
        lon=tmp['lon'].values,
        mode='markers',
        marker=go.scattermapbox.Marker(
            # One size unit per 1000 people
            size=(tmp[category] / 1000).values
        ),
        text=tmp['subzone'].values,
    ))

fig.update_layout(
    hovermode='closest',
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        center=go.layout.mapbox.Center(
            lat=45,
            lon=-73
        ),
        pitch=0,
        zoom=1
    )
)
# Remove the margins so the map fills the whole output area
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

# France map

Note that only one category is available for the French dataset by region.

The dataset by department has no information about confirmed cases; it only covers hospitals.

_(As of March 29th 2020)_

In [None]:
# Day ("jour") and category displayed on the France map
jour, category = '2020-03-20', 'confirmed'

In [None]:
# Keep one day and one category of the France data, with "code_region" as a
# regular column
tmp = df_fr.xs(jour)[category].reset_index()

# Attach each region's latitude and longitude from the region information
for coord in ('lat', 'lon'):
    tmp[coord] = tmp['code_region'].apply(
        lambda code, coord=coord: region_info[code][coord]
    )
tmp.head()

__... and Plot !__

In [None]:
# Bubble map of the selected category per French region.
# `tmp` only contains the column named by `category`, so size and hover text
# are read through `category` and not through a hard-coded "confirmed".
fig = go.Figure(go.Scattermapbox(
        lat=tmp['lat'].values,
        lon=tmp['lon'].values,
        mode='markers',
        marker=go.scattermapbox.Marker(
            # One size unit per 50 people
            size=(tmp[category] / 50).values
        ),
        text=tmp[category].values,
    ))

fig.update_layout(
    hovermode='closest',
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        # NOTE(review): same centre/zoom as the world map (lat 45, lon -73);
        # confirm whether the view should be centred on France instead
        center=go.layout.mapbox.Center(
            lat=45,
            lon=-73
        ),
        pitch=0,
        zoom=1
    )
)
# Remove the margins so the map fills the whole output area
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

# Curves of the world's most affected countries

In [None]:
# Use the greatest value of the first index level (date) as the reference day
jour = df.index.levels[0].max()

# The ten subzones with the highest confirmed count on that day
confirmed_on_last_day = df.xs(jour)['confirmed'].sort_values(ascending=False)
max_countries = confirmed_on_last_day.index[:10].tolist()

In [None]:
# Confirmed cases over time for the selected countries, one curve per country
flat = df.reset_index()
selected = flat[flat['subzone'].isin(max_countries)]
tmp = selected.groupby(['subzone', 'date']).sum()

plt.figure(figsize=(20, 5))
for country in max_countries:
    plt.plot(tmp.xs(country)['confirmed'], label=country)
plt.legend()
plt.show()

In [None]:
# Preview the first rows of the world DataFrame
df.head()

In [None]:
# Pad every country's curve with NaN so that all curves share the same length
# (longest curve + 5 extra points).
# NOTE(review): `max_size` and `country_curves` are not defined by any
# top-level cell visible in this notebook (they only appear as locals of
# `plot_from_j`), so this cell fails when the notebook is run from top to
# bottom - confirm whether it can be removed.
print(max_size)
cc = {}
for key, value in country_curves.items():
    # np.nan replaces np.NaN, an alias that was removed in NumPy 2.0
    value = list(value) + [np.nan] * (max_size + 5 - len(value))
    cc[key] = value

In [None]:
def plot_from_j(df, category, nb_people, nb_country):
    """Plot the curves of the most affected countries from a common start.

    For each selected country, only the rows where ``category`` is strictly
    greater than ``nb_people`` are kept, so every curve starts at its first
    kept row and the curves can be compared position by position.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by (date, subzone) with one column per category.
    category : str
        Column to plot (the notebook uses 'confirmed', 'deaths', 'recovered').
    nb_people : int or float
        Threshold; rows with ``category <= nb_people`` are discarded.
    nb_country : int
        Number of countries kept, ranked by ``category`` on the last date.

    Returns
    -------
    pandas.DataFrame
        One column per country, NaN-padded to the longest curve + 5 rows.
    """
    # Reference day: the greatest value of the first index level (date)
    jour = df.index.levels[0].max()
    max_countries = (
        df.xs(jour)[category]
        .sort_values(ascending=False)
        .index[:nb_country]
        .tolist()
    )

    # Values above the threshold, per country
    country_curves = {}
    for country in max_countries:
        curve = df.xs(country, level=1)[category]
        country_curves[country] = curve[curve > nb_people].values

    # Pad with NaN so every column has the same length (longest curve + 5).
    # np.nan replaces np.NaN, an alias that was removed in NumPy 2.0.
    max_size = max((len(values) for values in country_curves.values()), default=0)
    padded_size = max_size + 5
    cc = {
        country: list(values) + [np.nan] * (padded_size - len(values))
        for country, values in country_curves.items()
    }

    covid = pd.DataFrame(cc)
    covid.plot()

    plt.show()
    return covid

In [None]:
# Confirmed cases of the 10 most affected countries, above 100 cases
_ = plot_from_j(df, category='confirmed', nb_people=100, nb_country=10)

In [None]:
# Recovered counts of the 10 countries with the most recoveries, above 10
_ = plot_from_j(df, category='recovered', nb_people=10, nb_country=10)

In [None]:
# Death counts of the 10 countries with the most deaths, above 10
_ = plot_from_j(df, category='deaths', nb_people=10, nb_country=10)

### Bubble size transformation

In [None]:
# Plot the y-versus-x curve.
# NOTE(review): in the visible notebook `x` and `y` are only assigned in a
# later cell, so that cell must be run before this one.
plt.plot(x, y)
plt.show()

In [None]:
# Reload the saved data so this section can be run on its own
df = f.load_pickle('df_world.p')
df_fr = f.load_pickle('df_world_fr.p')
country_position = f.load_pickle('country_position.p')
geo_world_ok = f.load_pickle('geo_world.p')
region_info = f.load_pickle('region_info_by_id.p')

startdate = '2020-03-01'

# Keep only the rows strictly after the start date
# (assumes "date" holds ISO-formatted strings, so string order is date order
# - TODO confirm)
tmp = df.reset_index()
tmp = tmp[tmp['date'] > startdate]

# Attach each subzone's latitude and longitude (stored as strings, hence float)
tmp['lat'] = tmp['subzone'].apply(lambda x: country_position[x]['lat']).apply(float)
tmp['lon'] = tmp['subzone'].apply(lambda x: country_position[x]['lon']).apply(float)

# One row per (date, subzone): counters are summed, the position is kept as is
tmp = tmp.groupby(['date', 'subzone']).agg({
    'confirmed': 'sum',
    'deaths': 'sum',
    'recovered': 'sum',
    'lat': 'last',
    'lon': 'last',
})

# Fill missing values in all three counters (the original filled
# 'confirmed' three times and never touched 'deaths' or 'recovered')
counters = ['confirmed', 'deaths', 'recovered']
tmp[counters] = tmp[counters].fillna(0)



In [None]:
# Histogram of the confirmed counts, to look at their distribution
tmp.reset_index()['confirmed'].hist()

In [None]:
# Build a scale of roughly `n_size` evenly spaced thresholds between 0 and
# the largest confirmed count
max_value = int(tmp['confirmed'].max())
n_size = 100
# The step is at least 1: range()/arange() reject a zero step, which the
# original int(max_value / n_size) produced whenever max_value < n_size
scale_step = max(1, max_value // n_size)
scales = np.arange(0, max_value, scale_step)


def transform_size(x):
    """Return the 1-based position of the threshold of `scales` closest to x.

    A count of exactly 0 maps to 0.
    """
    if x == 0:
        return 0
    return np.argmin(np.abs(scales - x)) + 1


# Square-root curve over the bucket indices (kept as lists for plotting)
x = list(range(n_size + 2))
y = [np.sqrt(i) for i in x]

# Bubble size = 10 * sqrt(bucket index).
# The size is computed from tmp['confirmed'] directly so the result keeps
# tmp's own index: the original went through reset_index(), whose RangeIndex
# does not align with tmp's index on assignment. Using np.sqrt in place of a
# lookup in `y` also avoids an out-of-range index when `scales` has more
# entries than `y`.
tmp['size'] = np.sqrt(tmp['confirmed'].apply(transform_size).astype(float)) * 10

In [None]:
# Display the aggregated frame
tmp