# DataCommunitySA Challenge - Italy data

In [25]:
import os, glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.linear_model import LinearRegression
import plotly.graph_objects as go

### Loading Covid-19 data 

In [73]:
data = pd.read_csv('COVID-19/dati-regioni/dpc-covid19-ita-regioni.csv')
data.head()

Unnamed: 0,data,stato,codice_regione,denominazione_regione,lat,long,ricoverati_con_sintomi,terapia_intensiva,totale_ospedalizzati,isolamento_domiciliare,totale_attualmente_positivi,nuovi_attualmente_positivi,dimessi_guariti,deceduti,totale_casi,tamponi
0,2020-02-24 18:00:00,ITA,13,Abruzzo,42.351222,13.398438,0,0,0,0,0,0,0,0,0,5
1,2020-02-24 18:00:00,ITA,17,Basilicata,40.639471,15.805148,0,0,0,0,0,0,0,0,0,0
2,2020-02-24 18:00:00,ITA,4,P.A. Bolzano,46.499335,11.356624,0,0,0,0,0,0,0,0,0,1
3,2020-02-24 18:00:00,ITA,18,Calabria,38.905976,16.594402,0,0,0,0,0,0,0,0,0,1
4,2020-02-24 18:00:00,ITA,15,Campania,40.839566,14.25085,0,0,0,0,0,0,0,0,0,10


### loading weather data from 20 file with taking the temp_avg and humidity_avg

In [32]:
regions = []
temps = []
humidity = []

path = "ITA_weather/*.csv"
for filename in glob.glob(path):
     regions.append(filename[12:-4])
     temps.append(pd.read_csv(filename).loc[50:, 'T2M'].mean())
     humidity.append(pd.read_csv(filename).loc[50:, 'RH2M'].mean())

ITA = pd.DataFrame()
ITA['Region'] = regions
ITA['temp_avg'] = temps
ITA['humidity_avg'] = humidity
ITA.head()

Unnamed: 0,Region,temp_avg,humidity_avg
0,Abruzzo,5.83,78.543929
1,Basilicata,7.777143,73.881786
2,Calabria,12.501429,71.865
3,Campania,12.0625,74.2
4,EmiliaRomagna,7.225,80.240357


### Data Cleaning
- renameing some columns.
- keeping the coulmns that I need.

In [16]:
data = data.rename(columns={'denominazione_regione':'Region', 'ricoverati_con_sintomi':'with_symptoms', 'terapia_intensiva':'intensive Care', 'totale_ospedalizzati':'total_hospitalised_patients', 'isolamento_domiciliare':'home_confinement', 'totale_attualmente_positivi':'current_positive_cases', 'dimessi_guariti':'Recovered', 'deceduti':'Deaths', 'totale_casi':'Confirmed', 'tamponi':'test_performed', 'data':'date'})
c_data = data.groupby('Region').max().reset_index()[['Region', 'Confirmed', 'Recovered', 'Deaths']]
c_data.head()

Unnamed: 0,Region,Confirmed,Recovered,Deaths
0,Abruzzo,587,15,33
1,Basilicata,81,0,0
2,Calabria,273,5,8
3,Campania,936,41,29
4,Emilia Romagna,7555,349,816


### Merging the dataframes

In [35]:
m_data = pd.merge(c_data, ITA).sort_values(by='Confirmed', ascending=False)
m_data.head()

Unnamed: 0,Region,Confirmed,Recovered,Deaths,temp_avg,humidity_avg
6,Lombardia,27206,5865,3456,7.69,73.657143
18,Veneto,5122,309,169,9.150714,76.733571
11,Piemonte,4420,10,283,4.285,72.107857
7,Marche,2421,6,184,11.997857,70.3325
15,Toscana,2277,42,91,7.042857,83.071786


### Calculating growth rate

In [72]:
data = data[['date', 'Region', 'Confirmed', 'Recovered', 'Deaths']]
arr = []
arr1 = []
for i in range(28):
    arr.append(0)
    arr1.append(0.0)

## Seperating the regions from the data.
data_list = []
for r in c_data['Region']:
    data_list.append(data[data['Region'] == r].reset_index())
    
## Creating new_cases column. 
for i in range(len(data_list)):
    data_list[i]['New_cases'] = arr
    data_list[i]['Growth_rate'] = arr1
    
for i in range(len(data_list)):
    for j in range(len(data_list[i])-1):
        data_list[i].loc[j+1, 'New_cases'] = data_list[i].loc[j+1, 'Confirmed'] - data_list[i].loc[j, 'Confirmed']

        
for i in range(len(data_list)):
    for j in range(len(data_list[i])-1):
        if(data_list[i].loc[j, 'Confirmed'] == 0):
            data_list[i].loc[j+1, 'Growth_rate'] = data_list[i].loc[j+1, 'New_cases']
        else:
            data_list[i].loc[j+1, 'Growth_rate'] = data_list[i].loc[j+1, 'New_cases'] / data_list[i].loc[j, 'Confirmed']

            
data_list[20].head()

Unnamed: 0,index,date,Region,Confirmed,Recovered,Deaths,New_cases,Growth_rate
0,20,2020-02-24 18:00:00,Veneto,33,0,1,0,0.0
1,41,2020-02-25 18:00:00,Veneto,43,0,1,10,0.30303
2,62,2020-02-26 18:00:00,Veneto,71,0,2,28,0.651163
3,83,2020-02-27 18:00:00,Veneto,111,0,2,40,0.56338
4,104,2020-02-28 18:00:00,Veneto,151,0,2,40,0.36036


### Getting Avg of Growth_rate

In [67]:
gr = pd.DataFrame()
mar = []
for i in range(len(data_list)):
    mar.append(data_list[i][10:39])

for i in range(len(data_list)):
    gr.loc[i,'Region'] = mar[i].loc[10,'Region']
    gr.loc[i, 'growth_avg'] = mar[i].loc[:,'Growth_rate'].mean()

with_growth = m_data.merge(gr)
with_growth.head()

Unnamed: 0,Region,Confirmed,Recovered,Deaths,temp_avg,humidity_avg,growth_avg
0,Lombardia,27206,5865,3456,7.69,73.657143,0.164459
1,Veneto,5122,309,169,9.150714,76.733571,0.160538
2,Piemonte,4420,10,283,4.285,72.107857,0.25906
3,Marche,2421,6,184,11.997857,70.3325,0.208976
4,Toscana,2277,42,91,7.042857,83.071786,0.261597


### Data Visualization

In [70]:
x = with_growth

reg = LinearRegression().fit(np.vstack(x['growth_avg']), x['temp_avg'])
x['bestfit'] = reg.predict(np.vstack(x['growth_avg']))
fig=go.Figure()
fig.add_trace(go.Scatter(name='growth_avg vs humidity_avg', x=x['growth_avg'], y=x['temp_avg'], mode='markers'))
fig.add_trace(go.Scatter(name='line of best fit', x=x['growth_avg'], y=x['bestfit'], mode='lines'))
fig.update_layout(title="growth_avg vs temp_avg", xaxis_title="growth_avg", yaxis_title="temp_avg")
fig.update_layout(title="growth_avg vs humidity_avg (Italy)", xaxis_title="growth_avg", yaxis_title="humidity_avg", template='plotly_dark')
fig.show()



reg = LinearRegression().fit(np.vstack(x['growth_avg']), x['humidity_avg'])
x['bestfit'] = reg.predict(np.vstack(x['growth_avg']))
fig=go.Figure()
fig.add_trace(go.Scatter(name='growth_avg vs humidity_avg', x=x['growth_avg'], y=x['humidity_avg'], mode='markers'))
fig.add_trace(go.Scatter(name='line of best fit', x=x['growth_avg'], y=x['bestfit'], mode='lines'))
fig.update_layout(title="growth_avg vs humidity_avg (Italy)", xaxis_title="growth_avg", yaxis_title="humidity_avg", template='plotly_dark')
fig.show()

## Resourses
- regional_data.csv: https://www.kaggle.com/bsridatta/covid-19-italy-updated-regularly#regional_data.csv
- ITA_weather: https://power.larc.nasa.gov/data-access-viewer/