# DataCommunitySA Challenge - China data

In [1]:
import os, glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.linear_model import LinearRegression
import plotly.graph_objects as go

### loading Covid-19 data 

In [2]:
data = pd.read_csv('covid_19_data.csv')

data.head()

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,1,01/22/2020,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,2,01/22/2020,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0
2,3,01/22/2020,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0
3,4,01/22/2020,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
4,5,01/22/2020,Gansu,Mainland China,1/22/2020 17:00,0.0,0.0,0.0


### Data Cleaning

In [3]:
data = data.rename(columns={'Province/State':'Region'})

c_data = data[data['Country/Region'].str.contains('China')]
c_data = c_data[['Region', 'Confirmed', 'Recovered', 'Deaths']].groupby('Region').max()
c_data = c_data.sort_values(by='Confirmed', ascending=False).reset_index()
c_data.head()

Unnamed: 0,Region,Confirmed,Recovered,Deaths
0,Hubei,67800.0,58382.0,3133.0
1,Guangdong,1395.0,1323.0,8.0
2,Henan,1273.0,1250.0,22.0
3,Zhejiang,1234.0,1219.0,1.0
4,Hunan,1018.0,1014.0,4.0


### loading weather data from 20 file with taking the temp_avg and humidity_avg

In [4]:
regions = []
temps = []
humidity = []

path = "CHA_weather/*.csv"
for filename in glob.glob(path):
     regions.append(filename[12:-4])
     temps.append(pd.read_csv(filename, skiprows=15).loc[33:62, 'T2M'].mean())
     humidity.append(pd.read_csv(filename, skiprows=15).loc[33:62, 'RH2M'].mean())

CHA = pd.DataFrame()
CHA['Region'] = regions
CHA['temp_avg'] = temps
CHA['humidity_avg'] = humidity
CHA.head()

Unnamed: 0,Region,temp_avg,humidity_avg
0,Anhui,7.867,79.384333
1,Beijing,-1.483,63.625
2,Chongqing,8.750667,81.868
3,Fujian,11.679667,82.149667
4,Gansu,-1.333667,57.576667


### Merging the dataframes

In [5]:
m_data = pd.merge(c_data, CHA)
m_data.head()

Unnamed: 0,Region,Confirmed,Recovered,Deaths,temp_avg,humidity_avg
0,Hubei,67800.0,58382.0,3133.0,8.564333,79.908333
1,Guangdong,1395.0,1323.0,8.0,17.982667,72.097333
2,Henan,1273.0,1250.0,22.0,6.114667,71.649
3,Zhejiang,1234.0,1219.0,1.0,8.807667,82.739333
4,Hunan,1018.0,1014.0,4.0,9.115667,85.834


### Data Visualization

In [35]:
x = m_data[1:]

reg = LinearRegression().fit(np.vstack(x['Confirmed']), x['temp_avg'])
x['bestfit'] = reg.predict(np.vstack(x['Confirmed']))
fig=go.Figure()
fig.add_trace(go.Scatter(name='Confirmed vs temp_avg', x=x['Confirmed'], y=x['temp_avg'], mode='markers'))
fig.add_trace(go.Scatter(name='line of best fit', x=x['Confirmed'], y=x['bestfit'], mode='lines'))
fig.show()


reg = LinearRegression().fit(np.vstack(x['Confirmed']), x['humidity_avg'])
x['bestfit'] = reg.predict(np.vstack(x['Confirmed']))
fig=go.Figure()
fig.add_trace(go.Scatter(name='Confirmed vs humidity_avg', x=x['Confirmed'], y=x['humidity_avg'], mode='markers'))
fig.add_trace(go.Scatter(name='line of best fit', x=x['Confirmed'], y=x['bestfit'], mode='lines'))
fig.show()

# fig = px.bar(x, y="Confirmed", x="Region", title='Confirmed vs Region')
# fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



### Calculating Growth Rate

In [8]:
arr = []
arr1 = []
for i in range(53):
    arr.append(0)
    arr1.append(0.0)

## Seperating the regions from the data.
data_list = []
for r in c_data['Region']:
    data_list.append(data[data['Region'] == r][:53].reset_index())

data_list[14].loc[50, 'Confirmed'] = 318
data_list[22].loc[50, 'Confirmed'] = 127
    
## Creating new_cases column. 
for i in range(len(data_list)):
    data_list[i]['New_cases'] = arr
    data_list[i]['Growth_rate'] = arr1
    
for i in range(len(data_list)-1):
    for j in range(len(data_list[i])-1):
        data_list[i].loc[j+1, 'New_cases'] = data_list[i].loc[j+1, 'Confirmed'] - data_list[i].loc[j, 'Confirmed']

        
for i in range(len(data_list)-1):
    for j in range(len(data_list[i])-1):
        if(data_list[i].loc[j, 'Confirmed'] == 0):
            data_list[i].loc[j+1, 'Growth_rate'] = data_list[i].loc[j+1, 'New_cases']
        else:
            data_list[i].loc[j+1, 'Growth_rate'] = data_list[i].loc[j+1, 'New_cases'] / data_list[i].loc[j, 'Confirmed']

In [9]:
data_list[5].head()

Unnamed: 0,index,SNo,ObservationDate,Region,Country/Region,Last Update,Confirmed,Deaths,Recovered,New_cases,Growth_rate
0,0,1,01/22/2020,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0,0.0,0.0
1,38,39,01/23/2020,Anhui,Mainland China,1/23/20 17:00,9.0,0.0,0.0,8.0,8.0
2,95,96,01/24/2020,Anhui,Mainland China,1/24/20 17:00,15.0,0.0,0.0,6.0,0.666667
3,131,132,01/25/2020,Anhui,Mainland China,1/25/20 17:00,39.0,0.0,0.0,24.0,1.6
4,176,177,01/26/2020,Anhui,Mainland China,1/26/20 16:00,60.0,0.0,0.0,21.0,0.538462


### Getting Avg of growth_rate

In [16]:
gr = pd.DataFrame()
feb = []
for i in range(len(data_list)):
    feb.append(data_list[i][10:39])

for i in range(len(data_list)):
    gr.loc[i,'Region'] = feb[i].loc[10,'Region']
    gr.loc[i, 'growth_avg'] = feb[i].loc[:,'Growth_rate'].mean()
    

### Merge it with the data

In [32]:
with_growth = m_data.merge(gr)
with_growth.head()

Unnamed: 0,Region,Confirmed,Recovered,Deaths,temp_avg,humidity_avg,growth_avg
0,Hubei,67800.0,58382.0,3133.0,8.564333,79.908333,0.094678
1,Guangdong,1395.0,1323.0,8.0,17.982667,72.097333,0.041294
2,Henan,1273.0,1250.0,22.0,6.114667,71.649,0.047053
3,Zhejiang,1234.0,1219.0,1.0,8.807667,82.739333,0.02896
4,Hunan,1018.0,1014.0,4.0,9.115667,85.834,0.04079


### Data Visualization

In [46]:
x = with_growth[1:]

reg = LinearRegression().fit(np.vstack(x['growth_avg']), x['temp_avg'])
x['bestfit'] = reg.predict(np.vstack(x['growth_avg']))
fig=go.Figure()
fig.add_trace(go.Scatter(name='growth_avg vs humidity_avg', x=x['growth_avg'], y=x['temp_avg'], mode='markers'))
fig.add_trace(go.Scatter(name='line of best fit', x=x['growth_avg'], y=x['bestfit'], mode='lines'))
fig.update_layout(title="growth_avg vs temp_avg (China)", xaxis_title="growth_avg", yaxis_title="temp_avg", template='plotly_dark')
fig.show()



reg = LinearRegression().fit(np.vstack(x['growth_avg']), x['humidity_avg'])
x['bestfit'] = reg.predict(np.vstack(x['growth_avg']))
fig=go.Figure()
fig.add_trace(go.Scatter(name='growth_avg vs humidity_avg', x=x['growth_avg'], y=x['humidity_avg'], mode='markers'))
fig.add_trace(go.Scatter(name='line of best fit', x=x['growth_avg'], y=x['bestfit'], mode='lines'))
fig.update_layout(title="growth_avg vs humidity_avg (China)", xaxis_title="growth_avg", yaxis_title="humidity_avg", template='plotly_dark')
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



### Resourses
- covid_19_data.csv: https://www.kaggle.com/sudalairajkumar/novel-corona-virus-2019-dataset#covid_19_data.csv
- CHA_weather :https://power.larc.nasa.gov/data-access-viewer/