## Modelling COVID-19 with climate, mobility, and economic features
By building a general regression model, we study how climate, mobility, and economic features affect the future number of COVID-19 cases.

In [None]:
import pandas as pd
from itertools import product
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import plotly.express as px
import shap
import warnings
warnings.filterwarnings("ignore")

## COVID-19 Data

In [None]:
def compute_wk_num(input_date):
    """Return a week index that keeps increasing from 2020 into 2021.

    Dates outside calendar year 2021 keep their week-of-year as reported by
    ``input_date.week``.  Dates in calendar year 2021 are shifted past the
    53 ISO weeks of 2020, so ISO week 1 of 2021 becomes 54 instead of
    colliding with week 53 of 2020.  1-3 January 2021 are reported as week 53
    (they belong to the last ISO week of 2020) and are left untouched.

    Args:
        input_date: object exposing ``year`` and ``week`` attributes, e.g. a
            ``pandas.Timestamp``.

    Returns:
        The (possibly shifted) week number.
    """
    weeks_in_2020 = 53  # 2020 is a 53-week ISO year
    wk_num = input_date.week
    if input_date.year == 2021 and wk_num < 53:
        wk_num += weeks_in_2020

    return wk_num

In [None]:
# Load the COVID-19 observations CSV into df_covid.
df_covid = pd.read_csv("../input/novel-corona-virus-2019-dataset/covid_19_data.csv")

In [None]:
# Show how many rows each Country/Region value has.
df_covid["Country/Region"].value_counts()

In [None]:
#read COVID-19 data and some clean up

df_covid = pd.read_csv("../input/novel-corona-virus-2019-dataset/covid_19_data.csv")
df_covid['date'] = pd.to_datetime(df_covid['ObservationDate'])
# .copy() so the column assignments below act on an independent frame,
# not on a filtered view of the freshly read table
df_covid = df_covid[df_covid["Province/State"] != "Unknown"].copy()
df_covid['Country/Region'] = df_covid['Country/Region'].replace('Mainland China', 'China')
# rows whose Province/State is missing get a missing key here, so they never
# match a kept place below and are left out of df_covid_daily
df_covid["province_country"] = df_covid["Province/State"] + " " + df_covid["Country/Region"]

#count the entries for each place, use to filter out places with few data

df_count_entries = df_covid["province_country"].value_counts().to_frame().reset_index()
df_count_entries.columns = ["province_country","count"]
df_count_entries = df_count_entries[df_count_entries["count"] > 40]

#compute daily cases, data records accumulated numbers

# built once so the membership test inside the loop is O(1)
kept_places = set(df_count_entries["province_country"])
cumulative_to_daily = {
    "Confirmed": "daily_confirmed",
    "Deaths": "daily_deaths",
    "Recovered": "daily_recovered",
}

# sort=False keeps the places in order of first appearance
daily_frames = []
for place_name, current_place in df_covid.groupby("province_country", sort=False):
    if place_name not in kept_places:
        continue

    # the day-over-day difference is only meaningful in chronological order;
    # a stable sort leaves already-ordered rows where they are
    current_place = current_place.sort_values("date", kind="mergesort")
    for total_col, daily_col in cumulative_to_daily.items():
        current_place[daily_col] = current_place[total_col].diff()

    daily_frames.append(current_place)

# a single concat replaces the repeated DataFrame.append (removed in pandas 2.0)
df_covid_daily = pd.concat(daily_frames)

# the first row of each place has no previous day; use its cumulative value
for total_col, daily_col in cumulative_to_daily.items():
    df_covid_daily[daily_col] = df_covid_daily[daily_col].fillna(df_covid_daily[total_col])

#Compute week numbers

df_covid_daily["wk_num"] = [compute_wk_num(d) for d in df_covid_daily["date"]]
# fold the running week index back to a week-of-year key for the temperature
# merge; a remainder of 0 is mapped to week 1
df_covid_daily["wk_num_52"] = df_covid_daily["wk_num"]%53
df_covid_daily['wk_num_52'] = df_covid_daily['wk_num_52'].replace(0, 1)

## Temperature Data

In [None]:
df_temperature = pd.read_csv("../input/daily-temperature-of-major-cities/city_temperature.csv")
# align country names with the ones used in the COVID-19 table
df_temperature['Country'] = df_temperature['Country'].replace(
    {'United Kingdom': 'UK', 'The Netherlands': 'Netherlands'}
)

#clean temperature data, use the 2019 readings only

# .copy() so the new columns are added to an independent frame rather than
# to a filtered view of df_temperature
df_temperature_hold = df_temperature[df_temperature["Year"]==2019].copy()
# NOTE(review): check whether the file marks missing readings with a sentinel
# value; such rows would distort the weekly mean computed below.
# Fahrenheit -> Celsius
df_temperature_hold["AvgTemperature_c"] = (df_temperature_hold["AvgTemperature"] - 32)*(5/9)
date_text = (
    df_temperature_hold["Year"].astype(str) + "-"
    + df_temperature_hold["Month"].astype(str) + "-"
    + df_temperature_hold["Day"].astype(str)
)
df_temperature_hold["date"] = pd.to_datetime(date_text)

#compute weekly average for each country, prep for merge

df_temperature_hold["wk_num_52"] = [compute_wk_num(d) for d in df_temperature_hold["date"]]
df_temperature_hold = df_temperature_hold[df_temperature_hold["wk_num_52"]<=52]
df_temperature_hold = df_temperature_hold.groupby(["Country","wk_num_52"]).agg({'AvgTemperature_c': 'mean'}).reset_index()
df_temperature_hold.columns = ["Country/Region", "wk_num_52", "AvgTemperature_c"]

In [None]:
# attach the weekly mean temperature of each country to every COVID-19 row
df_covid_daily = df_covid_daily.merge(
    df_temperature_hold, on=["Country/Region", "wk_num_52"], how='left'
)

#remove since no temperature data
countries_without_temperature = ["Peru", "Chile", "Macau"]
has_temperature = ~df_covid_daily["Country/Region"].isin(countries_without_temperature)
df_covid_daily = df_covid_daily[has_temperature]

## Mobility Data

In [None]:
df_mobility = pd.read_csv("../input/mobility-country/mobility_country.csv")
df_mobility["date"] = pd.to_datetime(df_mobility["date"])

# align country names with the ones used in the COVID-19 table
df_mobility['country_region'] = df_mobility['country_region'].replace(
    {'United Kingdom': 'UK', 'United States': 'US'}
)

df_mobility["wk_num"] = [compute_wk_num(d) for d in df_mobility["date"]]

df_mobility = df_mobility.drop(["date","country_region_code","sub_region_1","sub_region_2","metro_area","iso_3166_2_code",\
                                "census_fips_code","place_id"], axis=1)

mobility_cols = ["retail_and_recreation","grocery_and_pharmacy","parks","transit_stations",
                 "workplaces","residential"]
# positional rename: relies on the remaining columns being the country,
# the six mobility measures and wk_num, in this order
df_mobility.columns = ["Country/Region"] + mobility_cols + ["wk_num"]

# weekly mean of every mobility measure per country, in one pass
df_mobility_wk = df_mobility.groupby(["Country/Region","wk_num"])[mobility_cols].mean().reset_index()

df_covid_daily = pd.merge(df_covid_daily,df_mobility_wk,on=["Country/Region","wk_num"],how='left')
# weeks without mobility data count as 0; only the mobility columns are
# filled so missing values in other columns (e.g. temperature) are not
# silently turned into 0
df_covid_daily[mobility_cols] = df_covid_daily[mobility_cols].fillna(0)

## Vaccination Data

In [None]:
df_vac = pd.read_csv("../input/covid-world-vaccination-progress/country_vaccinations.csv")
df_vac["date"] = pd.to_datetime(df_vac["date"])
# align country names with the ones used in the COVID-19 table; the UK
# mapping mirrors the temperature, mobility and GDP cells
df_vac['country'] = df_vac['country'].replace({'United States': 'US', 'United Kingdom': 'UK'})
df_vac["wk_num"] = [compute_wk_num(d) for d in df_vac["date"]]
# .copy() so the assignments below act on an independent frame
df_vac = df_vac[["wk_num","country","people_fully_vaccinated_per_hundred","people_vaccinated_per_hundred"]].copy()
df_vac.columns = ["wk_num","Country/Region","fully_vac_per100", "vac_per100"]
vac_cols = ["fully_vac_per100", "vac_per100"]
# NOTE(review): missing values are zero-filled before the weekly mean, so a
# week with gaps averages in zeros - confirm this is intended.
df_vac[vac_cols] = df_vac[vac_cols].fillna(0)
df_full_vac_wk = df_vac.groupby(["Country/Region","wk_num"]).agg({'fully_vac_per100': 'mean'}).reset_index()
df_vac_wk = df_vac.groupby(["Country/Region","wk_num"]).agg({'vac_per100': 'mean'}).reset_index()

df_covid_daily = pd.merge(df_covid_daily,df_full_vac_wk,on=["Country/Region","wk_num"],how='left')
df_covid_daily = pd.merge(df_covid_daily,df_vac_wk,on=["Country/Region","wk_num"],how='left')
# weeks without vaccination data count as 0; only the vaccination columns
# are filled so missing values elsewhere are not silently turned into 0
df_covid_daily[vac_cols] = df_covid_daily[vac_cols].fillna(0)

## Density and Urban pop % Data

In [None]:
df_population = pd.read_csv("../input/population-by-country-2020/population_by_country_2020.csv")
# .copy() so the rename/replace below act on an independent frame
df_population = df_population[["Country (or dependency)", "Density (P/Km²)"]].copy()
df_population.columns = ["Country/Region", "density"]
# use the same short names as the COVID-19 table, as the other data cells do;
# the replacement is a no-op if the file already uses the short names
df_population["Country/Region"] = df_population["Country/Region"].replace(
    {'United Kingdom': 'UK', 'United States': 'US'}
)
# left merge: countries missing from the population file keep a missing density
df_covid_daily = pd.merge(df_covid_daily,df_population,on=["Country/Region"],how='left')

## GDP data

In [None]:
df_gdp = pd.read_csv("../input/gdp-per-capita-all-countries/GDP.csv")
# map the names used in the GDP file to the ones used in the COVID-19 table
# (the source column name really ends with a space)
gdp_country_names = {
    'United Kingdom': 'UK',
    'United States': 'US',
    'Hong Kong SAR, China': 'Hong Kong',
    'Russian Federation': 'Russia',
}
df_gdp['Country '] = df_gdp['Country '].replace(gdp_country_names)
# keep only the 2018 figure and give both columns their merge-ready names
df_gdp = df_gdp[["Country ", "2018"]].rename(
    columns={"Country ": "Country/Region", "2018": "2018_gdp"}
)
df_covid_daily = df_covid_daily.merge(df_gdp, on=["Country/Region"], how='left')

In [None]:
#label encoding for country/state

# each text column becomes categorical and its integer codes are stored
# in a companion column
for text_col, code_col in (("province_country", "place"), ("Country/Region", "country")):
    df_covid_daily[text_col] = df_covid_daily[text_col].astype('category')
    df_covid_daily[code_col] = df_covid_daily[text_col].cat.codes

In [None]:
# Pair each text label with its integer code (one row per record at this point).
df_country_code = df_covid_daily[["Country/Region", "country"]]
df_place_code = df_covid_daily[["province_country", "place"]]

In [None]:
# keep a single row per label/code pair
df_country_code = df_country_code.drop_duplicates()
df_place_code = df_place_code.drop_duplicates()

In [None]:
#get country/state map

# one row per (place code, country code) pair
df_place_country = df_covid_daily[["place", "country"]].drop_duplicates()

In [None]:
# Create "grid" with columns

index_cols = ['place', 'wk_num']


def _week_block(week):
    """Return an int32 array of (place, week) rows for the places seen in `week`."""
    places_in_week = df_covid_daily.loc[df_covid_daily['wk_num'] == week, 'place'].unique()
    weeks = np.full(len(places_in_week), week)
    return np.column_stack([places_in_week, weeks]).astype('int32')


# one block per week, in order of first appearance of the week
week_blocks = [_week_block(week) for week in df_covid_daily['wk_num'].unique()]

grid = pd.DataFrame(np.vstack(week_blocks), columns=index_cols, dtype=np.int32)

In [None]:
# merge weekly sum of place

# total of the daily confirmed cases per (week, place)
weekly_place_cases = (
    df_covid_daily.groupby(['wk_num', 'place'])['daily_confirmed'].sum().reset_index()
)
# grid rows without a matching total get 0
df_w = grid.merge(weekly_place_cases, on=['wk_num', 'place'], how='left').fillna(0)
# add the country code of every place
df_w = df_w.merge(df_place_country, on=["place"], how='left')

In [None]:
# merge weekly sum of country

# total of the daily confirmed cases per (week, country); despite the
# column name this is a weekly total
df_c = (
    df_covid_daily.groupby(['wk_num', 'country'])['daily_confirmed']
    .sum()
    .reset_index()
    .rename(columns={'daily_confirmed': 'daily_confirmed_country'})
)
df_w = df_w.merge(df_c, on=['wk_num', 'country'], how='left')

In [None]:
# merge weekly average temperature

# distinct (week, country, temperature) combinations
df_temp = df_covid_daily[['wk_num', 'country', 'AvgTemperature_c']].drop_duplicates()
df_w = df_w.merge(df_temp, on=['wk_num', 'country'], how='left')
# keep whole degrees only
df_w["AvgTemperature_c"] = df_w["AvgTemperature_c"].round()

In [None]:
# merge mobility

# distinct weekly mobility values per country (rebinds df_mobility)
mobility_feature_cols = [
    'wk_num', 'country',
    'retail_and_recreation', 'grocery_and_pharmacy', 'parks',
    'transit_stations', 'workplaces', 'residential',
]
df_mobility = df_covid_daily[mobility_feature_cols].drop_duplicates()
df_w = df_w.merge(df_mobility, on=['wk_num', 'country'], how='left')

In [None]:
# merge vaccination

# distinct weekly vaccination values per country (rebinds df_vac)
df_vac = df_covid_daily[['wk_num', 'country', 'fully_vac_per100', 'vac_per100']].drop_duplicates()
df_w = df_w.merge(df_vac, on=['wk_num', 'country'], how='left')

In [None]:
# merge pop

# distinct density value per country code
df_pop = df_covid_daily[['country', 'density']].drop_duplicates()
df_w = df_w.merge(df_pop, on=['country'], how='left')

In [None]:
# merge gdp

# distinct 2018 GDP value per country code (rebinds df_gdp)
df_gdp = df_covid_daily[['country', '2018_gdp']].drop_duplicates()
df_w = df_w.merge(df_gdp, on=['country'], how='left')

In [None]:
#compute lag features

lag_variables  = ['daily_confirmed','daily_confirmed_country', 'AvgTemperature_c',
                  'retail_and_recreation','grocery_and_pharmacy','parks','transit_stations','workplaces','residential',
                  'fully_vac_per100','vac_per100']
lags = [3]
key_cols = ['place', 'wk_num']

# we will keep the results in this dataframe
covid_means = df_w.copy()
for lag in lags:
    # values observed in week w are re-keyed to week w + lag, so after the
    # merge each row carries what was observed `lag` weeks earlier
    shifted = df_w[key_cols + lag_variables].copy()
    shifted['wk_num'] = shifted['wk_num'] + lag
    shifted = shifted.rename(
        columns={name: name + '_lag_' + str(lag) for name in lag_variables}
    )
    covid_means = covid_means.merge(shifted, on=key_cols, how='left')

# case counts (current and lagged) without a value are treated as 0
confirmed_cols = [name for name in covid_means.columns if 'daily_confirmed' in name]
covid_means[confirmed_cols] = covid_means[confirmed_cols].fillna(0)

# weeks 16..72 train the model, weeks from 73 on are held out
in_train = (covid_means['wk_num'] > 15) & (covid_means['wk_num'] < 73)
in_cv = covid_means['wk_num'] >= 73

Y_train = covid_means.loc[in_train, 'daily_confirmed']
Y_cv = covid_means.loc[in_cv, 'daily_confirmed']

#delete answer and non-lagging features
X_train = covid_means[in_train].drop(columns=lag_variables)
X_cv = covid_means[in_cv].drop(columns=lag_variables)

In [None]:
# One panel per country: weekly confirmed cases (left axis, red) against the
# share of fully vaccinated people (right axis, green).
figure_size = (40, 20)
# rcParams only affects figures created after it is set, so the size is also
# passed to subplots() explicitly; the rcParams default is still updated so
# later figures keep the same default as before
plt.rcParams["figure.figsize"] = figure_size

n_rows = 4
country_codes = sorted(set(covid_means["country"]))
# at least the original 6 columns, more if the countries do not fit in 4 x 6
n_cols = max(6, -(-len(country_codes) // n_rows))
fig, axs = plt.subplots(n_rows, n_cols, figsize=figure_size)

for position, i in enumerate(country_codes):

    country = df_country_code[df_country_code["country"] == i]["Country/Region"].iloc[0]
    one_country = covid_means[covid_means["country"] == i][
        ["wk_num","daily_confirmed_country","fully_vac_per100"]
    ].drop_duplicates()

    # panels are filled column by column
    ax = axs[position % n_rows, position // n_rows]
    # weekly country-level case total on the left axis
    ax.plot(one_country["wk_num"], one_country["daily_confirmed_country"], color='red', label = 'daily confirmed')
    ax.tick_params(axis='y', labelcolor='red')
    ax.set_title(country)
    # second y-axis sharing the same x-axis
    ax2 = ax.twinx()

    # fully vaccinated people per hundred on the right axis
    ax2.plot(one_country["wk_num"], one_country["fully_vac_per100"], color='green', label = 'fully vac per 100 pax')
    ax2.tick_params(axis='y', labelcolor='green')

    ax.legend(loc = 'upper left')
    ax2.legend(loc = 'upper right')

plt.show()


## Train Model

In [None]:
#get baseline prediction

# naive forecast: the value observed 3 weeks earlier is used as the prediction
covid_test = covid_means[covid_means['wk_num'] >= 73]
preds = covid_test.copy()

observed_cases = preds['daily_confirmed']
lagged_cases = preds['daily_confirmed_lag_3']

baseline_mae = mean_absolute_error(observed_cases, lagged_cases)
baseline_rmse = np.sqrt(mean_squared_error(observed_cases, lagged_cases))

print(baseline_mae)

In [None]:
# Fit a gradient-boosted regressor with 15 trees on the training weeks,
# predict the held-out weeks and score the prediction with the MAE.
model_xgb = XGBRegressor(n_estimators=15)
model_xgb.fit(X_train, Y_train)
predict_xgb = model_xgb.predict(X_cv)
mae_xgb = mean_absolute_error(Y_cv,predict_xgb)
# bare expression: shown as the cell output
mae_xgb

In [None]:
# Table of the model's feature importances, largest first, shown as a bar chart.
feature_importances = pd.DataFrame({'col': X_train.columns,'imp':model_xgb.feature_importances_})
feature_importances = feature_importances.sort_values(by='imp',ascending=False)
px.bar(feature_importances,x='col',y='imp')

## SHAP value analysis

In [None]:
# Compute SHAP values of the fitted model on the training features.
# X_importance is another name for X_train, not a copy.
X_importance  = X_train
explainer = shap.TreeExplainer(model_xgb)
# the additivity check is switched off for this call
shap_values = explainer.shap_values(X_importance,check_additivity=False)

In [None]:
# SHAP summary plot over all features of X_importance
shap.summary_plot(shap_values, X_importance)

In [None]:
# Same SHAP summary, drawn as a bar plot
shap.summary_plot(shap_values, X_importance, plot_type='bar')

In [None]:
# SHAP dependence plot for the 3-week-lagged grocery/pharmacy mobility feature
shap.dependence_plot("grocery_and_pharmacy_lag_3", shap_values, X_importance, interaction_index=None)

In [None]:
# SHAP dependence plot for the 3-week-lagged transit-station mobility feature
shap.dependence_plot("transit_stations_lag_3", shap_values, X_importance, interaction_index=None)

In [None]:
# SHAP dependence plot for the 3-week-lagged average temperature feature
shap.dependence_plot("AvgTemperature_c_lag_3", shap_values, X_importance, interaction_index=None)

In [None]:
# SHAP dependence plot for the 3-week-lagged fully-vaccinated-per-100 feature
shap.dependence_plot("fully_vac_per100_lag_3", shap_values, X_importance, interaction_index=None)

In [None]:
# Calling the explainer rebinds shap_values to the object it returns for X_train.
shap_values = explainer(X_train)

# visualize the explanation of the row at position 100 (not the first row)
shap.plots.waterfall(shap_values[100])