In [None]:
import pandas as pd
import numpy as np

In [None]:
# --- Load the competition data and the auxiliary country-level tables ---------
PATH_WEEK = '/kaggle/input/covid19-global-forecasting-week-2'
df_train = pd.read_csv(f'{PATH_WEEK}/train.csv')
df_test = pd.read_csv(f'{PATH_WEEK}/test.csv')

# skiprows=4: the first four lines of these two files are skipped before the
# header row is read.
df_hospital_beds = pd.read_csv(
    r"/kaggle/input/hospital-beds/API_SH.MED.BEDS.ZS_DS2_en_csv_v2_887506.csv",
    skiprows=4,
)

# The literal string "N.A." in the population file is parsed as missing (NaN).
df_population2 = pd.read_csv(
    "/kaggle/input/populationdata/population_by_country_2020.csv",
    na_values="N.A.",
)

df_environment_pm2 = pd.read_csv(
    "/kaggle/input/environmentpm25/API_EN.ATM.PM25.MC.M3_DS2_en_csv_v2_888986.csv",
    skiprows=4,
)

# Give every table the same join-key names: 'Country' (and 'State' for the
# competition files).
competition_renames = {'Country_Region': 'Country', 'Province_State': 'State'}
df_train = df_train.rename(columns=competition_renames)
df_test = df_test.rename(columns=competition_renames)
df_hospital_beds = df_hospital_beds.rename(columns={'Country Name': 'Country'})
df_environment_pm2 = df_environment_pm2.rename(columns={'Country Name': 'Country'})

# Positional rename of the population columns. Assigning `.columns` replaces
# `set_axis(..., inplace=True)`, whose `inplace` argument newer pandas releases
# no longer accept; like set_axis, the assignment raises if the file does not
# have exactly eleven columns.
# NOTE(review): the names are applied by position -- confirm the column order
# of the CSV matches this list.
df_population2.columns = [
    "Country", "Population", "YearlyChange",
    "NetChange", "Density", "LandArea",
    "Migrants", "FertilityRate", "MedAge",
    "UrbanPop", "WorldShare",
]

# `infer_datetime_format` is deprecated in current pandas and is therefore no
# longer passed; the 'Date' strings are parsed with the default inference.
df_train['Date'] = pd.to_datetime(df_train['Date'])
df_test['Date'] = pd.to_datetime(df_test['Date'])

df_train.info()

In [None]:
# Lookup table of country-name spellings -> replacement spelling. It is applied
# with `Series.replace` to the 'Country' column of the auxiliary tables before
# they are merged on 'Country'. A few entries map a name to itself and are
# therefore no-ops.
country_names = {
    "Bahamas, The": "Bahamas",
    "Brunei Darussalam": "Brunei",
    "DR Congo": "Congo (Kinshasa)",
    "Côte d'Ivoire": "Cote d'Ivoire",
    "Congo": "Congo (Brazzaville)",
    "Congo, Rep.": "Congo (Brazzaville)",
    "Congo, Dem. Rep.": "Congo (Kinshasa)",
    "Czech Republic (Czechia)": "Czechia",
    "Czech Republic": "Czechia",
    "Diamond Princess": "Diamond Princess",
    "Egypt, Arab Rep.": "Egypt",
    "Gambia, The": "Gambia",
    "Holy See": "Holy See",
    "Iran, Islamic Rep.": "Iran",
    "Korea, Rep.": "Korea, South",
    "South Korea": "Korea, South",
    "Kyrgyz Republic": "Kyrgyzstan",
    "Lao PDR": "Laos",
    "Russian Federation": "Russia",
    "St. Kitts and Nevis": "Saint Kitts and Nevis",
    "Saint Kitts & Nevis": "Saint Kitts and Nevis",
    "St. Lucia": "Saint Lucia",
    "St. Vincent and the Grenadines": "Saint Vincent and the Grenadines",
    "St. Vincent & Grenadines": "Saint Vincent and the Grenadines",
    "Serbia": "Serbia",
    "Slovak Republic": "Slovakia",
    "Syrian Arab Republic": "Syria",
    "Taiwan": "Taiwan*",
    "United States": "US",
    "Venezuela, RB": "Venezuela",
}

# Harmonise the country spellings of the three auxiliary tables.
# The replaced column is assigned back to the frame instead of calling
# `Series.replace(..., inplace=True)` on an attribute-accessed column: that
# chained in-place form only mutates a temporary object when pandas
# Copy-on-Write is active, silently leaving the frame unchanged.
for aux_table in (df_population2, df_hospital_beds, df_environment_pm2):
    aux_table["Country"] = aux_table["Country"].replace(country_names)

# Left joins keep every train/test row; countries missing from an auxiliary
# table get NaN in the added columns.
df_train = pd.merge(df_train, df_population2, on="Country", how="left")
df_test = pd.merge(df_test, df_population2, on="Country", how="left")

# Only the column labelled '2011' of the hospital-beds table is used, exposed
# under the name 'HospitalBeds'.
df_hospital_beds = df_hospital_beds.rename(columns={'2011': 'HospitalBeds'})
hospital_beds = df_hospital_beds[["Country", "HospitalBeds"]]
df_train = pd.merge(df_train, hospital_beds, on="Country", how="left")
df_test = pd.merge(df_test, hospital_beds, on="Country", how="left")

# Likewise only the column labelled '2017' of the PM2.5 table, as 'PM25'.
df_environment_pm2 = df_environment_pm2.rename(columns={'2017': 'PM25'})
pm25 = df_environment_pm2[["Country", "PM25"]]
df_train = pd.merge(df_train, pm25, on="Country", how="left")
df_test = pd.merge(df_test, pm25, on="Country", how="left")


In [None]:
# Day number of each row: days since 1970-01-01 as a float, shifted so that the
# earliest training date is 0. Dividing a timedelta by one day avoids both
# `astype(int)` (whose width is platform dependent) and the hard-coded
# nanoseconds-per-day divisor (which assumes nanosecond datetime storage).
ONE_DAY = pd.Timedelta(days=1)
EPOCH = pd.Timestamp(0)

df_train['NumDate'] = (df_train.Date - EPOCH) / ONE_DAY
first_date = df_train.NumDate.min()
df_train['NumDate'] = df_train.NumDate - first_date

# The test set is shifted by the *training* origin so both share one time axis.
df_test['NumDate'] = (df_test.Date - EPOCH) / ONE_DAY - first_date

# Per country: the first day number with ConfirmedCases > 0.
outbreak_dates = (
    df_train.loc[df_train.ConfirmedCases > 0, ['Country', 'NumDate']]
    .groupby('Country', as_index=False)
    .min()
)
outbreak_dates.columns = ['Country', 'FirstOutbreak']

# Per country: the first day number with Fatalities > 0.
first_death = (
    df_train.loc[df_train.Fatalities > 0, ['Country', 'NumDate']]
    .groupby('Country', as_index=False)
    .min()
)
first_death.columns = ['Country', 'FirstDeath']

# The join key is spelled out rather than inferred from the shared columns.
# FirstOutbreak stays NaN here for countries without any confirmed case.
df_train = pd.merge(df_train, outbreak_dates, on='Country', how='left')
df_test = pd.merge(df_test, outbreak_dates, on='Country', how='left')

# Countries without a recorded death get FirstDeath = 0, so for them
# DaysSinceFirstDeath below equals NumDate. The filled column is assigned back
# because a chained `fillna(..., inplace=True)` does not update the frame under
# pandas Copy-on-Write.
df_train = pd.merge(df_train, first_death, on='Country', how='left')
df_train['FirstDeath'] = df_train['FirstDeath'].fillna(0)

df_test = pd.merge(df_test, first_death, on='Country', how='left')
df_test['FirstDeath'] = df_test['FirstDeath'].fillna(0)

df_train['DaysSinceOutbreak'] = df_train.NumDate - df_train.FirstOutbreak
df_test['DaysSinceOutbreak'] = df_test.NumDate - df_test.FirstOutbreak

df_train['DaysSinceFirstDeath'] = df_train.NumDate - df_train.FirstDeath
df_test['DaysSinceFirstDeath'] = df_test.NumDate - df_test.FirstDeath

df_train.head()

In [None]:
# Turn the percent-formatted text columns into fractions: strip trailing '%'
# characters, parse the remainder as float and divide by 100. The same
# conversion is applied to both frames.
for frame in (df_train, df_test):
    for percent_column in ("WorldShare", "UrbanPop", "YearlyChange"):
        as_text = frame[percent_column].str.rstrip('%')
        frame[percent_column] = as_text.astype('float') / 100.0

In [None]:
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt

def log_growth(x, a,r,c):
    """Evaluate the logistic curve ``c / (1 + a * exp(-r * x))``.

    ``x`` may be a scalar or anything NumPy can broadcast; the result has the
    corresponding shape and is always floating point.
    """
    denominator = 1 + a * np.exp(-r * x)
    return 1.0 * c / denominator

def get_log_fit_params(data):
    """Fit ``log_growth`` to one group's confirmed-case series.

    Parameters
    ----------
    data
        Object exposing ``NumDate`` (x values) and ``ConfirmedCases``
        (y values), e.g. the sub-frame handed over by ``groupby(...).apply``.

    Returns
    -------
    The fitted parameters ``(a, r, c)`` as returned by ``curve_fit``. The
    ``"lm"`` solver is tried first and ``"trf"`` second; if both raise
    ``RuntimeError`` an all-zero array of length 3 is returned.

    Notes
    -----
    Previously the ``"trf"`` fallback computed its parameters but never
    returned them, so the function yielded ``None`` whenever ``"lm"`` failed.
    """
    for method in ("lm", "trf"):
        try:
            popt, _pcov = curve_fit(log_growth, data.NumDate, data.ConfirmedCases, method=method)
        except RuntimeError:
            # This solver did not converge; try the next one (if any).
            continue
        return popt
    # Neither solver converged: fall back to zeros as the "no fit" marker.
    return np.zeros(3)

# One logistic fit per country. The result is a Series indexed by 'Country'
# whose values are the fitted parameter triples.
# NOTE(review): the frame also has a 'State' column, so a country with several
# states is fitted on its stacked state-level rows rather than on a country
# total -- confirm this is intended.
df_popt = df_train.groupby("Country").apply(get_log_fit_params)

# Expand each triple into three columns. The columns are named by assignment
# because `set_axis(..., inplace=True)` is not accepted by newer pandas.
df_opt = df_popt.apply(pd.Series)
df_opt.columns = ["a", "r", "c"]
df_opt.head()

In [None]:
# Show the countries whose fitted parameter `a` is strictly positive.
df_opt.loc[df_opt["a"] > 0]

In [None]:
# Attach the per-country fit parameters (a, r, c) to every train and test row;
# the left join keeps all rows of the original frames.
df_train = df_train.merge(df_opt, how="left", on="Country")
df_test = df_test.merge(df_opt, how="left", on="Country")

In [None]:
# Evaluate each row's fitted logistic curve at that row's day number.
# `log_growth` is built from NumPy arithmetic, so it is called once on whole
# columns instead of once per row through `DataFrame.apply(axis=1)`; the values
# are the same, without the per-row Python overhead.
df_train['LogPrediction'] = log_growth(df_train.NumDate, df_train.a, df_train.r, df_train.c)
df_test['LogPrediction'] = log_growth(df_test.NumDate, df_test.a, df_test.r, df_test.c)

In [None]:
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import PolynomialFeatures
import xgboost as xgb

# Every remaining missing value (including missing 'State' labels) becomes 0.
df_train = df_train.fillna(0)
df_test = df_test.fillna(0)

features = ['NumDate', "Population", "YearlyChange",
            "NetChange", "Density", "LandArea",
            "Migrants", "FertilityRate", "MedAge",
            "UrbanPop", "WorldShare", 'FirstOutbreak',
            'DaysSinceOutbreak', 'FirstDeath', 'DaysSinceFirstDeath', 'HospitalBeds', 'PM25',
            'LogPrediction', 'a', 'r', 'c']

# Design matrix: numeric features plus one-hot encodings of Country and State.
X = pd.concat(
    [
        df_train[features],
        pd.get_dummies(df_train.Country, prefix="C_"),
        pd.get_dummies(df_train.State, prefix="S_"),
    ],
    axis=1,
)
y1 = df_train.ConfirmedCases
y2 = df_train.Fatalities

# One regressor per target, with identical hyper-parameters and seed.
fit1 = xgb.XGBRegressor(n_estimators=5000, random_state = 123).fit(X, y1)
fit2 = xgb.XGBRegressor(n_estimators=5000, random_state = 123).fit(X, y2)


def _in_sample_rmsle(model, design_matrix, target):
    """Return the RMSLE of ``model`` on ``(design_matrix, target)``.

    Negative predictions are clipped to 0 first, because the log-error metric
    is undefined for negative values. Arguments are passed to
    ``mean_squared_log_error`` in its documented ``(y_true, y_pred)`` order.
    """
    clipped = [max(value, 0) for value in model.predict(design_matrix)]
    return np.sqrt(mean_squared_log_error(target, clipped))


# These scores are computed on the same rows the models were fitted on, so
# they are training errors, not estimates of forecast accuracy.
# NOTE(review): KFold is imported above but no cross-validation is performed.
error1 = _in_sample_rmsle(fit1, X, y1)
error2 = _in_sample_rmsle(fit2, X, y2)
print(error1)
print(error2)
print((error1+error2)/2)

In [None]:
df_out = df_test[["ForecastId"]].copy()

# Columns of the design matrix the models were fitted on. `X` still holds that
# matrix at this point (or an identically-columned one if this cell is re-run).
train_columns = X.columns

# Build the test design matrix the same way as the training one, then align it
# to the training columns: one-hot columns for categories absent from the test
# set are added as all-zero, columns for categories unseen in training are
# dropped, and the column order matches what the models expect.
X = pd.concat(
    [
        df_test[features],
        pd.get_dummies(df_test.Country, prefix="C_"),
        pd.get_dummies(df_test.State, prefix="S_"),
    ],
    axis=1,
).reindex(columns=train_columns, fill_value=0)

# Negative predictions are clipped to 0.
df_out['ConfirmedCases'] = [max(x,0) for x in fit1.predict(X)]
df_out['Fatalities'] = [max(x,0) for x in fit2.predict(X)]
df_out.tail()

In [None]:
# Write the predictions to the working directory, without the index column.
df_out.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
# Feature importances of the confirmed-cases model, labelled with the column
# names of the current design matrix `X`; display the 20 largest.
feat_importances = pd.Series(data=fit1.feature_importances_, index=X.columns)
feat_importances.sort_values(ascending=False).iloc[:20]

In [None]:
# World-wide predicted totals per date: join the predictions back to the test
# rows via ForecastId, then sum both targets over all rows sharing a date.
prediction_ww = (
    pd.merge(df_test, df_out, on="ForecastId")[["Date", "ConfirmedCases", "Fatalities"]]
    .groupby("Date")
    .sum()
)
# Columns are renamed by assignment because `set_axis(..., inplace=True)` is
# not accepted by newer pandas releases.
prediction_ww.columns = ["PredictedCases", "PredictedFatalities"]
prediction_ww.plot()

In [None]:
# World-wide observed totals per date from the training data.
train_ww = (
    df_train.loc[:, ["Date", "ConfirmedCases", "Fatalities"]]
    .groupby("Date")
    .sum()
)
train_ww.plot()

In [None]:
# Observed and predicted world-wide totals on one date axis; the outer join
# keeps dates that appear in only one of the two tables.
train_ww.merge(prediction_ww, how='outer', on="Date").plot()