In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sklearn import preprocessing
from sklearn import metrics

# Load the competition's train and test tables from a single data directory.
DATA_DIR = '/kaggle/input/covid19-global-forecasting-week-2'
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only appends a stray "None" to the output.
train_df.info()
test_df.info()

In [None]:
# Show the first rows of the train table, then of the test table.
display(train_df.head(), test_df.head())

In [None]:
# Count missing values in every training column.
display(train_df.isna().sum())


In [None]:
# Rows per country, restricted to rows that have a Province_State value.
has_province = train_df['Province_State'].notnull()
train_df.loc[has_province, 'Country_Region'].value_counts()

In [None]:
# Rows per country, restricted to rows whose Province_State is missing.
lacks_province = train_df['Province_State'].isnull()
train_df.loc[lacks_province, 'Country_Region'].value_counts()

In [None]:
# Summary statistics of the training Date column.
train_df.loc[:, 'Date'].describe()

In [None]:
# Smallest value in the test Date column.
test_df.loc[:, 'Date'].min()

In [None]:
# Summary statistics of the test Date column.
test_df.loc[:, 'Date'].describe()

In [None]:
# Per-country totals of ConfirmedCases and Fatalities.
# Assumes both columns are cumulative counts per location (as the max()
# aggregation implies), so a location's maximum is its latest total.
# The maximum is taken per (country, province) first and the provinces are then
# summed; a plain per-country max() would report only the single largest
# province for countries that are broken down by Province_State.
# Missing Province_State is filled with '' because groupby drops NaN keys.
metric_cols = ['ConfirmedCases', 'Fatalities']
show_cum = (
    train_df
    .assign(Province_State=train_df['Province_State'].fillna(''))
    .groupby(['Country_Region', 'Province_State'])[metric_cols].max()
    .groupby(level='Country_Region').sum()
    .reset_index()
)

# Top 30 countries by confirmed cases, countries with zero cases excluded.
top_confirmed = (
    show_cum[show_cum['ConfirmedCases'] != 0]
    .sort_values(by='ConfirmedCases', ascending=False)
    .head(30)
)
plt.figure(figsize=(20, 10))
sns.barplot(x='ConfirmedCases', y='Country_Region', data=top_confirmed)

In [None]:
# Top 30 countries by fatalities, countries with zero fatalities excluded.
nonzero_fatal = show_cum.loc[show_cum['Fatalities'] != 0]
top_fatal = nonzero_fatal.sort_values(by='Fatalities', ascending=False).head(30)
plt.figure(figsize=(20, 10))
sns.barplot(x='Fatalities', y='Country_Region', data=top_fatal)

# **Pre-process Data for ANN**

In [None]:
# Replace the training Date string by "Days": whole days elapsed since 2020-01-21.
# The difference is computed with vectorised datetime arithmetic. Parsing the
# text form of a timedelta (str(td).split()[0]) fails for a zero-day offset,
# whose text is '0:00:00' and cannot be converted to int.
REFERENCE_DATE = pd.Timestamp('2020-01-21')
days_df = (pd.to_datetime(train_df['Date'], format='%Y-%m-%d') - REFERENCE_DATE).dt.days

train_df['Days'] = days_df
# Date is now encoded in Days and Id is not used as a feature.
train_df.drop(['Date','Id'],axis=1,inplace=True)

# Missing values become the placeholder string '0'.
train_df.fillna('0',inplace=True)

In [None]:
# Replace the test Date string by "Days": whole days elapsed since 2020-01-21.
# The difference is computed with vectorised datetime arithmetic. Parsing the
# text form of a timedelta (str(td).split()[0]) fails for a zero-day offset,
# whose text is '0:00:00' and cannot be converted to int.
REFERENCE_DATE = pd.Timestamp('2020-01-21')
days_df = (pd.to_datetime(test_df['Date'], format='%Y-%m-%d') - REFERENCE_DATE).dt.days

test_df['Days'] = days_df
# Keep a full copy (ForecastId and Date included) for building the submission.
after_use = test_df.copy()
test_df.drop(['Date','ForecastId'],axis=1,inplace=True)

# Missing values become the placeholder string '0'.
test_df.fillna('0',inplace=True)

In [None]:
enc = preprocessing.OneHotEncoder()
enc.fit(train_df[['Province_State','Country_Region']])
enc_cntry_pvstate = enc.transform(train_df[['Province_State','Country_Region']]).toarray()

In [None]:
enc_test = preprocessing.OneHotEncoder()
enc_test.fit(test_df[['Province_State','Country_Region']])
enc_cntry_pvstate_test = enc_test.transform(test_df[['Province_State','Country_Region']]).toarray()

In [None]:
# Swap the raw location strings for columns 0 and 1 of the encoded matrix.
# NOTE(review): this only carries location identity if enc_cntry_pvstate holds
# one code column per feature; with a one-hot matrix columns 0 and 1 are just
# two indicator columns. Confirm against the encoding cell.
train_df = train_df.drop(columns=['Province_State', 'Country_Region']).assign(
    Province_State=enc_cntry_pvstate[:, 0],
    Country_Region=enc_cntry_pvstate[:, 1],
)

display(train_df.tail(), train_df.describe())

# Targets: one label array per model.
train_label_cc = train_df['ConfirmedCases'].to_numpy()
train_label_fa = train_df['Fatalities'].to_numpy()

# Inputs: the fatality model additionally receives ConfirmedCases.
base_features = ['Days', 'Province_State', 'Country_Region']
train_data_cc = train_df[base_features]
train_data_fa = train_df[base_features + ['ConfirmedCases']]
display(sns.distplot(train_label_cc, bins=10))


In [None]:
# Swap the raw location strings for columns 0 and 1 of the encoded test matrix.
# NOTE(review): this only carries location identity if enc_cntry_pvstate_test
# holds one code column per feature; with a one-hot matrix columns 0 and 1 are
# just two indicator columns. Confirm against the encoding cell.
test_df = test_df.drop(columns=['Province_State', 'Country_Region']).assign(
    Province_State=enc_cntry_pvstate_test[:, 0],
    Country_Region=enc_cntry_pvstate_test[:, 1],
)

# Inputs of the confirmed-cases model, in the same column order as for training.
test_data_cc = test_df[['Days', 'Province_State', 'Country_Region']]

# Deep Learning - time series - ANN

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras import layers

# Report the installed TensorFlow version.
print(tf.version.VERSION)

In [None]:
def build_model(cc_input_size, hidden_units=(3, 3, 3, 2, 2), learning_rate=0.001):
    """Build and compile a small fully connected regression network.

    Args:
        cc_input_size: Input shape of one sample, e.g. ``[3]`` for three features.
        hidden_units: Width of each hidden ReLU layer, in order. The default
            reproduces the original 3-3-3-2-2 stack.
        learning_rate: Learning rate handed to the RMSprop optimizer.

    Returns:
        A compiled ``keras.Sequential`` model with a single linear output unit,
        trained on mean squared error and reporting ``mae`` and ``mse``.
    """
    layer_sizes = list(hidden_units)
    stack = []
    for position, units in enumerate(layer_sizes):
        # Only the first layer of the stack declares the input shape.
        shape_kwargs = {'input_shape': cc_input_size} if position == 0 else {}
        stack.append(layers.Dense(units, activation='relu', **shape_kwargs))

    # Linear output unit; it declares the input shape itself when there are no
    # hidden layers at all.
    head_kwargs = {} if layer_sizes else {'input_shape': cc_input_size}
    stack.append(layers.Dense(1, **head_kwargs))

    model = keras.Sequential(stack)
    optimizer = tf.keras.optimizers.RMSprop(learning_rate)

    model.compile(loss='mse',
                  optimizer=optimizer,
                  metrics=['mae', 'mse'])
    return model

In [None]:
# Train the confirmed-cases model.
# The input width is read from the feature frame instead of being hard-coded.
cc_input_size = [train_data_cc.shape[1]]
model_cc = build_model(cc_input_size)

EPOCHS = 400

# Keras takes the validation_split fraction from the END of the arrays as they
# are passed in, before any shuffling. Shuffle the rows once with a fixed seed
# so the held-out 30% is a random sample rather than the tail of the table.
shuffled_rows = np.random.RandomState(0).permutation(len(train_data_cc))

history = model_cc.fit(
  train_data_cc.iloc[shuffled_rows], train_label_cc[shuffled_rows],
  epochs=EPOCHS, validation_split = 0.3, verbose=2)

# Per-epoch metrics as a table; the plotting cells read `hist`.
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
display(hist.tail())

In [None]:
# Training MAE per epoch; `hist` holds the confirmed-cases history at this point.
# The y axis is the error metric, so it is labelled as such (the old label was
# misspelled and named the target instead of the metric).
hist[['mae']].plot()
plt.xlabel('Epoch')
plt.ylabel('Confirmed Cases - mean absolute error')

In [None]:
# Training MSE per epoch; `hist` holds the confirmed-cases history at this point.
# The y axis is the error metric, so it is labelled as such (the old label was
# misspelled and named the target instead of the metric).
hist[['mse']].plot()
plt.xlabel('Epoch')
plt.ylabel('Confirmed Cases - mean squared error')

In [None]:
# Train Fatality Model

# The input width is read from the feature frame instead of being hard-coded.
cc_input_size = [train_data_fa.shape[1]]
model_fa = build_model(cc_input_size)

EPOCHS = 500

# Keras takes the validation_split fraction from the END of the arrays as they
# are passed in, before any shuffling. Shuffle the rows once with a fixed seed
# so the held-out 30% is a random sample rather than the tail of the table.
shuffled_rows = np.random.RandomState(0).permutation(len(train_data_fa))

history = model_fa.fit(
  train_data_fa.iloc[shuffled_rows], train_label_fa[shuffled_rows],
  epochs=EPOCHS, validation_split = 0.3, verbose=2)

# Per-epoch metrics as a table; the plotting cells read `hist`.
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
display(hist.tail())

In [None]:
# Training MAE per epoch; `hist` holds the fatality-model history at this point.
# The old label was copied from the confirmed-cases plots and was wrong here.
hist[['mae']].plot()
plt.xlabel('Epoch')
plt.ylabel('Fatalities - mean absolute error')

In [None]:
# Training MSE per epoch; `hist` holds the fatality-model history at this point.
# The old label was copied from the confirmed-cases plots and was wrong here.
hist[['mse']].plot()
plt.xlabel('Epoch')
plt.ylabel('Fatalities - mean squared error')

In [None]:
# Predict confirmed cases for every test row.
test_predictions_cc = model_cc.predict(test_data_cc)
# Despite its name this is only an alias of the feature frame; nothing is scaled.
normed_test_data = test_data_cc

In [None]:
# Fatality predictions: the three base features plus the confirmed-cases
# prediction, in the same column order the fatality model was trained on.
# .copy() makes an independent frame, so adding a column neither triggers
# SettingWithCopyWarning nor depends on whether the slice is a view of test_df.
test_data_fa = test_df[['Days','Province_State','Country_Region']].copy()
# The prediction array is flattened to one value per row before it becomes a column.
test_data_fa['ConfirmedCases'] = np.ravel(test_predictions_cc)
normed_test_data = test_data_fa
test_predictions = model_fa.predict(normed_test_data)

In [None]:
# Assemble the submission table row by row position.
# Predictions are flattened to 1-D arrays instead of being wrapped in
# DataFrames, so nothing depends on index alignment with after_use.
# The models end in a linear unit and can output negative numbers; counts
# cannot be negative, so predictions are clipped at 0.
submit_df = pd.DataFrame({
    'ForecastId': after_use['ForecastId'].to_numpy(),
    'ConfirmedCases': np.clip(np.ravel(test_predictions_cc), 0, None),
    'Fatalities': np.clip(np.ravel(test_predictions), 0, None),
})
submit_df.info()

In [None]:
# Write the submission file without the DataFrame index.
submit_df.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
# Preview the first five submission rows.
submit_df.head(5)