# WorkFlow
## Load the data
## Clean the data
## Modelling

## Load the data

In [39]:
import pandas as pd

In [40]:
data_confirmed_deaths = pd.read_csv('./data/raw/data-1.csv')

In [115]:
import pprint

In [117]:
pprint.pprint(data_confirmed_deaths.columns)

Index(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
       'new_cases_smoothed', 'total_deaths', 'new_deaths',
       'new_deaths_smoothed', 'total_cases_per_million',
       'new_cases_per_million', 'new_cases_smoothed_per_million',
       'total_deaths_per_million', 'new_deaths_per_million',
       'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
       'icu_patients_per_million', 'hosp_patients',
       'hosp_patients_per_million', 'weekly_icu_admissions',
       'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
       'weekly_hosp_admissions_per_million', 'new_tests', 'total_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated', 'new_vaccinations',
       'new_vaccinations_smoothed', 'total_vaccinations_per_hun

In [41]:
data_recovered_active = pd.read_csv('./data/raw/data-0.csv')

In [42]:
data_confirmed_deaths = data_confirmed_deaths[data_confirmed_deaths['location'] == 'Sri Lanka']

In [43]:
data_recovered_active = data_recovered_active[data_recovered_active['Country_Region'] == 'Sri Lanka']

In [44]:
total_cases = list(data_confirmed_deaths['total_cases'])
total_deaths = list(data_confirmed_deaths['total_deaths'])
total_recovered = list(data_recovered_active['Recovered'])
total_active = list(data_recovered_active['Active'])

## Clean the data

In [45]:
len(total_cases) - len(total_recovered)

130

In [46]:
import numpy as np

In [47]:
for _ in range(len(total_cases) - len(total_recovered)):
    total_recovered.append(np.nan)
    total_active.append(np.nan)

In [48]:
len(total_cases) - len(total_recovered)

0

In [49]:
data = pd.DataFrame({'Date':data_confirmed_deaths['date'],'Cases':total_cases,'Deaths':total_deaths,'Recovered':total_recovered,'Active':total_active})

In [50]:
data.isna().sum()

Date           0
Cases          0
Deaths        61
Recovered    130
Active       130
dtype: int64

In [51]:
data['Recovered'] = data['Recovered'].fillna(data['Recovered'].mean())
data['Active'] = data['Active'].fillna(data['Active'].mean())
data['Deaths'] = data['Deaths'].fillna(data['Deaths'].mean())

In [52]:
dates = []
for date in data['Date']:
    dates.append(str(date).replace('-',''))

In [53]:
data['Date'] = dates

## Modelling

In [54]:
data['Date'] = data['Date'].astype(float)

In [55]:
X = data['Date']
y = data.drop('Date',axis=1)

In [56]:
from sklearn.model_selection import *

In [57]:
from sklearn.metrics import *

In [58]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor,AdaBoostRegressor,BaggingRegressor,RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRFRegressor
from catboost import CatBoostRegressor,CatBoost

In [68]:
models = [
    ['KNeighborsRegressor',KNeighborsRegressor],
#     ['GaussianNB',GaussianNB],
    ['GradientBoostingRegressor',GradientBoostingRegressor],
    ['AdaBoostRegressor',AdaBoostRegressor],
    ['BaggingRegressor',BaggingRegressor],
    ['RandomForestRegressor',RandomForestRegressor],
    ['SVR',SVR],
    ['DecisionTreeRegressor',DecisionTreeRegressor]
]

In [69]:
data.columns

Index(['Date', 'Cases', 'Deaths', 'Recovered', 'Active'], dtype='object')

In [70]:
X = np.array(X).reshape(-1,1)

In [71]:
try:
    y = y['Deaths']
except:
    pass

In [72]:
model = AdaBoostRegressor().fit(np.array(X).reshape(-1,1),y)
model.score(np.array(X).reshape(-1,1),y)

0.9862156210051783

In [73]:
model.predict(np.array([20210508]).reshape(1,-1))

array([713.])

In [74]:
results = {}

In [75]:
for model in models:
    print('*'*50)
    model_name = model[0]
    print(model_name)
    model = model[1]()
    model.fit(X,y)
    results[model_name] = model.score(X,np.array(y).reshape(-1,1))
    print('*'*50)

**************************************************
KNeighborsRegressor
**************************************************
**************************************************
GradientBoostingRegressor
**************************************************
**************************************************
AdaBoostRegressor
**************************************************
**************************************************
BaggingRegressor
**************************************************
**************************************************
RandomForestRegressor
**************************************************
**************************************************
SVR
**************************************************
**************************************************
DecisionTreeRegressor
**************************************************


In [76]:
results

{'KNeighborsRegressor': 0.9991567495596736,
 'GradientBoostingRegressor': 0.9989531585040866,
 'AdaBoostRegressor': 0.9863879235282036,
 'BaggingRegressor': 0.9988611564552189,
 'RandomForestRegressor': 0.9989037862603037,
 'SVR': 0.43929033598399947,
 'DecisionTreeRegressor': 0.9989581686775351}

In [80]:
model = KNeighborsRegressor().fit(np.array(X).reshape(-1,1),y)
model.score(np.array(X).reshape(-1,1),y)

0.9991567495596736

In [81]:
model.predict(X)[50]

168.21182266009853

In [82]:
np.array(y)[50]

168.21182266009853

In [83]:
data.columns

Index(['Date', 'Cases', 'Deaths', 'Recovered', 'Active'], dtype='object')

In [84]:
# Cases = GaussianNB 1.0
# Deaths = KNeighborsRegressor 0.9991567495596736
# Recovered = KNeighborsRegressor 0.9677711225207198
# Active = KNeighborsRegressor 0.9888386462728688

In [90]:
X = data['Date']
y = data.drop('Date',axis=1)

In [91]:
data.columns

Index(['Date', 'Cases', 'Deaths', 'Recovered', 'Active'], dtype='object')

In [92]:
cols = ['Cases','Deaths','Recovered','Active']

In [95]:
X = np.array(X).reshape(-1,1)

In [104]:
import random

In [112]:
index = random.randint(1,50)
for col in cols:
    model = GaussianNB()
    model.fit(X,np.array(y[col]).astype(np.int))
#     print(col)
#     print(model.score(X,np.array(y[col]).astype(np.int)))
#     print(model.predict(X)[index])
#     print(y.iloc[index][col])
    print(col)
    print(model.predict(np.array([20210508]).reshape(-1,1).astype(np.int)))
    print('\n')

Cases
[121338]


Deaths
[764]


Recovered
[4580]


Active
[1696]


