# This notebook tracks and analyses the spread of the coronavirus (COVID-19) 🦠.


> ***Please let me know if there is any scope for improvement in the output or anything else. I don't have much experience in data science, but I have put my best effort into this. Thank you.***

# Importing modules

In [None]:

!pip install plotly


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn import ensemble
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

        
# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's train.csv, test.csv and submission.csv files.
train = pd.read_csv('../input/covid19-global-forecasting-week-5/train.csv')
test = pd.read_csv('../input/covid19-global-forecasting-week-5/test.csv')
sample = pd.read_csv('../input/covid19-global-forecasting-week-5/submission.csv')

> Getting to know the data.

In [None]:
# Number of missing values in each column of the training frame.
train.isna().sum()

In [None]:
# Number of missing values in each column of the test frame.
test.isna().sum()

In [None]:
# Display the frame loaded from submission.csv.
sample

In [None]:
# Total of the TargetValue column in the frame loaded from submission.csv.
sample['TargetValue'].sum()

In [None]:
# Show the training rows ordered by ascending TargetValue.
train.sort_values('TargetValue')

# Data Visualization

In [None]:
# Pie chart of TargetValue split by the Target label; slice text is drawn
# inside the slices and hidden when it would fall below 12pt.
fig = (
    px.pie(train, names='Target', values='TargetValue')
    .update_traces(textposition='inside')
    .update_layout(uniformtext_mode='hide', uniformtext_minsize=12)
)
fig.show()

In [None]:
# Pie chart of TargetValue split by Country_Region; slice text is drawn
# inside the slices and hidden when it would fall below 12pt.
fig = (
    px.pie(train, names='Country_Region', values='TargetValue')
    .update_traces(textposition='inside')
    .update_layout(uniformtext_mode='hide', uniformtext_minsize=12)
)
fig.show()

In [None]:
# Treemap with one tile per Country_Region, sized by TargetValue and
# coloured by Population.
treemap_options = dict(
    path=['Country_Region'],
    values='TargetValue',
    color='Population',
    hover_data=['Country_Region'],
    color_continuous_scale='matter',
    title='Current share of Worldwide COVID19 Confirmed Cases',
)
fig = px.treemap(train, **treemap_options)
fig.show()

In [None]:
# The ten countries with the largest summed TargetValue on the latest date.
last_date = train['Date'].max()
df_countries = (
    train.loc[train['Date'] == last_date]
    .groupby('Country_Region', as_index=False)['TargetValue']
    .sum()
    .nlargest(10, 'TargetValue')
)

# Per-date totals for every country, restricted to those ten by the merge.
# The merge suffixes the two TargetValue columns with _x (per-date total)
# and _y (latest-date total); only the former is renamed to 'Cases'.
df_trend = (
    train.groupby(['Date', 'Country_Region'], as_index=False)['TargetValue']
    .sum()
    .merge(df_countries, on='Country_Region')
    .rename(columns={'Country_Region': 'Country', 'TargetValue_x': 'Cases'})
)


In [None]:
# Line chart of the per-date 'Cases' values in df_trend, one coloured line per Country.
px.line(df_trend, x='Date', y='Cases', color='Country', title='COVID19 Total Cases growth for top 10 worst affected countries')


# Data Preprocessing

> We drop some features that have many null values and are not very important.

In [None]:
# Remove the same four columns from both frames, then show the training frame.
dropped_columns = ['County', 'Province_State', 'Country_Region', 'Target']
train = train.drop(columns=dropped_columns)
test = test.drop(columns=dropped_columns)
train

In [None]:
from sklearn.preprocessing import OrdinalEncoder

def create_features(df):
    """Add calendar features derived from the ``Date`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``Date`` column has a datetime dtype (the ``.dt``
        accessor is used). The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the columns ``day``, ``month``,
        ``dayofweek``, ``dayofyear``, ``quarter`` and ``weekofyear`` added.
    """
    dates = df['Date'].dt
    df['day'] = dates.day
    df['month'] = dates.month
    df['dayofweek'] = dates.dayofweek
    df['dayofyear'] = dates.dayofyear
    df['quarter'] = dates.quarter
    # Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
    # isocalendar().week is its replacement (it yields a UInt32 column).
    # Fall back to the old attribute on pandas versions without isocalendar.
    if hasattr(dates, 'isocalendar'):
        df['weekofyear'] = dates.isocalendar().week
    else:
        df['weekofyear'] = dates.weekofyear
    return df

In [None]:
def train_dev_split(df, days):
    """Split ``df`` by date, holding out the most recent ``days`` days.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime ``Date`` column.
    days : int
        Number of trailing days (counted back from the latest ``Date``)
        that go into the dev set.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_part, dev_part)``: rows dated on or before the cutoff, and
        rows dated after it.
    """
    # pd.Timedelta is used because no `dt`/datetime module is imported in
    # this notebook; `dt.timedelta` would raise NameError.
    cutoff = df['Date'].max() - pd.Timedelta(days=days)
    return df[df['Date'] <= cutoff], df[df['Date'] > cutoff]


In [None]:
# Smallest and largest Date values present in the test frame.
# NOTE: this cell runs before the pd.to_datetime conversion further down,
# so the bounds are taken over the Date values as they were loaded.
test_date_min = test['Date'].min()
test_date_max = test['Date'].max()


In [None]:
def avoid_data_leakage(df, date=test_date_min):
    """Return the rows of ``df`` whose ``Date`` is strictly earlier than ``date``.

    ``date`` defaults to the value ``test_date_min`` had when this function
    was defined.
    """
    is_before_cutoff = df['Date'] < date
    return df[is_before_cutoff]

In [None]:
def to_integer(dt_time):
    """Encode an object with ``year``, ``month`` and ``day`` attributes as the integer YYYYMMDD."""
    year_month = dt_time.year * 100 + dt_time.month
    return year_month * 100 + dt_time.day


In [None]:
# Parse the Date column of both frames into pandas datetimes.
for frame in (train, test):
    frame['Date'] = pd.to_datetime(frame['Date'])

In [None]:
# Encode the dates of both frames as YYYYMMDD integers. The test column is
# cast to int as well so that the model sees the same numeric Date feature
# at prediction time as it saw during training.
train['Date'] = train['Date'].dt.strftime("%Y%m%d").astype(int)
test['Date'] = test['Date'].dt.strftime("%Y%m%d").astype(int)


# USING REGRESSOR TO FIND TARGET VALUES

In [None]:
from sklearn.model_selection import train_test_split

# Features are every remaining column except the target and the row Id.
predictors = train.drop(columns=['TargetValue', 'Id'])
target = train['TargetValue']

# Random 78% / 22% split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    predictors,
    target,
    test_size=0.22,
    random_state=0,
)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Random forest with 500 trees; the seed is fixed so that repeated runs of
# the notebook produce the same model and score.
estimators = 500
model = RandomForestRegressor(n_estimators=estimators, n_jobs=-1, random_state=0)

scores = []

pipeline = Pipeline([('scaler2' , StandardScaler()),
                        ('RandomForestRegressor: ', model)])

# Fit once and reuse the fitted pipeline for both the hold-out predictions
# and the hold-out score.
pipeline.fit(X_train, y_train)
prediction = pipeline.predict(X_test)

scores.append(pipeline.score(X_test, y_test))

In [None]:
# Display the hold-out feature frame produced by train_test_split.
X_test

In [None]:
# Drop the ForecastId column, label the index 'Id', and show the frame.
test = test.drop(columns=['ForecastId'])
test.index.name = 'Id'
test

In [None]:
# Predictions of the fitted pipeline on the hold-out features.
y_pred2 = pipeline.predict(X_test)
y_pred2

In [None]:
# Predict on the test frame and convert every prediction with int().
predictions = pipeline.predict(test)
pred_list = list(map(int, predictions))

# One row per test index value, paired with its integer prediction.
output = pd.DataFrame({'Id': test.index, 'TargetValue': pred_list})
print(output)

In [None]:
# Display the prediction frame built from the test index and predictions.
output

# Finding Quantile values from the output.

In [None]:
# 5%, 50% and 95% quantiles of TargetValue within each Id group.
# NOTE(review): `output` appears to hold a single row per Id, in which case
# all three quantiles equal that one prediction; confirm this is intended.
target_by_id = output.groupby(['Id'])['TargetValue']
a = target_by_id.quantile(q=0.05).reset_index()
b = target_by_id.quantile(q=0.5).reset_index()
c = target_by_id.quantile(q=0.95).reset_index()


In [None]:

# Label each quantile column, then place the three levels side by side.
a.columns = ['Id', 'q0.05']
b.columns = ['Id', 'q0.5']
c.columns = ['Id', 'q0.95']
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
a = pd.concat([a, b['q0.5'], c['q0.95']], axis=1)

# Clamp every quantile column to the range [0, 10000].
for quantile_col in ['q0.05', 'q0.5', 'q0.95']:
    a[quantile_col] = a[quantile_col].clip(0, 10000)
a

In [None]:
# Shift every Id up by one.
# NOTE(review): presumably this turns the 0-based row index into a 1-based
# ForecastId; confirm against the submission format.
a['Id'] += 1
a

# Submission

In [None]:
# Reshape to long form: one row per (Id, quantile column).
melted = pd.melt(a, id_vars=['Id'], value_vars=['q0.05', 'q0.5', 'q0.95'])

# 'q0.05' -> '0.05', then build keys such as '1_0.05'.
quantile_label = melted['variable'].str.replace("q", "", regex=False)
sub = pd.DataFrame({
    'ForecastId_Quantile': melted['Id'].astype(str) + '_' + quantile_label,
    'TargetValue': melted['value'],
}).reset_index(drop=True)
sub.head()

In [None]:
# Write the submission frame to submission.csv without the index column.
sub.to_csv("submission.csv",index=False)


# Hit that upvote button if you liked it. Thank you.

# I will try to update this kernel with a much better score than before.