In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python

#load packages
import sys #access to system parameters https://docs.python.org/3/library/sys.html
print("Python version: {}". format(sys.version))

import pandas as pd #collection of functions for data processing and analysis modeled after R dataframes with SQL like features
print("pandas version: {}". format(pd.__version__))

import matplotlib as mpl #collection of functions for scientific and publication-ready visualization
print("matplotlib version: {}". format(mpl.__version__))

import numpy as np #foundational package for scientific computing
print("NumPy version: {}". format(np.__version__))

import scipy as sp #collection of functions for scientific computing and advance mathematics
print("SciPy version: {}". format(sp.__version__)) 

import pandas_profiling as pp #utility to autoprofile the dataset
print("Pandas Profiling version: {}". format(pp.__version__)) 

import seaborn as sns #ultimate utility for visualizations
print("Seaborn version: {}". format(sns.__version__)) 

import IPython
from IPython import display #pretty printing of dataframes in Jupyter notebook
print("IPython version: {}". format(IPython.__version__)) 

import sklearn #collection of machine learning algorithms
print("scikit-learn version: {}". format(sklearn.__version__))

import plotly.express as px
import plotly.graph_objects as go

#misc libraries
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
import random
from datetime import datetime, timedelta


#ignore warnings
import warnings
warnings.filterwarnings('ignore')
print('-'*80)


In [None]:
#Configure the defaults for the Notebook
from IPython.core.interactiveshell import InteractiveShell  
# "all" makes IPython display the value of every expression statement in a cell,
# not only the last one; later cells rely on this to show several results at once.
InteractiveShell.ast_node_interactivity = "all"

#Configure Visualization Defaults
%matplotlib inline 
mpl.style.use('ggplot')
sns.set_style('ticks')
# Allow up to 20000 rows and 100 columns before pandas truncates a displayed frame.
pd.set_option('display.max_rows',20000, 'display.max_columns',100)


In [None]:
# List every file found under the input directory.
# Change input_DIR to point at the dataset that should be loaded.
import os

input_DIR = '../input/20200306-01'

for dirname, _, filenames in os.walk(input_DIR):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

In [None]:
# Read the Raw Data from 
import json
import urllib

def loadRawData(url,node):
    """Download a JSON document and return one of its top-level entries as a DataFrame.

    Parameters
    ----------
    url : str
        Address of the JSON document; opened with ``urllib.request.urlopen``.
    node : str
        Key of the decoded JSON object whose value is handed to ``pd.DataFrame``.

    Returns
    -------
    pandas.DataFrame
        Frame built from ``jsonData[node]`` with ``dtype='object'``.

    Raises
    ------
    RuntimeError
        If the response status code is not 200. Previously this path only
        printed a message and then failed with ``UnboundLocalError``.
    KeyError
        If ``node`` is not present in the decoded JSON object.
    """
    # `import urllib` alone does not guarantee that the `request` submodule is
    # loaded, so import it explicitly where it is used.
    import urllib.request

    # The context manager closes the connection even when decoding fails.
    with urllib.request.urlopen(url) as operUrl:
        status = operUrl.getcode()
        if status != 200:
            raise RuntimeError("Error receiving data {}".format(status))
        jsonData = json.loads(operUrl.read())
    return pd.DataFrame(data=jsonData[node],dtype='object')

# Fetch the records stored under the "raw_data" key of the covid19india raw_data.json file.
rawData=loadRawData("https://raw.githubusercontent.com/covid19india/api/master/raw_data.json","raw_data")


In [None]:
# Summarise the columns, non-null counts and dtypes of the downloaded frame.
rawData.info()

In [None]:
#Clean up the Raw Data
# Steps, in order:
#   1. keep only the columns this notebook uses,
#   2. index the rows by 'patientnumber' (the column is removed from the frame),
#   3. turn empty strings into NaN,
#   4. discard rows that have no detected state.
rawData=(
    rawData[['patientnumber', 'statepatientnumber', 'agebracket','detecteddistrict','detectedcity',
             'detectedstate', 'currentstatus','statuschangedate', 'dateannounced',
             'estimatedonsetdate', 'gender', 'nationality','contractedfromwhichpatientsuspected',
             'source1', 'source2', 'source3',
             'notes']]
    .set_index('patientnumber')
    .replace("",np.nan)
    # Filter on the column itself. Dropping by index label would remove every
    # row that shares a label with a state-less row, and 'patientnumber' is
    # not guaranteed to be unique (blank values are kept as-is in the index).
    .dropna(subset=['detectedstate'])
)


In [None]:
# Replace the API's lower-case field names with the labels used in the rest of the notebook.
column_labels = {
    'statepatientnumber': 'PatientId',
    'agebracket': 'Age',
    'detecteddistrict': 'District',
    'detectedcity': 'City',
    'detectedstate': 'State/UT',
    'currentstatus': 'Status',
    'statuschangedate': 'StatusChangeDate',
    'dateannounced': 'AnnouncedDate',
    'estimatedonsetdate': 'OnsetDate',
    'gender': 'Gender',
    'nationality': 'Nationality',
    'contractedfromwhichpatientsuspected': 'SourcePatient',
}
rawData.rename(columns=column_labels, inplace=True)

In [None]:
# Parse the three day/month/year text columns into datetime values.
for date_column in ('AnnouncedDate', 'OnsetDate', 'StatusChangeDate'):
    rawData[date_column] = pd.to_datetime(rawData[date_column], format='%d/%m/%Y')


In [None]:
#Validate that there are no issues with RawData for the dates conversion
from datetime import datetime 
# Rows announced at or after the current time; any hit suggests a wrongly parsed date.
rawData[rawData.AnnouncedDate >= datetime.now()]
# Rows whose announcement date is missing after parsing.
rawData[rawData.AnnouncedDate.isna()]


In [None]:
# Statewise data
# Builds dCases: one row per (State/UT, date) holding the number of patients
# reported, recovered and deceased on that date.

# Reported: rows with a non-null Status, grouped by state and announcement date.
dCases=rawData.groupby(by=['State/UT','AnnouncedDate'],as_index=False)['Status'].count()
dCases.rename({'Status':'Reported'},axis=1,inplace=True)
# Recovered: grouped by state and the date the status changed.
tmp=rawData[rawData.Status=='Recovered']
dCasesRecovered=tmp.groupby(by=['State/UT','StatusChangeDate'],as_index=False)['Status'].count()
dCasesRecovered.rename({'Status':'Recovered'},axis=1,inplace=True)
# Deceased: same grouping as Recovered.
tmp=rawData[rawData.Status=='Deceased']
dCasesDeceased=tmp.groupby(by=['State/UT','StatusChangeDate'],as_index=False)['Status'].count()
dCasesDeceased.rename({'Status':'Deceased'},axis=1,inplace=True)

# Outer merge so that dates with recoveries but no new reports are kept.
tmp=dCases.merge(dCasesRecovered,left_on=['State/UT','AnnouncedDate'],right_on=['State/UT','StatusChangeDate'],how='outer',indicator=True)
 
# Rows that exist only on the right side have no AnnouncedDate; use StatusChangeDate for them.
tmp['AnnouncedDate']=tmp.apply((lambda x: (x['AnnouncedDate']) if (x['AnnouncedDate'] is not pd.NaT) else (x['StatusChangeDate'])),axis=1)

dCases=tmp[['State/UT', 'AnnouncedDate', 'Reported', 'Recovered']]

# Same outer merge and date fallback for the deceased counts.
tmp=dCases.merge(dCasesDeceased,left_on=['State/UT','AnnouncedDate'],right_on=['State/UT','StatusChangeDate'],how='outer',indicator=True)
tmp['AnnouncedDate']=tmp.apply((lambda x: (x['AnnouncedDate']) if (x['AnnouncedDate'] is not pd.NaT) else (x['StatusChangeDate'])),axis=1)

dCases=tmp[['State/UT', 'AnnouncedDate', 'Reported', 'Recovered','Deceased']]

# A missing count means nothing happened on that date for that state.
# NOTE(review): this replaces NaN in every column, not only the three count columns.
dCases.replace(to_replace=np.nan, value=0,inplace=True)

# Sanity check: overall totals of the three daily counts.
print("Check - Reported:{} Recovered:{} Deceased:{}".format(dCases.Reported.sum(),dCases.Recovered.sum(),dCases.Deceased.sum()))
dCases

In [None]:
# Add per-state running totals (TotalReported / TotalRecovered / TotalDeceased)
# and the derived TotalActive to dCases.
dCases=dCases.sort_values(by=['State/UT','AnnouncedDate']).reset_index(drop=True)

# A grouped cumulative sum restarts at every state, which is what the earlier
# row-by-row loop did by comparing each row's state with the previous row's.
# It relies on the date ordering established by the sort above.
running_totals=dCases.groupby('State/UT')[['Reported','Recovered','Deceased']].cumsum()
dCases['TotalReported']=running_totals['Reported']
dCases['TotalRecovered']=running_totals['Recovered']
dCases['TotalDeceased']=running_totals['Deceased']

dCases['TotalActive']=dCases.TotalReported - dCases.TotalRecovered - dCases.TotalDeceased 

# Sanity checks: the sum of the daily counts must equal the sum over states of
# each state's largest running total.
print("Check - Reported:{} Recovered:{} Deceased:{}".format(dCases.Reported.sum(),dCases.Recovered.sum(),dCases.Deceased.sum()))
print("Check - Reported:{} Recovered:{} Deceased:{}".format(
                                        dCases.groupby(by=['State/UT'])['TotalReported'].max().sum(),
                                        dCases.groupby(by=['State/UT'])['TotalRecovered'].max().sum(),
                                        dCases.groupby(by=['State/UT'])['TotalDeceased'].max().sum(),
                                        ))
dCases

In [None]:
# Build dfTotals: for every state, one row per calendar day from the state's
# first recorded date up to now, carrying the running totals forward over days
# that have no record.
t=dCases[['State/UT', 'AnnouncedDate', 'TotalReported', 'TotalRecovered', 'TotalDeceased', 'TotalActive']]
lstdf=[]
for eachStateUT in t['State/UT'].unique():
    tmp0=t[t['State/UT']==eachStateUT]
    minDate=tmp0.AnnouncedDate.min()
    maxDate=datetime.now()
    # Daily calendar for this state; np.arange excludes maxDate itself.
    df=pd.DataFrame(np.arange(minDate,maxDate, timedelta(days=1)),columns=['AnnouncedDate'])
    # Ordered outer merge with forward fill: days without a record inherit the previous day's values.
    tmp1= pd.merge_ordered(left=df,right=tmp0,on='AnnouncedDate', how='outer',fill_method='ffill')
    lstdf+=[tmp1]
dfTotals=pd.concat(lstdf, ignore_index=True)

# Sanity check on the input totals (computed from t, not from dfTotals).
print("Check - Reported:{} Recovered:{} Deceased:{}".format(
                                        t.groupby(by=['State/UT'])['TotalReported'].max().sum(),
                                        t.groupby(by=['State/UT'])['TotalRecovered'].max().sum(),
                                        t.groupby(by=['State/UT'])['TotalDeceased'].max().sum(),
                                        ))
#dfTotals.groupby(by=['AnnouncedDate'])['TotalReported','TotalRecovered','TotalDeceased','TotalActive'].sum()


In [None]:
#Accommodate for the reporting model changed by Covid19India.org from 29th of March 2020
# Reads per-state daily Recovered/Deceased counts from a CSV, turns them into
# running totals on a daily calendar, and uses them to override the
# recovered/deceased totals in dfTotals wherever the CSV value is positive.
dfTemp=pd.read_csv('../input/20200414dr01/20200414dr01.csv',dtype='object')
dfTemp.Date=pd.to_datetime(dfTemp.Date,format='%d-%m-%Y')
dfTemp.fillna(0,inplace=True)
# Everything was read as object, so convert the counts via str to int.
dfTemp['Recovered']=dfTemp['Recovered'].astype(str).astype(int)
dfTemp['Deceased']=dfTemp['Deceased'].astype(str).astype(int)
dfTemp=dfTemp.sort_values(by=['State','Date']).reset_index(drop=True)
dfTemp['TotalRecovered']=dfTemp['TotalDeceased']=0
# Running totals per state: restart whenever the state differs from the previous row.
for i in dfTemp.index:
    if(i==0):
        dfTemp.loc[i,'TotalRecovered']=dfTemp.loc[i,'Recovered']
        dfTemp.loc[i,'TotalDeceased']=dfTemp.loc[i,'Deceased']
    elif( dfTemp.loc[i,'State']==dfTemp.loc[i-1,'State']):
        dfTemp.loc[i,'TotalRecovered']=dfTemp.loc[i-1,'TotalRecovered'] + dfTemp.loc[i,'Recovered']
        dfTemp.loc[i,'TotalDeceased']=dfTemp.loc[i-1,'TotalDeceased'] + dfTemp.loc[i,'Deceased']
    else:
        dfTemp.loc[i,'TotalRecovered']=dfTemp.loc[i,'Recovered']
        dfTemp.loc[i,'TotalDeceased']=dfTemp.loc[i,'Deceased']
# Sanity checks: daily sums versus per-state maxima of the running totals.
print("Check - Recovered:{} Deceased:{}".format(dfTemp.Recovered.sum(),dfTemp.Deceased.sum()))
print("Check - Recovered:{} Deceased:{}".format(
                                        dfTemp.groupby(by=['State'])['TotalRecovered'].max().sum(),
                                        dfTemp.groupby(by=['State'])['TotalDeceased'].max().sum(),
                                        ))
# Expand each state to a daily calendar ending at the last date in dfTotals,
# forward-filling the running totals over days without a CSV row.
t=dfTemp[['State','Date','TotalRecovered','TotalDeceased']]
lstdf=[]
for eachStateUT in t['State'].unique():
    tmp0=t[t['State']==eachStateUT]
    minDate=tmp0.Date.min()
    # +1 day because np.arange excludes its stop value.
    maxDate=dfTotals.AnnouncedDate.max() + timedelta(days=1)
    #print("minDate:{} maxDate:{}".format(minDate,maxDate))
    df=pd.DataFrame(np.arange(minDate,maxDate, timedelta(days=1)),columns=['Date'])
    tmp1= pd.merge_ordered(left=df,right=tmp0,on='Date', how='outer',fill_method='ffill')
    lstdf+=[tmp1]
dfTemp=pd.concat(lstdf, ignore_index=True)
dfTemp=dfTemp.sort_values(by=['State','Date']).reset_index(drop=True)
print("Check - Recovered:{} Deceased:{}".format(
                                        dfTemp.groupby(by=['State'])['TotalRecovered'].max().sum(),
                                        dfTemp.groupby(by=['State'])['TotalDeceased'].max().sum(),
                                        ))
# Outer merge on (date, state); overlapping column names get the _x (dfTotals) and _y (CSV) suffixes.
# NOTE(review): rows present only in the CSV get NaN for 'State/UT' and 'AnnouncedDate' - confirm this is intended.
tmp=dfTotals.merge(right=dfTemp,left_on=['AnnouncedDate','State/UT'],right_on=['Date','State'],indicator=True,how='outer')
# Prefer the CSV total when it is positive, otherwise keep the total derived from the raw data.
tmp.TotalRecovered_x=tmp.apply((lambda x:x['TotalRecovered_y'] if (x['TotalRecovered_y'] >0) else (x['TotalRecovered_x'])),axis=1)
tmp.TotalDeceased_x=tmp.apply((lambda x:x['TotalDeceased_y'] if (x['TotalDeceased_y'] >0) else (x['TotalDeceased_x'])),axis=1)
#tmp[tmp.TotalRecovered_y.notna()].TotalRecovered_x = tmp[tmp.TotalRecovered_y.notna()].TotalRecovered_y
#tmp
print("Check - Recovered:{} Deceased:{}".format(
                                        tmp.groupby(by=['State/UT'])['TotalRecovered_x'].max().sum(),
                                        tmp.groupby(by=['State/UT'])['TotalDeceased_x'].max().sum(),
                                        ))

tmp.rename({'TotalRecovered_x':'TotalRecovered','TotalDeceased_x':'TotalDeceased'},axis=1,inplace=True)
dfTotals=tmp[['AnnouncedDate', 'State/UT', 'TotalReported', 'TotalRecovered','TotalDeceased']]
# Recompute the active count from the updated totals.
dfTotals['TotalActive']=dfTotals.TotalReported - dfTotals.TotalRecovered - dfTotals.TotalDeceased
print("Check - Reported:{} Recovered:{} Deceased:{}".format(
                                        dfTotals.groupby(by=['State/UT'])['TotalReported'].max().sum(),
                                        dfTotals.groupby(by=['State/UT'])['TotalRecovered'].max().sum(),
                                        dfTotals.groupby(by=['State/UT'])['TotalDeceased'].max().sum(),
                                        ))

In [None]:
# National cumulative counts per day, drawn on a linear axis (left) and a
# logarithmic axis (right).
# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# names was deprecated and is rejected by current pandas releases.
national_totals=dfTotals.groupby(by=['AnnouncedDate'])[['TotalReported','TotalRecovered','TotalDeceased','TotalActive']].sum()

fig, ax = plt.subplots(ncols=2,nrows=1,figsize=(16,4))
ax[0].title.set_text('Linear Plot Cases vs Time')
national_totals.plot(ax=ax[0])
ax[1].set_yscale('log')
ax[1].title.set_text('Log-Linear Plot Cases vs Time')
national_totals.plot(ax=ax[1])



**The above graph shows that, when only the data after 9 March is considered, Corona cases in India follow a log-linear pattern, i.e. an almost straight line on a logarithmic scale. Let us take note of this, validate it against the data of one or two more states, and then try to predict cases using log-linear regression.**

In [None]:
# Bar chart of TotalReported by AnnouncedDate, coloured by TotalReported,
# with the state and the other totals shown on hover.
hover_columns = ['State/UT','TotalActive','TotalRecovered','TotalDeceased']
fig = px.bar(
    dfTotals,
    x='AnnouncedDate',
    y='TotalReported',
    color='TotalReported',
    hover_data=hover_columns,
    height=600,
)
fig.update_layout(title='Total Cases Datewise in India')


In [None]:
# Cumulative counts for Maharashtra from 9 March 2020 onward, on a linear axis
# (left) and a logarithmic axis (right).
tmp=dfTotals[(dfTotals['State/UT']=='Maharashtra') & (dfTotals.AnnouncedDate >= '2020-03-09')]
# Aggregate once and reuse it for both panels. The columns are selected with a
# list because current pandas rejects a bare tuple of names on a GroupBy.
daily_totals=tmp.groupby(by=['AnnouncedDate'])[['TotalReported','TotalRecovered','TotalDeceased','TotalActive']].sum()

fig, ax = plt.subplots(ncols=2,nrows=1,figsize=(16,4))
ax[0].title.set_text('Maharashtra: Linear Plot Cases vs Time')
daily_totals.plot(ax=ax[0])
ax[1].set_yscale('log')
ax[1].title.set_text('Maharashtra: Log-Linear Plot Cases vs Time')
daily_totals.plot(ax=ax[1])



In [None]:
# Cumulative counts for Kerala from 9 March 2020 onward, on a linear axis
# (left) and a logarithmic axis (right).
tmp=dfTotals[(dfTotals['State/UT']=='Kerala') & (dfTotals.AnnouncedDate >= '2020-03-09')]
# Aggregate once and reuse it for both panels. The columns are selected with a
# list because current pandas rejects a bare tuple of names on a GroupBy.
daily_totals=tmp.groupby(by=['AnnouncedDate'])[['TotalReported','TotalRecovered','TotalDeceased','TotalActive']].sum()

fig, ax = plt.subplots(ncols=2,nrows=1,figsize=(16,4))
ax[0].title.set_text('Kerala: Linear Plot Cases vs Time')
daily_totals.plot(ax=ax[0])
ax[1].set_yscale('log')
ax[1].title.set_text('Kerala: Log-Linear Plot Cases vs Time')
daily_totals.plot(ax=ax[1])



In [None]:
# One line per state of cumulative reported cases since 9 March 2020,
# drawn on a linear axis (left) and a logarithmic axis (right).
tmp = dfTotals[dfTotals.AnnouncedDate >= '2020-03-09']
lineplot_options = dict(data=tmp, x='AnnouncedDate', y='TotalReported', hue='State/UT', legend=False)

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 4))
linear_axis, log_axis = ax

linear_axis.title.set_text('All States : Linear Plot Cases vs Time')
sns.lineplot(ax=linear_axis, **lineplot_options)

log_axis.set_yscale('log')
log_axis.title.set_text('All States : Log-Linear Plot Cases vs Time')
sns.lineplot(ax=log_axis, **lineplot_options)


In [None]:
#1. First Predictor - LogLinear Regressor to predict number of Reported Cases in next say 10 days 
# Fits log10(national TotalReported) against the day number with ordinary least
# squares and extrapolates `dtp` days past the last observed day.
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

dtp=10 #next number of days to predict
minDate='2020-03-09'
tmp=dfTotals[dfTotals.AnnouncedDate >= minDate ]
# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# names was deprecated and is rejected by current pandas releases.
tmp=tmp.groupby(by=['AnnouncedDate'])[['TotalReported','TotalRecovered','TotalDeceased','TotalActive']].sum()

#Number of days for the data: day 1 is the first date kept, as a column vector for scikit-learn
x=np.linspace(1,tmp.shape[0],tmp.shape[0]).reshape(-1,1)
# Day numbers of the `dtp` days that follow the last observed day
x_next10=np.linspace(tmp.shape[0] + 1,tmp.shape[0] + dtp,dtp).reshape(-1,1)

# Regress on log10 of the counts so that exponential growth becomes a straight line
yReported=np.log10(tmp.TotalReported.to_numpy())

# Random 67/33 split with a fixed seed.
# NOTE(review): a random split of a time series lets the model see days on both
# sides of every test day - confirm this is the intended evaluation.
X_train, X_test, y_trainReported, y_testReported = train_test_split(x, yReported, test_size=0.33, random_state=1)

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets (assigned to `null` only to suppress cell output)
null=regr.fit(X_train, y_trainReported)

# Make predictions using the testing set
y_predReported = regr.predict(X_test)

# Extrapolate to the future days
y_next10 = regr.predict(x_next10)

# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(y_testReported, y_predReported))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'
      % r2_score(y_testReported, y_predReported))

# Plot outputs: log10 scale on the left, original scale on the right
fig, ax = plt.subplots(ncols=2,nrows=1,figsize=(16,4))

null=ax[0].title.set_text('India : Reported Cases on LogLinear Scale')
null=ax[0].scatter(x, yReported,  color='black')
null=ax[0].scatter(X_test, y_predReported, color='blue')
null=ax[0].plot(x_next10, y_next10,  color='red',linewidth=3)

# 10**y converts the log10 predictions back to case counts
null=ax[1].title.set_text('India : Reported Cases on Linear Scale')
null=ax[1].scatter(x,tmp.TotalReported,color='black')
null=ax[1].scatter(X_test, 10**y_predReported, color='blue')
null=ax[1].plot(x_next10, 10**y_next10,  color='red',linewidth=3)

plt.show()
# NOTE(review): the black points are all observations (x, yReported), not only the training set.
print("Black  - Training Set, Blue - Testing Set, Red - Prediction Set")
# NOTE(review): this conclusion is hard-coded and is printed regardless of the score computed above.
print("Coefficient of determination: = 0.99 ~ 1 => Perfect Log Linear Relationship, cannot have been better")

# Day number n+1 corresponds to the start date plus n days
minDate=datetime.strptime(minDate,"%Y-%m-%d") + timedelta(days= tmp.shape[0]) 
maxDate=minDate + timedelta(days=dtp)

predictDf=pd.DataFrame({'PredictionForDate':np.arange(minDate,maxDate, timedelta(days=1)),'Total No. Cases Predicted':10**y_next10})
predictDf['Total No. Cases Predicted']=predictDf['Total No. Cases Predicted'].round()
predictDf


## Fitting Sigmoid Function to Indian Data


                        y = a/(1 + exp(-(days - b)/c))

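* a => maximum number of confirmed cases that the curve is expected to reach (its upper plateau)
* b => inflection point of the curve, i.e. the day on which the curve reaches half of a
* c => time scale (in days) that controls how steeply the curve rises; a smaller c means faster growth in the initial phase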

In [None]:
## Sigmoid fitting of Indian Data
# Fits a logistic curve to the national cumulative case count, projects it
# forward to about twice the fitted inflection day, and plots observations,
# projection, estimated deaths and the daily change.
from scipy import integrate, optimize
from scipy.optimize import curve_fit

dataDf=dfTotals[['AnnouncedDate','TotalReported','TotalDeceased']]
# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# names was deprecated and is rejected by current pandas releases.
dataDf=dataDf.groupby(by=['AnnouncedDate'],as_index=False)[['TotalReported','TotalDeceased']].sum()
dataDf['NumDays'] = list(range(1,len(dataDf)+1))

# Mortality Rate using polyfit: slope of a straight line fitted to deaths against reported cases
m,b = np.polyfit(dataDf.TotalReported, dataDf.TotalDeceased,1) 
print('Mortality Rate={:.2f}'.format(m*100))

def myFunc(days, InfPop, Inflection, c):
    """Logistic curve ``InfPop / (1 + exp(-(days - Inflection) / c))``.

    Parameters
    ----------
    days : number or array-like
        Day number(s) at which to evaluate the curve.
    InfPop : float
        Upper plateau of the curve.
    Inflection : float
        Day on which the curve equals ``InfPop / 2``.
    c : float
        Divisor of the exponent; it scales how quickly the curve rises.

    Returns
    -------
    Same shape as ``days``: the curve evaluated at each day.
    """
    y = (InfPop/(1+np.exp(-(days-Inflection)/c)))      
    return y

#fit the data, return the best fit parameters and the covariance matrix
# NOTE(review): no initial guess (p0) is given, so curve_fit uses its default starting values - confirm the fit converges.
popt, pcov = curve_fit(myFunc, dataDf['NumDays'], dataDf['TotalReported'])

print()
print("Fit parameters:")
print("   Max Infected Populatuion =", popt[0])
print("   Inflection point in days = ", popt[1])
print("   c =" , popt[2])

# Projection frame: day numbers from the day after the last observation up to
# twice the inflection day plus 5. The frame is built from its own columns, so
# it does not depend on a `tmp` left behind by another cell, and AnnouncedDate
# is stored as a real column instead of an instance attribute.
minDate=dataDf.AnnouncedDate.min()
projectionsDf=pd.DataFrame({'NumDays': list(range(dataDf.NumDays.max() + 1,int(popt[1] * 2 +5),1))})
projectionsDf['AnnouncedDate']= projectionsDf.NumDays.map(lambda x:minDate + timedelta(days=(x-1)))
projectionsDf['TotalReported']= myFunc(projectionsDf.NumDays, popt[0], popt[1], popt[2])
# Estimated deaths: projected cases times the fitted mortality slope
projectionsDf['TotalDeceased']= projectionsDf.TotalReported * m

fig=plt.figure(1,figsize=(8, 6))

null=plt.plot(dataDf.NumDays, dataDf.TotalReported, linewidth=3,label ='Confirmed Cases', color="red" )
null=plt.plot(dataDf.NumDays, dataDf.TotalDeceased, linewidth=3,label ='Confirmed Deaths',color='purple')

#overplot the best fit curve
null=plt.plot(projectionsDf.NumDays, projectionsDf.TotalReported, "rs", label ='Sigmoid Fit',linestyle='dashed')
null=plt.plot(projectionsDf.NumDays, projectionsDf.TotalDeceased, linewidth=3,label ='Estimated Deaths' , color='purple', linestyle='dashed')

# Label every 10th projected point with the estimated number of deaths
for xt,yt in zip(projectionsDf.NumDays[0::10], projectionsDf.TotalDeceased[0::10]):
    label = "{:.0f}".format(yt)
    null=plt.annotate(label, # this is the text
                 (xt,yt), # this is the point to label
                 textcoords="offset points", # how to position the text
                 xytext=(0,10), # distance from text to points (x,y)
                 ha='center') # horizontal alignment can be left, right or center

# The inflection point sits at half of the fitted plateau
null= plt.plot(popt[1],popt[0]/2, label ='Inflection Point', marker='o',markerfacecolor='blue', markersize=12)
#Calculate rate of change in Confirmed Cases (scaled by 10 so the bars are visible next to the totals)
ydiff = np.hstack((0,np.diff(dataDf.TotalReported)*10))
null= plt.bar(dataDf.NumDays, ydiff,align='center', alpha=1, color='green', label ='Confirmed Cases Rate of change')

null= plt.annotate('Inflection Point', color='blue', xy=(popt[1],popt[0]/2),  xycoords='data',
            xytext=(0.3, 0.6), textcoords='axes fraction',
            arrowprops=dict(facecolor='blue', shrink=0.05),
            horizontalalignment='right', verticalalignment='top',
            )
null= plt.annotate('Rate of Change (x10)', color='green', xy=(popt[1],popt[0]/3 ),  xycoords='data',
            xytext=(0.1, 0.3), textcoords='axes fraction',
            )

null=plt.xlim(0,projectionsDf.NumDays.max())
#plt.ylim(0,max(myFunc(x, popt[0], popt[1], popt[2]))+3000)
null=plt.ylim(0,projectionsDf.TotalReported.max()+1000)
null=plt.legend(loc='upper left')
plt.grid(True)
plt.show()




In [None]:
# Day-over-day percentage change of the national reported total, plotted next
# to the reported total divided by 100 so both fit on one axis.
# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# names was deprecated and is rejected by current pandas releases.
tmp=dfTotals.groupby(by=['AnnouncedDate'])[['TotalReported','TotalRecovered','TotalDeceased']].sum()
#DataFrame.pct_change(self: ~FrameOrSeries, periods=1, fill_method='pad', limit=None, freq=None, **kwargs) →
tmp['%Change']=tmp.TotalReported.pct_change(periods=1) * 100
tmp['ReportedBy100']= tmp.TotalReported/100
tmp=tmp.drop(labels=['TotalReported','TotalRecovered','TotalDeceased'], axis=1)
# NOTE(review): 59 is a hard-coded row offset; presumably it skips the early days with very few cases - confirm.
tmp.iloc[59:,:].plot()

In [None]:
# Read the Raw Data from 
import json
import urllib

# NOTE(review): this redefines loadRawData, which an earlier cell already
# defines; consider keeping a single definition.
def loadRawData(url,node):
    """Download a JSON document and return one of its top-level entries as a DataFrame.

    Parameters
    ----------
    url : str
        Address of the JSON document; opened with ``urllib.request.urlopen``.
    node : str
        Key of the decoded JSON object whose value is handed to ``pd.DataFrame``.

    Returns
    -------
    pandas.DataFrame
        Frame built from ``jsonData[node]`` with ``dtype='object'``.

    Raises
    ------
    RuntimeError
        If the response status code is not 200. Previously this path only
        printed a message and then failed with ``UnboundLocalError``.
    KeyError
        If ``node`` is not present in the decoded JSON object.
    """
    # `import urllib` alone does not guarantee that the `request` submodule is
    # loaded, so import it explicitly where it is used.
    import urllib.request

    # The context manager closes the connection even when decoding fails.
    with urllib.request.urlopen(url) as operUrl:
        status = operUrl.getcode()
        if status != 200:
            raise RuntimeError("Error receiving data {}".format(status))
        jsonData = json.loads(operUrl.read())
    return pd.DataFrame(data=jsonData[node],dtype='object')
 
# Fetch the records stored under the "states_daily" key of the covid19india states_daily.json endpoint.
states_daily=loadRawData("https://api.covid19india.org/states_daily.json","states_daily")



In [None]:
# Reshape states_daily from one column per state into a long frame df1 with
# the columns Confirmed, AnnouncedDate, Recovered, Deceased and State.
# Every column other than 'date' and 'status' is treated as a state code.
lstColumns = list(states_daily.columns)
lstColumns.remove("date")
lstColumns.remove("status")
lstdf=[]
print(lstColumns)
for eachCol in lstColumns:
    # Take this state's counts together with the status and date of each row.
    tmp=states_daily[[eachCol,'status','date']]
    tmp['State']=eachCol
    # Blank counts become 0 so the int conversion below succeeds.
    tmp[eachCol]=tmp[eachCol].replace(to_replace='', value=0)
    tmp.rename({eachCol:'Count','date':'AnnouncedDate'}, axis=1,inplace=True)
    tmp.Count=tmp.Count.astype(str).astype(int)
    # Split by status so each status can become its own column.
    tmpConf=tmp[tmp.status=='Confirmed']
    tmpDece=tmp[tmp.status=='Deceased']
    tmpReco=tmp[tmp.status=='Recovered']
    
    tmpConf.rename({'Count':'Confirmed'}, axis=1,inplace=True)
    tmpDece.rename({'Count':'Deceased'}, axis=1,inplace=True)
    tmpReco.rename({'Count':'Recovered'}, axis=1,inplace=True)

    # Join confirmed with recovered on the date; the clashing 'status' and 'State'
    # columns get the _c/_r suffixes and the redundant ones are dropped.
    tmp1=tmpConf.merge(right=tmpReco,on='AnnouncedDate',indicator=True,suffixes=('_c', '_r'))
    tmp1.drop(labels=["status_c","status_r","State_r","_merge"],axis=1,inplace=True)
    # Join with deceased; 'State' from the right side is kept and 'State_c' is dropped.
    tmp2=tmp1.merge(right=tmpDece,on='AnnouncedDate',indicator=True)
    tmp2.drop(labels=["State_c","status","_merge"],axis=1,inplace=True)
    lstdf+=[tmp2]
    
df1=pd.concat(lstdf, ignore_index=True)


In [None]:
# Exclude the rows whose State is 'tt' (presumably the national total - TODO confirm),
# then show the overall sum of each count followed by the frame itself.
is_state_row = df1.State.ne('tt')
df1 = df1.loc[is_state_row]
df1.Confirmed.sum()
df1.Recovered.sum()
df1.Deceased.sum()
df1