In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [2]:
df = pd.read_csv('covid_19_india.csv')
df.head()

Unnamed: 0,Sno,Date,Time,State/UnionTerritory,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Deaths,Confirmed
0,1,2020-01-30,6:00 PM,Kerala,1,0,0,0,1
1,2,2020-01-31,6:00 PM,Kerala,1,0,0,0,1
2,3,2020-02-01,6:00 PM,Kerala,2,0,0,0,2
3,4,2020-02-02,6:00 PM,Kerala,3,0,0,0,3
4,5,2020-02-03,6:00 PM,Kerala,3,0,0,0,3


In [3]:
#Checking the number of rows and column of the dataframe
df.shape

(16850, 9)

In [4]:
#Checking the datatypes of each column
df.dtypes

Sno                          int64
Date                        object
Time                        object
State/UnionTerritory        object
ConfirmedIndianNational     object
ConfirmedForeignNational    object
Cured                        int64
Deaths                       int64
Confirmed                    int64
dtype: object

In [5]:
#Checking if there are any null values in the data
df.isnull().sum()

Sno                         0
Date                        0
Time                        0
State/UnionTerritory        0
ConfirmedIndianNational     0
ConfirmedForeignNational    0
Cured                       0
Deaths                      0
Confirmed                   0
dtype: int64

In [6]:
#Checking the total number of values with '-' in ConfirmedIndianNational and ConfirmedForeign National
df[['ConfirmedIndianNational','ConfirmedForeignNational']].isin(['-']).sum()

ConfirmedIndianNational     16404
ConfirmedForeignNational    16404
dtype: int64

In [7]:
#replacing the '-' with 0
df['ConfirmedIndianNational'].replace('-',0,inplace=True)
df['ConfirmedForeignNational'].replace('-',0,inplace=True)

df.tail()

Unnamed: 0,Sno,Date,Time,State/UnionTerritory,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Deaths,Confirmed
16845,16846,2021-07-07,8:00 AM,Telangana,0,0,613124,3703,628282
16846,16847,2021-07-07,8:00 AM,Tripura,0,0,63964,701,68612
16847,16848,2021-07-07,8:00 AM,Uttarakhand,0,0,332006,7338,340882
16848,16849,2021-07-07,8:00 AM,Uttar Pradesh,0,0,1682130,22656,1706818
16849,16850,2021-07-07,8:00 AM,West Bengal,0,0,1472132,17834,1507241


In [8]:
#Changing the dtypes of column
df[['ConfirmedIndianNational','ConfirmedForeignNational']] = df[['ConfirmedIndianNational','ConfirmedForeignNational']].astype('int64')
df.dtypes

Sno                          int64
Date                        object
Time                        object
State/UnionTerritory        object
ConfirmedIndianNational      int64
ConfirmedForeignNational     int64
Cured                        int64
Deaths                       int64
Confirmed                    int64
dtype: object

In [9]:
#checking list of the mentioned states in states/unionterritory column
df['State/UnionTerritory'].unique()

array(['Kerala', 'Telengana', 'Delhi', 'Rajasthan', 'Uttar Pradesh',
       'Haryana', 'Ladakh', 'Tamil Nadu', 'Karnataka', 'Maharashtra',
       'Punjab', 'Jammu and Kashmir', 'Andhra Pradesh', 'Uttarakhand',
       'Odisha', 'Puducherry', 'West Bengal', 'Chhattisgarh',
       'Chandigarh', 'Gujarat', 'Himachal Pradesh', 'Madhya Pradesh',
       'Bihar', 'Manipur', 'Mizoram', 'Andaman and Nicobar Islands',
       'Goa', 'Unassigned', 'Assam', 'Jharkhand', 'Arunachal Pradesh',
       'Tripura', 'Nagaland', 'Meghalaya',
       'Dadra and Nagar Haveli and Daman and Diu',
       'Cases being reassigned to states', 'Sikkim', 'Daman & Diu',
       'Lakshadweep', 'Telangana', 'Dadra and Nagar Haveli', 'Bihar****'],
      dtype=object)

In [10]:
#Renaming the wrongly typed states name
df['State/UnionTerritory'].replace(['Bihar****','Telengana','Dadra and Nagar Haveli','Daman & Diu'],['Bihar','Telangana','Dadra and Nagar Haveli and Daman and Diu','Dadra and Nagar Haveli and Daman and Diu'],inplace=True)

In [11]:
Unassigned_index = df[df['State/UnionTerritory'] == 'Unassigned'].index
Reassigned_index = df[df['State/UnionTerritory'] == 'Cases being reassigned to states'].index
df.drop(Unassigned_index,inplace=True)
df.drop(Reassigned_index,inplace=True)

In [12]:
#Rechecking list of the mentioned states in states/unionterritory column
df['State/UnionTerritory'].value_counts()

Kerala                                      525
Delhi                                       493
Telangana                                   493
Rajasthan                                   492
Uttar Pradesh                               491
Haryana                                     491
Ladakh                                      488
Tamil Nadu                                  488
Punjab                                      486
Jammu and Kashmir                           486
Karnataka                                   486
Maharashtra                                 486
Andhra Pradesh                              483
Uttarakhand                                 480
Odisha                                      479
Puducherry                                  477
West Bengal                                 477
Chandigarh                                  476
Chhattisgarh                                476
Gujarat                                     475
Himachal Pradesh                        

In [13]:
df.describe()

Unnamed: 0,Sno,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Deaths,Confirmed
count,16787.0,16787.0,16787.0,16787.0,16787.0,16787.0
mean,8445.623876,0.323822,0.039733,236921.1,3498.302258,259315.7
std,4862.049216,4.023742,0.630006,523322.9,9345.586841,568132.3
min,1.0,0.0,0.0,0.0,0.0,0.0
25%,4260.5,0.0,0.0,2802.5,24.0,3650.5
50%,8457.0,0.0,0.0,29004.0,463.0,33441.0
75%,12653.5,0.0,0.0,254740.5,3083.0,267461.5
max,16850.0,177.0,14.0,5872268.0,123531.0,6113335.0


### Getting the total number of cases in each year and months

In [14]:
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
print(df['Year'].value_counts())
print(df['Month'].value_counts())

2020    10019
2021     6768
Name: Year, dtype: int64
5     2148
6     2131
4     2028
3     1612
7     1337
1     1118
12    1106
8     1085
10    1085
9     1050
11    1050
2     1037
Name: Month, dtype: int64


In [49]:
df.head(3)

Unnamed: 0,Sno,Date,Time,State/UnionTerritory,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Deaths,Confirmed,Year,Month,Day
0,1,2020-01-30,6:00 PM,Kerala,1,0,0,0,1,2020,1,30
1,2,2020-01-31,6:00 PM,Kerala,1,0,0,0,1,2020,1,31
2,3,2020-02-01,6:00 PM,Kerala,2,0,0,0,2,2020,2,1


### Getting the average of Cured,Deaths,Confirmed from every states

In [15]:
df.groupby('State/UnionTerritory')[['Cured','Deaths','Confirmed']].mean()

Unnamed: 0_level_0,Cured,Deaths,Confirmed
State/UnionTerritory,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Andaman and Nicobar Islands,3390.053,48.238806,3571.957
Andhra Pradesh,628215.1,5125.913043,671111.4
Arunachal Pradesh,11172.49,41.872017,12143.87
Assam,159851.7,992.602592,173690.0
Bihar,214659.3,1638.82241,228990.4
Chandigarh,16765.3,250.747899,18260.1
Chhattisgarh,246141.9,3342.701681,270486.9
Dadra and Nagar Haveli and Daman and Diu,3476.312,2.055944,3700.629
Delhi,454488.2,8249.30426,480675.1
Goa,43121.62,721.447761,47505.47


In [16]:
# df.groupby('State/UnionTerritory').Deaths.mean()

# df.groupby('State/UnionTerritory').Confirmed.mean()

### Getting the total number of Cured,Death,Confirmed cases from every states 

In [17]:
df.groupby('State/UnionTerritory')[['Cured','Deaths','Confirmed']].sum()

Unnamed: 0_level_0,Cured,Deaths,Confirmed
State/UnionTerritory,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Andaman and Nicobar Islands,1589935,22624,1675248
Andhra Pradesh,303427899,2475816,324146783
Arunachal Pradesh,5150519,19303,5598324
Assam,74011348,459575,80418492
Bihar,101533848,775163,108312449
Chandigarh,7980284,119356,8691806
Chhattisgarh,117163544,1591126,128751782
Dadra and Nagar Haveli and Daman and Diu,1491338,882,1587570
Delhi,224062704,4066907,236972842
Goa,20224042,338359,22280065


In [18]:
# df.groupby('State/UnionTerritory').Deaths.sum()

# df.groupby('State/UnionTerritory').Confirmed.sum()

In [31]:
px.histogram(title='Confirmed Cases in each state',x='Confirmed',y='State/UnionTerritory',data_frame=df,color_discrete_sequence=px.colors.qualitative.Set1)

In [35]:
px.histogram(title='Cured Cases in each state',x='Cured',y='State/UnionTerritory',data_frame=df,color_discrete_sequence=px.colors.qualitative.Set2)

In [36]:
px.histogram(title='Death Cases in each state',x='Deaths',y='State/UnionTerritory',data_frame=df,color_discrete_sequence=px.colors.qualitative.Plotly)

In [44]:
#Comparing confirmed case in each year
px.histogram(x='Month',y='Confirmed',barmode='group',color='Year',data_frame=df)

In [56]:
#Comparing Cured case in each year
px.histogram(x='Month',y='Cured',barmode='group',color='Year',data_frame=df)

In [55]:
#Comparing Death case in each year
px.histogram(x='Month',y='Deaths',barmode='group',color='Year',data_frame=df)

In [64]:
px.pie(data_frame=df,names='State/UnionTerritory',values='Confirmed',title='Pie Chart of Confirmed Cases in every states of India')

In [76]:
# fig = px.choropleth(data_frame=df,geojson='https://gist.githubusercontent.com/jbrobst/56c13bbbf9d97d187fea01ca62ea5112/raw/e388c4cae20aa53cb5090210a42ebb9b765c0a36/india_states.geojson',featureidkey='properties.ST_NM',locations='State/UnionTerritory',color='Confirmed',color_continuous_scale='Reds')
# fig.update_geos(fitbounds="locations", visible=False)
# fig.show()