# Importing the libraries 

In [1]:
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np 


# Loading the Dataset

In [2]:
df = pd.read_csv('covid_19_india.csv')
df.head()

Unnamed: 0,Sno,Date,Time,State/UnionTerritory,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Deaths,Confirmed
0,1,2020-01-30,6:00 PM,Kerala,1,0,0,0,1
1,2,2020-01-31,6:00 PM,Kerala,1,0,0,0,1
2,3,2020-02-01,6:00 PM,Kerala,2,0,0,0,2
3,4,2020-02-02,6:00 PM,Kerala,3,0,0,0,3
4,5,2020-02-03,6:00 PM,Kerala,3,0,0,0,3


# Learning about the Data

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18110 entries, 0 to 18109
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Sno                       18110 non-null  int64 
 1   Date                      18110 non-null  object
 2   Time                      18110 non-null  object
 3   State/UnionTerritory      18110 non-null  object
 4   ConfirmedIndianNational   18110 non-null  object
 5   ConfirmedForeignNational  18110 non-null  object
 6   Cured                     18110 non-null  int64 
 7   Deaths                    18110 non-null  int64 
 8   Confirmed                 18110 non-null  int64 
dtypes: int64(4), object(5)
memory usage: 1.2+ MB


In [4]:
df.nunique()

Sno                         18110
Date                          560
Time                            7
State/UnionTerritory           46
ConfirmedIndianNational        62
ConfirmedForeignNational       12
Cured                       14445
Deaths                       6471
Confirmed                   14971
dtype: int64

In [5]:
df.describe()

Unnamed: 0,Sno,Cured,Deaths,Confirmed
count,18110.0,18110.0,18110.0,18110.0
mean,9055.5,278637.5,4052.402264,301031.4
std,5228.051023,614890.9,10919.076411,656148.9
min,1.0,0.0,0.0,0.0
25%,4528.25,3360.25,32.0,4376.75
50%,9055.5,33364.0,588.0,39773.5
75%,13582.75,278869.8,3643.75,300149.8
max,18110.0,6159676.0,134201.0,6363442.0


In [6]:
df.shape

(18110, 9)

# Describing the null present in the data 

In [7]:
df.isnull().sum()

Sno                         0
Date                        0
Time                        0
State/UnionTerritory        0
ConfirmedIndianNational     0
ConfirmedForeignNational    0
Cured                       0
Deaths                      0
Confirmed                   0
dtype: int64

# Removing the unwanted row

In [8]:
del df['Sno']

In [9]:
# pre-processing the data 

In [10]:
df['Date'] = pd.to_datetime(df['Date'])


In [11]:
df['day'] = df['Date'].dt.day
df['month'] = df['Date'].dt.month
df['year']= df['Date'].dt.year
df.head()

Unnamed: 0,Date,Time,State/UnionTerritory,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Deaths,Confirmed,day,month,year
0,2020-01-30,6:00 PM,Kerala,1,0,0,0,1,30,1,2020
1,2020-01-31,6:00 PM,Kerala,1,0,0,0,1,31,1,2020
2,2020-02-01,6:00 PM,Kerala,2,0,0,0,2,1,2,2020
3,2020-02-02,6:00 PM,Kerala,3,0,0,0,3,2,2,2020
4,2020-02-03,6:00 PM,Kerala,3,0,0,0,3,3,2,2020


In [12]:
df['Time'] = pd.to_datetime(df['Time'])

In [13]:
df['Hour'] = df['Time'].dt.hour

In [14]:
del df['Date']
del df['Time']

In [15]:
len(df['State/UnionTerritory'].unique())

46

In [16]:
df.head()

Unnamed: 0,State/UnionTerritory,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Deaths,Confirmed,day,month,year,Hour
0,Kerala,1,0,0,0,1,30,1,2020,18
1,Kerala,1,0,0,0,1,31,1,2020,18
2,Kerala,2,0,0,0,2,1,2,2020,18
3,Kerala,3,0,0,0,3,2,2,2020,18
4,Kerala,3,0,0,0,3,3,2,2020,18


In [17]:
# pre processing the confirmedIndianNational

In [18]:
data = []
for i in df['ConfirmedIndianNational']:
    if i == '-':
        data.append(int(i.replace('-','0')))
    else:
        data.append(int(i))
df['ConfirmedIndianNational'] = data

In [19]:
# preprocessing the ConfirmedForeignNational column 

In [20]:
data = []
for i in df['ConfirmedForeignNational']:
    if i == '-':
        data.append(int(i.replace('-','0')))
    else:
        data.append(int(i))
df['ConfirmedForeignNational'] = data

In [21]:
# preprocessing the Cured column 

In [22]:
cures = []
for i in df['Cured']:
    cures.append(int(i))
df['Cured'] = cures
df.head()

Unnamed: 0,State/UnionTerritory,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Deaths,Confirmed,day,month,year,Hour
0,Kerala,1,0,0,0,1,30,1,2020,18
1,Kerala,1,0,0,0,1,31,1,2020,18
2,Kerala,2,0,0,0,2,1,2,2020,18
3,Kerala,3,0,0,0,3,2,2,2020,18
4,Kerala,3,0,0,0,3,3,2,2020,18


In [23]:
# preprocessing the confirmed column 

In [24]:
confirm = []
for i in df['Confirmed']:
    confirm.append(int(i))
df['Confirmed'] = confirm


In [26]:
states = []

for state in df['State/UnionTerritory']:
    states.append(state)
    
states =list(set(states))

In [42]:
df[df['State/UnionTerritory'] == 'Cases being reassigned to states']
    

Unnamed: 0,State/UnionTerritory,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Deaths,Confirmed,day,month,year,Hour
2133,Cases being reassigned to states,0,0,0,0,1096,20,5,2020,8
2167,Cases being reassigned to states,0,0,0,0,1403,21,5,2020,8
2201,Cases being reassigned to states,0,0,0,0,1620,22,5,2020,8
2235,Cases being reassigned to states,0,0,0,0,1899,23,5,2020,8
2270,Cases being reassigned to states,0,0,0,0,2338,24,5,2020,8
2305,Cases being reassigned to states,0,0,0,0,2642,25,5,2020,8
2341,Cases being reassigned to states,0,0,0,0,2970,26,5,2020,8
2377,Cases being reassigned to states,0,0,0,0,4013,27,5,2020,8
2413,Cases being reassigned to states,0,0,0,0,4332,28,5,2020,8
2449,Cases being reassigned to states,0,0,0,0,4673,29,5,2020,8


In [40]:
print(len(df))

18110


In [41]:
18050/18110

0.9966869133075649

In [34]:
for state in states:
    count = 0
    for i in df.values:
        if state == i[0]:
            count += i[1]
    print(state,'|',count)
            

Meghalaya | 0
Dadra and Nagar Haveli and Daman and Diu | 0
Mizoram | 4
Assam | 0
Cases being reassigned to states | 0
Daman & Diu | 0
Dadra and Nagar Haveli | 0
Andhra Pradesh | 81
Puducherry | 11
Ladakh | 162
Nagaland | 0
West Bengal | 71
Madhya Pradesh*** | 0
Telangana | 0
Uttar Pradesh | 462
Arunachal Pradesh | 0
Andaman and Nicobar Islands | 8
Telengana | 246
Chhattisgarh | 25
Delhi | 352
Karnataka | 405
Haryana | 111
Jharkhand | 0
Rajasthan | 296
Gujarat | 256
Madhya Pradesh | 105
Maharashtra*** | 0
Maharashtra | 1111
Uttarakhand | 35
Lakshadweep | 0
Karanataka | 0
Himachal Pradesh | 21
Bihar | 32
Kerala | 1091
Punjab | 231
Bihar**** | 0
Manipur | 5
Unassigned | 0
Tripura | 0
Sikkim | 0
Chandigarh | 50
Goa | 9
Odisha | 23
Tamil Nadu | 138
Himanchal Pradesh | 0
Jammu and Kashmir | 95


# Column analysis

In [10]:
df[df['State/UnionTerritory'] == 'Kerala']

Unnamed: 0,Date,Time,State/UnionTerritory,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Deaths,Confirmed
0,2020-01-30,6:00 PM,Kerala,1,0,0,0,1
1,2020-01-31,6:00 PM,Kerala,1,0,0,0,1
2,2020-02-01,6:00 PM,Kerala,2,0,0,0,2
3,2020-02-02,6:00 PM,Kerala,3,0,0,0,3
4,2020-02-03,6:00 PM,Kerala,3,0,0,0,3
...,...,...,...,...,...,...,...,...
17946,2021-08-07,8:00 AM,Kerala,-,-,3317314,17515,3513551
17982,2021-08-08,8:00 AM,Kerala,-,-,3337579,17654,3533918
18018,2021-08-09,8:00 AM,Kerala,-,-,3357687,17747,3552525
18054,2021-08-10,8:00 AM,Kerala,-,-,3377691,17852,3565574


In [11]:
lst = []
for i in (df['State/UnionTerritory'].unique()):
    lst.append(df[df['State/UnionTerritory'] == i])


In [12]:
lst

[             Date     Time State/UnionTerritory ConfirmedIndianNational  \
 0      2020-01-30  6:00 PM               Kerala                       1   
 1      2020-01-31  6:00 PM               Kerala                       1   
 2      2020-02-01  6:00 PM               Kerala                       2   
 3      2020-02-02  6:00 PM               Kerala                       3   
 4      2020-02-03  6:00 PM               Kerala                       3   
 ...           ...      ...                  ...                     ...   
 17946  2021-08-07  8:00 AM               Kerala                       -   
 17982  2021-08-08  8:00 AM               Kerala                       -   
 18018  2021-08-09  8:00 AM               Kerala                       -   
 18054  2021-08-10  8:00 AM               Kerala                       -   
 18090  2021-08-11  8:00 AM               Kerala                       -   
 
       ConfirmedForeignNational    Cured  Deaths  Confirmed  
 0                      

In [13]:
df.head()

Unnamed: 0,Date,Time,State/UnionTerritory,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Deaths,Confirmed
0,2020-01-30,6:00 PM,Kerala,1,0,0,0,1
1,2020-01-31,6:00 PM,Kerala,1,0,0,0,1
2,2020-02-01,6:00 PM,Kerala,2,0,0,0,2
3,2020-02-02,6:00 PM,Kerala,3,0,0,0,3
4,2020-02-03,6:00 PM,Kerala,3,0,0,0,3


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000024F41F39760>