In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the patient-level records from the CSV file in the current working directory.
df = pd.read_csv('covid19india.csv')

In [3]:
df.head()

Unnamed: 0,patientId,reportedOn,onsetEstimate,ageEstimate,gender,city,district,state,status,notes,contractedFrom
0,1,30/01/2020,,20.0,female,Thrissur,Thrissur,Kerala,Recovered,Travelled from Wuhan,
1,2,02/02/2020,,,,Alappuzha,Alappuzha,Kerala,Recovered,Travelled from Wuhan,
2,3,03/02/2020,,,,Kasaragod,Kasaragod,Kerala,Recovered,Travelled from Wuhan,
3,4,02/03/2020,,45.0,male,East Delhi (Mayur Vihar),East Delhi,Delhi,Recovered,"Travelled from Austria, Italy",
4,5,02/03/2020,,24.0,male,Hyderabad,Hyderabad,Telangana,Recovered,"Travelled from Dubai to Bangalore on 20th Feb,...",


In [4]:
df.columns

Index(['patientId', 'reportedOn', 'onsetEstimate', 'ageEstimate', 'gender',
       'city', 'district', 'state', 'status', 'notes', 'contractedFrom'],
      dtype='object')

We only need the following columns:

`patientId`, `reportedOn`, `ageEstimate`, `gender`, `state`, `status`

Hence we will drop the rest of the columns.

In [5]:
# Discard the columns this analysis does not use; df is modified in place.
unused_columns = ['onsetEstimate', 'city', 'district', 'notes', 'contractedFrom']
df.drop(columns=unused_columns, inplace=True)

In [6]:
df.head()

Unnamed: 0,patientId,reportedOn,ageEstimate,gender,state,status
0,1,30/01/2020,20.0,female,Kerala,Recovered
1,2,02/02/2020,,,Kerala,Recovered
2,3,03/02/2020,,,Kerala,Recovered
3,4,02/03/2020,45.0,male,Delhi,Recovered
4,5,02/03/2020,24.0,male,Telangana,Recovered


In [7]:
# Snapshot of the remaining column names as a plain Python list.
df_columns = list(df.columns)

In [8]:
df.isnull().sum()

patientId          0
reportedOn         0
ageEstimate    25545
gender         22578
state              1
status             0
dtype: int64

We can see that most of the null values are in the columns 'ageEstimate' and 'gender'. There are far too many of them to simply drop those rows. In contrast, 'state' has only one null value, so we can drop that particular row.

Dropping that particular row

In [9]:
# Remove the rows whose 'state' is missing (a single row according to the null counts).
# The surviving rows keep their original index labels; nothing is renumbered.
df = df.dropna(subset=['state'])

In [10]:
df['gender'].value_counts()

male      3547
female    1766
Name: gender, dtype: int64

In [11]:
df.isnull().sum()

patientId          0
reportedOn         0
ageEstimate    25544
gender         22577
state              0
status             0
dtype: int64

In [12]:
# Ratio of reported males to reported females.
# The counts are looked up by label rather than by position in value_counts(), so the
# result is always male / female even if 'female' were the more frequent value.
gender_counts = df['gender'].value_counts()
ratio_of_male_to_female = gender_counts['male'] / gender_counts['female']
print("Ratio of number of males to female given is", round(ratio_of_male_to_female, 2))

Ratio of number of males to female given is 2.01


In [13]:
import random

# Fill every missing gender with a random code in a single masked assignment:
# 0 stands for male and 1 for female (the codes are translated to labels by the
# `gender` mapping applied afterwards).
#
# This replaces a `while` loop that called df['gender'].fillna(..., inplace=True, limit=1)
# once per missing row. That form rescans the column for every filled value and, because
# it is a chained in-place call on a column selection, it never writes back to df under
# pandas copy-on-write, so the loop would not terminate there.
#
# NOTE(review): the codes are drawn uniformly (50/50), as before, not in the ~2:1
# male-to-female ratio observed in the reported values; confirm this is intended.
gender_rng = random.Random(42)  # fixed seed so Restart & Run All reproduces the same fill
missing_gender = df['gender'].isnull()
df.loc[missing_gender, 'gender'] = [
    gender_rng.randint(0, 1) for _ in range(int(missing_gender.sum()))
]

In [14]:
df['gender'].value_counts()

0         11308
1         11269
male       3547
female     1766
Name: gender, dtype: int64

In [15]:
# Normalise every gender value to 'Male' / 'Female': the original lowercase labels and
# the random fill codes (0, 1) are all translated through one lookup table.
gender = {
    'male': 'Male',
    'female': 'Female',
    0: 'Male',
    1: 'Female',
}
df['gender'] = df['gender'].map(gender)

In [16]:
df['gender'].value_counts()

Male      14855
Female    13035
Name: gender, dtype: int64

In [17]:
df.isnull().sum()

patientId          0
reportedOn         0
ageEstimate    25544
gender             0
state              0
status             0
dtype: int64

In [18]:
df.shape

(27890, 6)

In [19]:
# Keep a full copy of the frame (all rows) before the rows with a missing ageEstimate
# are dropped from df; the reconstructed ages are written into this copy later.
test = df.copy()

In [20]:
test.shape

(27890, 6)

In [21]:
df.shape

(27890, 6)

In [22]:
# Boolean mask, one entry per row: True where ageEstimate is missing.
to_be_dropped = df['ageEstimate'].isna()

In [23]:
to_be_dropped.shape

(27890,)

In [24]:
# Keep only the rows that have an ageEstimate, so the known ages can be parsed and averaged.
df = df[df['ageEstimate'].notna()]

In [25]:
df.shape

(2346, 6)

In [26]:
test.shape

(27890, 6)

In [27]:
# Inspect the Python type of the first remaining ageEstimate value.
print(type(df['ageEstimate'].iloc[0]))

<class 'str'>


As the values in ageEstimate are strings rather than numbers, we can't take their mean directly, so we first need to convert them to floats.

In [28]:
# Parse the known ages into floats, in row order.
# The single range-style entry '28-35' cannot be parsed by float(), so it is replaced by '30' first.
age = [
    float('30' if raw_age == '28-35' else raw_age)
    for raw_age in df['ageEstimate']
]

In [29]:
# Mean of the known ages, rounded to the nearest whole number; used as the fill value
# for the rows whose age is missing.
age_mean = round(np.mean(age))
age_mean

38.0

In [30]:
# Rebuild a full-length age list, one entry per row of the pre-drop frame:
# rows that had an age take the next parsed value from `age` (consumed in order),
# rows flagged as missing in `to_be_dropped` take the rounded mean age.
t = 0  # position of the next unused value in `age`

dropped_age = []
for age_was_missing in to_be_dropped.values:
    if age_was_missing:
        dropped_age.append(age_mean)
    else:
        dropped_age.append(age[t])
        t += 1

Joining the reconstructed ages (the known values, plus the mean age for the rows flagged in `to_be_dropped`) back onto the original table

In [31]:
len(dropped_age)

27890

In [32]:
test.shape

(27890, 6)

In [33]:
# Write the reconstructed ages into the full table, row by row.
# `dropped_age` is ordered by row position, so the Series is built on test's own index.
# A bare pd.Series(dropped_age) would carry a fresh 0..n-1 index and be aligned by label;
# because a row was dropped from the frame earlier without renumbering, that alignment
# shifts ages onto the wrong rows and leaves a NaN where a label has no match.
test['ageEstimate'] = pd.Series(dropped_age, index=test.index)

In [34]:
# Drop every row that still has a missing value in any column.
# NOTE(review): this removes rows silently; compare test.shape before and after to
# confirm that no rows are lost unexpectedly.
test = test.dropna()

In [35]:
test.isnull().sum()

patientId      0
reportedOn     0
ageEstimate    0
gender         0
state          0
status         0
dtype: int64

In [36]:
# Write the cleaned frame to disk. to_csv keeps its default index=True here, so the row
# index is written as an extra, unnamed first column.
test.to_csv('finalcovid19india.csv')