In [127]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from scipy import stats
from IPython.display import display, HTML

In [129]:
data = pd.read_csv('patients_data.csv',low_memory=False)

In [131]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247236 entries, 0 to 247235
Data columns (total 22 columns):
 #   Column                                     Non-Null Count   Dtype  
---  ------                                     --------------   -----  
 0   Patient Number                             99793 non-null   float64
 1   State Patient Number                       110759 non-null  object 
 2   Date Announced                             247234 non-null  object 
 3   Estimated Onset Date                       0 non-null       float64
 4   Age Bracket                                108030 non-null  object 
 5   Gender                                     110800 non-null  object 
 6   Detected City                              12052 non-null   object 
 7   Detected District                          238066 non-null  object 
 8   Detected State                             247217 non-null  object 
 9   State code                                 247114 non-null  object 
 10  Current 

In [157]:
#Dropping the columns which are not required
data_new=data.drop(['State Patient Number','Estimated Onset Date','State code','Notes','Contracted from which Patient (Suspected)','Nationality','Source_1','Source_2','Source_3','Backup Notes','Num Cases','Entry_ID'], axis=1)
print(data_new)

        Patient Number Date Announced Age Bracket Gender  \
0                  1.0     30-01-2020          20      F   
1                  2.0     02-02-2020         NaN    NaN   
2                  3.0     03-02-2020         NaN    NaN   
3                  4.0     02-03-2020          45      M   
4                  5.0     02-03-2020          24      M   
...                ...            ...         ...    ...   
247231             NaN     06-08-2020         NaN    NaN   
247232             NaN     06-08-2020         NaN    NaN   
247233             NaN     06-08-2020         NaN    NaN   
247234             NaN     06-08-2020         NaN    NaN   
247235             NaN            NaN         NaN    NaN   

                   Detected City Detected District Detected State  \
0                       Thrissur          Thrissur         Kerala   
1                      Alappuzha         Alappuzha         Kerala   
2                      Kasaragod         Kasaragod         Kerala   
3  

In [159]:
# Dropping Rows (with Nan values) and Columns
new=data_new.dropna (subset=['Detected District','Detected State','Detected City'])
new.drop(columns=['Patient Number', 'Status Change Date'], inplace=True)
# Handling Gender Attribute (Forward Filling)
new['Gender']=new['Gender'].ffill()
new.isnull().sum()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new.drop(columns=['Patient Number', 'Status Change Date'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['Gender']=new['Gender'].ffill()


Date Announced              0
Age Bracket              1548
Gender                      0
Detected City               0
Detected District           0
Detected State              0
Current Status              0
Type of transmission    11084
dtype: int64

In [161]:
#Handling Age Bracket Attribute (Mean Imputation)
new['Age Bracket'].dtype

dtype('O')

In [163]:
new['Age Bracket']=pd.to_numeric(new['Age Bracket'],errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['Age Bracket']=pd.to_numeric(new['Age Bracket'],errors='coerce')


In [165]:
new['Age Bracket'].fillna(int(new['Age Bracket'].mean()), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  new['Age Bracket'].fillna(int(new['Age Bracket'].mean()), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['Age Bracket'].fillna(int(new['Age Bracket'].mean()), inplace=True)


In [167]:
# Handling Current Status Attribute
new['Current Status'].replace(np.nan, data['Current Status'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  new['Current Status'].replace(np.nan, data['Current Status'].mode()[0], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['Current Status'].replace(np.nan, data['Current Status'].mode()[0], inplace=True)


In [169]:
# Handling Type Of Transmission Attribute (Replacing Nan values with TBD)
new['Type of transmission'].replace(to_replace=np.nan, value='TBD', inplace=True)
new.insert(0, 'Patient Number', range (1, 1+len (new)))


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  new['Type of transmission'].replace(to_replace=np.nan, value='TBD', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['Type of transmission'].replace(to_replace=np.nan, value='TBD', inplace=True)


In [171]:
new.isnull().sum()

Patient Number          0
Date Announced          0
Age Bracket             0
Gender                  0
Detected City           0
Detected District       0
Detected State          0
Current Status          0
Type of transmission    0
dtype: int64