In [1]:
#create synthetic data
#load it into dataframe
#perform data cleaning


In [2]:
import numpy as np
import pandas as pd
from datetime import datetime

# step 1. create synthetic healthcare data

In [3]:
data={
    "patient_id": [1,2,3,4,5,6,7,8],
    "name":["AB","BC","CD","DE","EF","GH","IJ","KL"],
    "age":[34,np.nan,45,29,50,62,np.nan,34],
    "gender":["M","F","M","F","M","F","M","F"],
    "blood_pressure":["120/80",None,"140/90","abc","130/85","150/85","120/80",None],
    "diagnosis":["Hypertension","Diabetes","hypertension","HYPERTENSION","Pre-diabetes","diabetes","None","Pre-diabetes"],
    "data_of_visit":["2023-01-10","2023-02-19","15/02/2023","2023-05-23","2023-02-09","invalid_date","18/02/2023","10/04/2024"]
}

In [4]:
df = pd.DataFrame(data)

In [5]:
df

Unnamed: 0,patient_id,name,age,gender,blood_pressure,diagnosis,data_of_visit
0,1,AB,34.0,M,120/80,Hypertension,2023-01-10
1,2,BC,,F,,Diabetes,2023-02-19
2,3,CD,45.0,M,140/90,hypertension,15/02/2023
3,4,DE,29.0,F,abc,HYPERTENSION,2023-05-23
4,5,EF,50.0,M,130/85,Pre-diabetes,2023-02-09
5,6,GH,62.0,F,150/85,diabetes,invalid_date
6,7,IJ,,M,120/80,,18/02/2023
7,8,KL,34.0,F,,Pre-diabetes,10/04/2024


In [6]:
df['diagnosis']=df['diagnosis'].str.lower().str.strip()
df

Unnamed: 0,patient_id,name,age,gender,blood_pressure,diagnosis,data_of_visit
0,1,AB,34.0,M,120/80,hypertension,2023-01-10
1,2,BC,,F,,diabetes,2023-02-19
2,3,CD,45.0,M,140/90,hypertension,15/02/2023
3,4,DE,29.0,F,abc,hypertension,2023-05-23
4,5,EF,50.0,M,130/85,pre-diabetes,2023-02-09
5,6,GH,62.0,F,150/85,diabetes,invalid_date
6,7,IJ,,M,120/80,none,18/02/2023
7,8,KL,34.0,F,,pre-diabetes,10/04/2024


In [7]:
df['diagnosis'].replace(["none","null",""],np.nan,inplace=True)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['diagnosis'].replace(["none","null",""],np.nan,inplace=True)


Unnamed: 0,patient_id,name,age,gender,blood_pressure,diagnosis,data_of_visit
0,1,AB,34.0,M,120/80,hypertension,2023-01-10
1,2,BC,,F,,diabetes,2023-02-19
2,3,CD,45.0,M,140/90,hypertension,15/02/2023
3,4,DE,29.0,F,abc,hypertension,2023-05-23
4,5,EF,50.0,M,130/85,pre-diabetes,2023-02-09
5,6,GH,62.0,F,150/85,diabetes,invalid_date
6,7,IJ,,M,120/80,,18/02/2023
7,8,KL,34.0,F,,pre-diabetes,10/04/2024


In [8]:
df['age']=df['age'].fillna(df['age'].median())

In [9]:
df

Unnamed: 0,patient_id,name,age,gender,blood_pressure,diagnosis,data_of_visit
0,1,AB,34.0,M,120/80,hypertension,2023-01-10
1,2,BC,39.5,F,,diabetes,2023-02-19
2,3,CD,45.0,M,140/90,hypertension,15/02/2023
3,4,DE,29.0,F,abc,hypertension,2023-05-23
4,5,EF,50.0,M,130/85,pre-diabetes,2023-02-09
5,6,GH,62.0,F,150/85,diabetes,invalid_date
6,7,IJ,39.5,M,120/80,,18/02/2023
7,8,KL,34.0,F,,pre-diabetes,10/04/2024


In [10]:
df['blood_pressure']=df['blood_pressure'].where(df['blood_pressure'].str.contains(r'^\d+/\d+$'))
df

Unnamed: 0,patient_id,name,age,gender,blood_pressure,diagnosis,data_of_visit
0,1,AB,34.0,M,120/80,hypertension,2023-01-10
1,2,BC,39.5,F,,diabetes,2023-02-19
2,3,CD,45.0,M,140/90,hypertension,15/02/2023
3,4,DE,29.0,F,,hypertension,2023-05-23
4,5,EF,50.0,M,130/85,pre-diabetes,2023-02-09
5,6,GH,62.0,F,150/85,diabetes,invalid_date
6,7,IJ,39.5,M,120/80,,18/02/2023
7,8,KL,34.0,F,,pre-diabetes,10/04/2024


r'^\d+/\d+$'   --> regex pattern
- ^ -> start of string
- \d -> one or more digits
- / -> a forward slash
- \d -> one or more digits
- $ -> marks end of the string

In [11]:
df['data_of_visit']= pd.to_datetime(df['data_of_visit'], errors='coerce')
df

Unnamed: 0,patient_id,name,age,gender,blood_pressure,diagnosis,data_of_visit
0,1,AB,34.0,M,120/80,hypertension,2023-01-10
1,2,BC,39.5,F,,diabetes,2023-02-19
2,3,CD,45.0,M,140/90,hypertension,NaT
3,4,DE,29.0,F,,hypertension,2023-05-23
4,5,EF,50.0,M,130/85,pre-diabetes,2023-02-09
5,6,GH,62.0,F,150/85,diabetes,NaT
6,7,IJ,39.5,M,120/80,,NaT
7,8,KL,34.0,F,,pre-diabetes,NaT


coerce (not a time)

In [12]:
df.drop_duplicates(inplace=True)g

In [13]:
df

Unnamed: 0,patient_id,name,age,gender,blood_pressure,diagnosis,data_of_visit
0,1,AB,34.0,M,120/80,hypertension,2023-01-10
1,2,BC,39.5,F,,diabetes,2023-02-19
2,3,CD,45.0,M,140/90,hypertension,NaT
3,4,DE,29.0,F,,hypertension,2023-05-23
4,5,EF,50.0,M,130/85,pre-diabetes,2023-02-09
5,6,GH,62.0,F,150/85,diabetes,NaT
6,7,IJ,39.5,M,120/80,,NaT
7,8,KL,34.0,F,,pre-diabetes,NaT


In [14]:
df.dropna(subset=['diagnosis','data_of_visit'],inplace=True)
df

Unnamed: 0,patient_id,name,age,gender,blood_pressure,diagnosis,data_of_visit
0,1,AB,34.0,M,120/80,hypertension,2023-01-10
1,2,BC,39.5,F,,diabetes,2023-02-19
3,4,DE,29.0,F,,hypertension,2023-05-23
4,5,EF,50.0,M,130/85,pre-diabetes,2023-02-09


In [15]:
df.dropna(subset=['blood_pressure'],inplace=True)
df

Unnamed: 0,patient_id,name,age,gender,blood_pressure,diagnosis,data_of_visit
0,1,AB,34.0,M,120/80,hypertension,2023-01-10
4,5,EF,50.0,M,130/85,pre-diabetes,2023-02-09


In [16]:
# read_csv