In [7]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Seed both random generators used below (stdlib `random` and NumPy's global
# generator) so the synthetic data set is reproducible.
random.seed(123)
np.random.seed(123)

# Generate sample data: one entry per patient for every column.
num_patients = 100

# Zero-padded identifiers: 'P001', 'P002', ...
patient_ids = np.array(['P{:03}'.format(i) for i in range(1, num_patients + 1)])
# Admission falls 0..365 days (inclusive) after 2022-01-01; discharge is 1..14 days after admission.
admission_dates = np.array([datetime(2022, 1, 1) + timedelta(days=random.randint(0, 365)) for _ in range(num_patients)])
discharge_dates = np.array([admission + timedelta(days=random.randint(1, 14)) for admission in admission_dates])
# None and '' are among the choices, so these columns contain missing and blank entries.
diagnoses = np.random.choice(['Heart Failure', 'Pneumonia', 'Stroke', 'Heart Attack', 'Asthma', None, '', ], size=num_patients)
comorbidities = np.random.choice(['Hypertension', 'Diabetes', None, '', ], size=num_patients)
# The medication lists have different lengths (and one entry is None), so the
# array must be created with dtype=object: each list stays a single element of
# a 1-D array. Without it NumPy warns about ragged nested sequences and, from
# NumPy 1.24 on, raises ValueError.
medications = np.array([['ACE Inhibitors', 'Diuretics'],
                        ['Antibiotics', 'Insulin'],
                        ['Antiplatelets', 'Blood Pressure Medications'],
                        ['Statins', 'Beta-blockers', 'Aspirin', 'Insulin'],
                        ['Inhalers', 'Corticosteroids'], None], dtype=object)
# Drawn independently from [-1, 10): not derived from the admission/discharge
# dates above, and it can be negative.
length_of_stay = np.random.randint(-1, 10, size=num_patients)
# 'Yes' / 'No' / missing with probabilities 0.2 / 0.7 / 0.1.
readmission_status = np.random.choice(['Yes', 'No', None], size=num_patients, p=[0.2, 0.7, .1])
ages = np.random.randint(18, 90, size=num_patients)
genders = np.random.choice(['Male', 'Female'], size=num_patients)

# Create the DataFrame: stack the per-column arrays as rows, then transpose so
# that each row is one patient. A medication entry is sampled per patient here.
data = np.vstack((patient_ids, admission_dates, discharge_dates, diagnoses, comorbidities,
                  np.random.choice(medications, size=num_patients),
                  length_of_stay, readmission_status, ages, genders)).T

columns = ['Patient ID', 'Admission Date', 'Discharge Date', 'Primary Diagnosis', 'Comorbidities',
           'Medication History', 'Length of Stay', 'Readmission Status', 'Age', 'Gender']

df = pd.DataFrame(data, columns=columns)
df.head()


  medications = np.array([['ACE Inhibitors', 'Diuretics'],


Unnamed: 0,Patient ID,Admission Date,Discharge Date,Primary Diagnosis,Comorbidities,Medication History,Length of Stay,Readmission Status,Age,Gender
0,P001,2022-01-27,2022-02-07,,,"[Statins, Beta-blockers, Aspirin, Insulin]",8,No,38,Female
1,P002,2022-05-18,2022-05-23,,Diabetes,,9,No,48,Female
2,P003,2022-02-14,2022-02-22,,,,0,No,87,Male
3,P004,2022-07-28,2022-08-10,Stroke,,"[Inhalers, Corticosteroids]",6,,32,Female
4,P005,2022-05-17,2022-05-28,Asthma,,,0,Yes,74,Male


In [8]:
# Structural overview of df: index range, per-column non-null counts, dtypes
# and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Patient ID          100 non-null    object        
 1   Admission Date      100 non-null    datetime64[ns]
 2   Discharge Date      100 non-null    datetime64[ns]
 3   Primary Diagnosis   90 non-null     object        
 4   Comorbidities       75 non-null     object        
 5   Medication History  73 non-null     object        
 6   Length of Stay      100 non-null    object        
 7   Readmission Status  92 non-null     object        
 8   Age                 100 non-null    object        
 9   Gender              100 non-null    object        
dtypes: datetime64[ns](2), object(8)
memory usage: 7.9+ KB


In [9]:
# Summary of every column of df. With the columns held as objects/datetimes
# the report shows count, unique, top and freq rather than numeric statistics.
df.describe()

  df.describe()
  df.describe()


Unnamed: 0,Patient ID,Admission Date,Discharge Date,Primary Diagnosis,Comorbidities,Medication History,Length of Stay,Readmission Status,Age,Gender
count,100,100,100,90.0,75.0,73,100.0,92,100.0,100
unique,100,88,93,6.0,3.0,5,11.0,2,53.0,2
top,P001,2022-06-22 00:00:00,2022-02-07 00:00:00,,,"[Statins, Beta-blockers, Aspirin, Insulin]",6.0,No,89.0,Male
freq,1,3,2,16.0,27.0,18,14.0,75,6.0,58
first,,2022-01-01 00:00:00,2022-01-02 00:00:00,,,,,,,
last,,2022-12-26 00:00:00,2022-12-29 00:00:00,,,,,,,


In [10]:
# Element-wise missing-value mask: True where a cell is missing (e.g. None).
# Blank strings ('') are ordinary values and are NOT flagged as missing.
df.isna()

Unnamed: 0,Patient ID,Admission Date,Discharge Date,Primary Diagnosis,Comorbidities,Medication History,Length of Stay,Readmission Status,Age,Gender
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,True,False,True,False,False,False,False
2,False,False,False,False,True,True,False,False,False,False
3,False,False,False,False,False,False,False,True,False,False
4,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
95,False,False,False,False,True,False,False,False,False,False
96,False,False,False,True,True,False,False,False,False,False
97,False,False,False,False,False,True,False,False,False,False
98,False,False,False,False,False,False,False,False,False,False


In [11]:
# isnull() is an alias of isna(); it produces the same boolean mask.
df.isnull()

Unnamed: 0,Patient ID,Admission Date,Discharge Date,Primary Diagnosis,Comorbidities,Medication History,Length of Stay,Readmission Status,Age,Gender
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,True,False,True,False,False,False,False
2,False,False,False,False,True,True,False,False,False,False
3,False,False,False,False,False,False,False,True,False,False
4,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
95,False,False,False,False,True,False,False,False,False,False
96,False,False,False,True,True,False,False,False,False,False
97,False,False,False,False,False,True,False,False,False,False
98,False,False,False,False,False,False,False,False,False,False


In [12]:
# Count the missing values in each column (each True in the mask counts as 1).
df.isna().sum()

Patient ID             0
Admission Date         0
Discharge Date         0
Primary Diagnosis     10
Comorbidities         25
Medication History    27
Length of Stay         0
Readmission Status     8
Age                    0
Gender                 0
dtype: int64

In [16]:
# Number of missing entries in the 'Readmission Status' column.
df['Readmission Status'].isna().sum()

8

In [15]:
# Show the column with missing entries replaced by the placeholder "filled".
# NOTE(review): fillna() returns a new Series and the result is only displayed
# here, not assigned, so df itself is left unchanged. Assign the result back to
# the column if the fill is meant to persist.
df['Readmission Status'].fillna("filled")

0         No
1         No
2         No
3     filled
4        Yes
       ...  
95        No
96        No
97        No
98        No
99        No
Name: Readmission Status, Length: 100, dtype: object

In [17]:
# Re-check the missing count for 'Readmission Status'. A fillna() whose result
# was not assigned back to df does not change this number.
df['Readmission Status'].isna().sum()

8