In [1]:
import pandas as pd
from datetime import datetime

# Load the raw appointment records from the CSV export in the working directory.
DATA_FILE = "KaggleV2-May-2016.csv"
df = pd.read_csv(DATA_FILE)

In [2]:
# Count the null entries in each column and report the totals.

missing = df.isna().sum()
print("Missing Values:\n", missing)

Missing Values:
 PatientId         0
AppointmentID     0
Gender            0
ScheduledDay      0
AppointmentDay    0
Age               0
Neighbourhood     0
Scholarship       0
Hipertension      0
Diabetes          0
Alcoholism        0
Handcap           0
SMS_received      0
No-show           0
dtype: int64


#### Handling missing values

In [3]:
# 1) Drop rows with missing values
df_cleaned = df.dropna()

# 2) Alternative strategy: impute instead of dropping.
#    Text columns are filled with their mode, all other columns with their mean.
df_filled = df.copy()
for col in df.columns:
    column = df[col]
    # Recognise both legacy object columns and pandas' dedicated string dtypes;
    # comparing the dtype to 'object' alone misses the latter and would send
    # text columns to .mean(), which raises.
    is_text = pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)
    if is_text:
        modes = column.mode()
        # mode() is empty when the column has no non-null values; there is
        # nothing to impute with, so the copied column is left as it is.
        if not modes.empty:
            df_filled[col] = column.fillna(modes.iloc[0])
    else:
        df_filled[col] = column.fillna(column.mean())

In [4]:
# Preview the first five rows of the frame that had incomplete rows dropped.
preview = df_cleaned.head(n=5)
print(preview)

      PatientId  AppointmentID Gender          ScheduledDay  \
0  2.987250e+13        5642903      F  2016-04-29T18:38:08Z   
1  5.589978e+14        5642503      M  2016-04-29T16:08:27Z   
2  4.262962e+12        5642549      F  2016-04-29T16:19:04Z   
3  8.679512e+11        5642828      F  2016-04-29T17:29:31Z   
4  8.841186e+12        5642494      F  2016-04-29T16:07:23Z   

         AppointmentDay  Age      Neighbourhood  Scholarship  Hipertension  \
0  2016-04-29T00:00:00Z   62    JARDIM DA PENHA            0             1   
1  2016-04-29T00:00:00Z   56    JARDIM DA PENHA            0             0   
2  2016-04-29T00:00:00Z   62      MATA DA PRAIA            0             0   
3  2016-04-29T00:00:00Z    8  PONTAL DE CAMBURI            0             0   
4  2016-04-29T00:00:00Z   56    JARDIM DA PENHA            0             1   

   Diabetes  Alcoholism  Handcap  SMS_received No-show  
0         0           0        0             0      No  
1         0           0        0      

In [5]:
# Discard rows that exactly repeat an earlier row, keeping the first occurrence.

df = df_cleaned.drop_duplicates(keep="first")

In [6]:
# Normalise the text columns: gender codes are upper-cased (F/M) and
# neighbourhood names title-cased, both with surrounding whitespace removed.
gender_upper = df['Gender'].str.upper()
df['Gender'] = gender_upper.str.strip()

neighbourhood_titled = df['Neighbourhood'].str.title()
df['Neighbourhood'] = neighbourhood_titled.str.strip()

In [7]:
# Parse both date columns (unparseable values become NaT), then render them as
# day-month-year strings. The format keeps only the calendar date, so the
# time-of-day part of the original timestamps is not carried over.
for date_col in ('ScheduledDay', 'AppointmentDay'):
    parsed = pd.to_datetime(df[date_col], errors='coerce')
    df[date_col] = parsed.dt.strftime('%d-%m-%Y')

print(df.head())

      PatientId  AppointmentID Gender ScheduledDay AppointmentDay  Age  \
0  2.987250e+13        5642903      F   29-04-2016     29-04-2016   62   
1  5.589978e+14        5642503      M   29-04-2016     29-04-2016   56   
2  4.262962e+12        5642549      F   29-04-2016     29-04-2016   62   
3  8.679512e+11        5642828      F   29-04-2016     29-04-2016    8   
4  8.841186e+12        5642494      F   29-04-2016     29-04-2016   56   

       Neighbourhood  Scholarship  Hipertension  Diabetes  Alcoholism  \
0    Jardim Da Penha            0             1         0           0   
1    Jardim Da Penha            0             0         0           0   
2      Mata Da Praia            0             0         0           0   
3  Pontal De Camburi            0             0         0           0   
4    Jardim Da Penha            0             1         1           0   

   Handcap  SMS_received No-show  
0        0             0      No  
1        0             0      No  
2        0 

In [8]:
# Turn the day-month-year strings back into datetime values; entries that do
# not match the format are coerced to NaT.
for date_col in ('ScheduledDay', 'AppointmentDay'):
    df[date_col] = pd.to_datetime(df[date_col], format='%d-%m-%Y', errors='coerce')


In [9]:
# Normalise the column headers: trim surrounding whitespace, lower-case them,
# and replace spaces with underscores (other characters are left as they are).

trimmed = df.columns.str.strip()
lowered = trimmed.str.lower()
df.columns = lowered.str.replace(" ", "_")

In [10]:
# Check & fix data types

# Cast the age column to an integer dtype.
age_as_int = df['age'].astype(int)
df['age'] = age_as_int

# Show the dtype of every column after the conversions.
print(df.dtypes)

patientid                float64
appointmentid              int64
gender                    object
scheduledday      datetime64[ns]
appointmentday    datetime64[ns]
age                        int32
neighbourhood             object
scholarship                int64
hipertension               int64
diabetes                   int64
alcoholism                 int64
handcap                    int64
sms_received               int64
no-show                   object
dtype: object


In [11]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   patientid       110527 non-null  float64       
 1   appointmentid   110527 non-null  int64         
 2   gender          110527 non-null  object        
 3   scheduledday    110527 non-null  datetime64[ns]
 4   appointmentday  110527 non-null  datetime64[ns]
 5   age             110527 non-null  int32         
 6   neighbourhood   110527 non-null  object        
 7   scholarship     110527 non-null  int64         
 8   hipertension    110527 non-null  int64         
 9   diabetes        110527 non-null  int64         
 10  alcoholism      110527 non-null  int64         
 11  handcap         110527 non-null  int64         
 12  sms_received    110527 non-null  int64         
 13  no-show         110527 non-null  object        
dtypes: datetime64[ns](2), float64(1), in