In [28]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import unicodedata

In [29]:
# Load the appointment records from the CSV file and preview the first rows
df = pd.read_csv(filepath_or_buffer="Medical info.csv")
df.head(n=20)

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No
5,95985130000000.0,5626772,F,2016-04-27T08:36:51Z,2016-04-29T00:00:00Z,76,REPÚBLICA,0,1,0,0,0,0,No
6,733688200000000.0,5630279,F,2016-04-27T15:05:12Z,2016-04-29T00:00:00Z,23,GOIABEIRAS,0,0,0,0,0,0,Yes
7,3449833000000.0,5630575,F,2016-04-27T15:39:58Z,2016-04-29T00:00:00Z,39,GOIABEIRAS,0,0,0,0,0,0,Yes
8,56394730000000.0,5638447,F,2016-04-29T08:02:16Z,2016-04-29T00:00:00Z,21,ANDORINHAS,0,0,0,0,0,0,No
9,78124560000000.0,5629123,F,2016-04-27T12:48:25Z,2016-04-29T00:00:00Z,19,CONQUISTA,0,0,0,0,0,0,No


# Data Cleaning

In [30]:
# Count the missing values in every column
df.isna().sum()

PatientId         0
AppointmentID     0
Gender            0
ScheduledDay      0
AppointmentDay    0
Age               0
Neighbourhood     0
Scholarship       0
Hipertension      0
Diabetes          0
Alcoholism        0
Handcap           0
SMS_received      0
No-show           0
dtype: int64

In [31]:
# Remove duplicate rows.
# The de-duplicated frame is kept under its original name, and `df` is rebound
# to it as well: every later cell works on `df`, so without this rebinding the
# duplicates would never actually be removed from the data being cleaned.
df_no_duplicates = df.drop_duplicates()
# reset_index returns a new frame with a fresh 0..n-1 index, so `df` stays
# independent of `df_no_duplicates` and has no gaps where rows were dropped.
df = df_no_duplicates.reset_index(drop=True)

In [32]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PatientId       110527 non-null  float64
 1   AppointmentID   110527 non-null  int64  
 2   Gender          110527 non-null  object 
 3   ScheduledDay    110527 non-null  object 
 4   AppointmentDay  110527 non-null  object 
 5   Age             110527 non-null  int64  
 6   Neighbourhood   110527 non-null  object 
 7   Scholarship     110527 non-null  int64  
 8   Hipertension    110527 non-null  int64  
 9   Diabetes        110527 non-null  int64  
 10  Alcoholism      110527 non-null  int64  
 11  Handcap         110527 non-null  int64  
 12  SMS_received    110527 non-null  int64  
 13  No-show         110527 non-null  object 
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB


In [33]:
# Function to remove accents
def remove_accents(text):
    """Return ``text`` with accent (combining) marks stripped.

    The string is decomposed with Unicode NFKD normalisation, which splits an
    accented character into its base character followed by combining marks;
    the combining marks are then dropped and the rest is joined back together.

    Parameters
    ----------
    text : Any
        Value to clean. Only ``str`` instances are processed.

    Returns
    -------
    Any
        The accent-free string when ``text`` is a ``str``; otherwise ``text``
        is returned unchanged.
    """
    # Non-string values (e.g. NaN or numbers) pass through untouched
    if not isinstance(text, str):
        return text

    decomposed = unicodedata.normalize('NFKD', text)
    # combining() returns 0 for characters that are not combining marks
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return ''.join(base_chars)

# Normalise the 'Neighbourhood' column step by step:
# strip accents, trim surrounding whitespace, then title-case each name
neighbourhood_clean = df['Neighbourhood'].apply(remove_accents)
neighbourhood_clean = neighbourhood_clean.str.strip()
df['Neighbourhood'] = neighbourhood_clean.str.title()

# Preview the first ten distinct cleaned names
distinct_neighbourhoods = df['Neighbourhood'].unique()
print(distinct_neighbourhoods[:10])

['Jardim Da Penha' 'Mata Da Praia' 'Pontal De Camburi' 'Republica'
 'Goiabeiras' 'Andorinhas' 'Conquista' 'Nova Palestina' 'Da Penha'
 'Tabuazeiro']


In [34]:
# Parse both timestamp columns (ISO-8601 text in the CSV) into datetime values,
# then re-check the dtypes
for date_col in ('ScheduledDay', 'AppointmentDay'):
    df[date_col] = pd.to_datetime(df[date_col])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype              
---  ------          --------------   -----              
 0   PatientId       110527 non-null  float64            
 1   AppointmentID   110527 non-null  int64              
 2   Gender          110527 non-null  object             
 3   ScheduledDay    110527 non-null  datetime64[ns, UTC]
 4   AppointmentDay  110527 non-null  datetime64[ns, UTC]
 5   Age             110527 non-null  int64              
 6   Neighbourhood   110527 non-null  object             
 7   Scholarship     110527 non-null  int64              
 8   Hipertension    110527 non-null  int64              
 9   Diabetes        110527 non-null  int64              
 10  Alcoholism      110527 non-null  int64              
 11  Handcap         110527 non-null  int64              
 12  SMS_received    110527 non-null  int64              
 13  No-show       

In [35]:
# Re-encode both date columns as 'dd-mm-yyyy' text.
# NOTE(review): this replaces the datetime values with plain strings, so the
# time of day in 'ScheduledDay' is dropped and the columns no longer sort or
# subtract as dates. Confirm this is intended before doing any date arithmetic
# further down.
for date_col in ('ScheduledDay', 'AppointmentDay'):
    df[date_col] = df[date_col].dt.strftime('%d-%m-%Y')


In [36]:
# Preview the first ten rows after the cleaning steps above
df.head(10)

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,29-04-2016,29-04-2016,62,Jardim Da Penha,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,29-04-2016,29-04-2016,56,Jardim Da Penha,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,29-04-2016,29-04-2016,62,Mata Da Praia,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,29-04-2016,29-04-2016,8,Pontal De Camburi,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,29-04-2016,29-04-2016,56,Jardim Da Penha,0,1,1,0,0,0,No
5,95985130000000.0,5626772,F,27-04-2016,29-04-2016,76,Republica,0,1,0,0,0,0,No
6,733688200000000.0,5630279,F,27-04-2016,29-04-2016,23,Goiabeiras,0,0,0,0,0,0,Yes
7,3449833000000.0,5630575,F,27-04-2016,29-04-2016,39,Goiabeiras,0,0,0,0,0,0,Yes
8,56394730000000.0,5638447,F,29-04-2016,29-04-2016,21,Andorinhas,0,0,0,0,0,0,No
9,78124560000000.0,5629123,F,27-04-2016,29-04-2016,19,Conquista,0,0,0,0,0,0,No


In [37]:
# Normalise the column names: trim surrounding whitespace, lowercase, and
# replace spaces with underscores. Hyphens are left as they are, so
# 'No-show' becomes 'no-show'.
normalized_columns = df.columns.str.strip()
normalized_columns = normalized_columns.str.lower()
normalized_columns = normalized_columns.str.replace(' ', '_')
df.columns = normalized_columns

# Show the resulting column names
print(df.columns)

Index(['patientid', 'appointmentid', 'gender', 'scheduledday',
       'appointmentday', 'age', 'neighbourhood', 'scholarship', 'hipertension',
       'diabetes', 'alcoholism', 'handcap', 'sms_received', 'no-show'],
      dtype='object')
