In [9]:
import pandas as pd
import numpy as np
import os

In [10]:
%pwd

'd:\\SICSR\\project\\MediFlow-Predict\\notebooks'

In [11]:
os.chdir("../")

In [12]:
%pwd

'd:\\SICSR\\project\\MediFlow-Predict'

In [15]:
import os
os.listdir("data/raw")


['.gitkeep', 'NoShow.csv']

In [27]:
# Force PatientId to be read as string to preserve exact values
df = pd.read_csv("data/raw/NoShow.csv", dtype={'PatientId': str})


In [28]:

# Clean and rename columns
df.columns = [
    'patient_id', 'appointment_id', 'gender', 'scheduled_day', 'appointment_day',
    'age', 'neighbourhood', 'scholarship', 'hypertension', 'diabetes',
    'alcoholism', 'handicap', 'sms_received', 'no_show'
]

# Convert dates to datetime
df['scheduled_day'] = pd.to_datetime(df['scheduled_day'])
df['appointment_day'] = pd.to_datetime(df['appointment_day'])

# Filter out invalid ages
df = df[df['age'] >= 0]

# Fix categorical fields
df['gender'] = df['gender'].str.upper().str.strip()

# Convert 'no_show' to binary (1 = missed, 0 = showed up)
df['no_show'] = df['no_show'].map({'Yes': 1, 'No': 0})

# Reset index after filtering
df.reset_index(drop=True, inplace=True)

# Preview cleaned data
df.head()


Unnamed: 0,patient_id,appointment_id,gender,scheduled_day,appointment_day,age,neighbourhood,scholarship,hypertension,diabetes,alcoholism,handicap,sms_received,no_show
0,29872499824296,5642903,F,2016-04-29 18:38:08+00:00,2016-04-29 00:00:00+00:00,62,JARDIM DA PENHA,0,1,0,0,0,0,0
1,558997776694438,5642503,M,2016-04-29 16:08:27+00:00,2016-04-29 00:00:00+00:00,56,JARDIM DA PENHA,0,0,0,0,0,0,0
2,4262962299951,5642549,F,2016-04-29 16:19:04+00:00,2016-04-29 00:00:00+00:00,62,MATA DA PRAIA,0,0,0,0,0,0,0
3,867951213174,5642828,F,2016-04-29 17:29:31+00:00,2016-04-29 00:00:00+00:00,8,PONTAL DE CAMBURI,0,0,0,0,0,0,0
4,8841186448183,5642494,F,2016-04-29 16:07:23+00:00,2016-04-29 00:00:00+00:00,56,JARDIM DA PENHA,0,1,1,0,0,0,0


In [29]:
# 1. Simulate Appointment Mode
df['appointment_mode'] = np.random.choice(['video', 'audio', 'physical'], size=len(df), p=[0.4, 0.4, 0.2])

# 2. Simulate Reminder Sent (based loosely on sms_received or fresh random)
df['reminder_sent'] = np.random.choice([1, 0], size=len(df), p=[0.7, 0.3])

# 3. Calculate Lead Time in Days
df['lead_time_days'] = (df['appointment_day'] - df['scheduled_day']).dt.days
# Optional: handle negatives if any
df = df[df['lead_time_days'] >= 0]

# 4. Simulate Reported Issues
df['reported_issue'] = np.random.choice(
    ['Flu', 'Diabetes', 'Hypertension', 'Fever', 'Cold', 'Injury', 'Checkup'],
    size=len(df)
)

# Preview final dataset
df[['appointment_mode', 'reminder_sent', 'lead_time_days', 'reported_issue']].head()


Unnamed: 0,appointment_mode,reminder_sent,lead_time_days,reported_issue
5,physical,0,1,Flu
6,video,0,1,Cold
7,audio,1,1,Injury
9,video,1,1,Diabetes
10,video,1,1,Checkup


In [30]:
# 1. Standardize date formats (remove time & timezone)
df['scheduled_day'] = pd.to_datetime(df['scheduled_day']).dt.date
df['appointment_day'] = pd.to_datetime(df['appointment_day']).dt.date

# 2. Format Patient ID: remove scientific notation, convert to string
df['patient_id'] = df['patient_id'].apply(lambda x: str(int(float(x))))


# 3. Format gender to capital letters (F/M)
df['gender'] = df['gender'].str.upper()

# 4. Reset column order for clarity
ordered_columns = [
    'patient_id', 'appointment_id', 'gender', 'age', 'neighbourhood',
    'scheduled_day', 'appointment_day', 'lead_time_days',
    'appointment_mode', 'reminder_sent', 'sms_received',
    'scholarship', 'hypertension', 'diabetes', 'alcoholism', 'handicap',
    'reported_issue', 'no_show'
]
df = df[ordered_columns]

# 5. Final check
df.head()


Unnamed: 0,patient_id,appointment_id,gender,age,neighbourhood,scheduled_day,appointment_day,lead_time_days,appointment_mode,reminder_sent,sms_received,scholarship,hypertension,diabetes,alcoholism,handicap,reported_issue,no_show
5,95985133231274,5626772,F,76,REPÚBLICA,2016-04-27,2016-04-29,1,physical,0,0,0,1,0,0,0,Flu,0
6,733688164476661,5630279,F,23,GOIABEIRAS,2016-04-27,2016-04-29,1,video,0,0,0,0,0,0,0,Cold,1
7,3449833394123,5630575,F,39,GOIABEIRAS,2016-04-27,2016-04-29,1,audio,1,0,0,0,0,0,0,Injury,1
9,78124564369297,5629123,F,19,CONQUISTA,2016-04-27,2016-04-29,1,video,1,0,0,0,0,0,0,Diabetes,0
10,734536231958495,5630213,F,30,NOVA PALESTINA,2016-04-27,2016-04-29,1,video,1,0,0,0,0,0,0,Checkup,0


In [32]:
# Save processed dataset
df.to_csv("data/processed/NoShow.csv", index=False)
print("Processed dataset saved")

Processed dataset saved
