In [140]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,OrdinalEncoder,StandardScaler
import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report,confusion_matrix,roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
import joblib
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [127]:
# Load the dataset
data_path = "../data/processed/cleaned_data.csv"
df = pd.read_csv(data_path)
df.head()


Unnamed: 0,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,waiting_days,appointment_weekday
0,F,2016-04-27 08:36:51+00:00,2016-04-29 00:00:00+00:00,76,REPÚBLICA,0,1,0,0,0,0,0,1,Friday
1,F,2016-04-27 15:05:12+00:00,2016-04-29 00:00:00+00:00,23,GOIABEIRAS,0,0,0,0,0,0,1,1,Friday
2,F,2016-04-27 15:39:58+00:00,2016-04-29 00:00:00+00:00,39,GOIABEIRAS,0,0,0,0,0,0,1,1,Friday
3,F,2016-04-27 12:48:25+00:00,2016-04-29 00:00:00+00:00,19,CONQUISTA,0,0,0,0,0,0,0,1,Friday
4,F,2016-04-27 14:58:11+00:00,2016-04-29 00:00:00+00:00,30,NOVA PALESTINA,0,0,0,0,0,0,0,1,Friday


In [128]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64800 entries, 0 to 64799
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Gender               64800 non-null  object
 1   ScheduledDay         64800 non-null  object
 2   AppointmentDay       64800 non-null  object
 3   Age                  64800 non-null  int64 
 4   Neighbourhood        64800 non-null  object
 5   Scholarship          64800 non-null  int64 
 6   Hipertension         64800 non-null  int64 
 7   Diabetes             64800 non-null  int64 
 8   Alcoholism           64800 non-null  int64 
 9   Handcap              64800 non-null  int64 
 10  SMS_received         64800 non-null  int64 
 11  No-show              64800 non-null  int64 
 12  waiting_days         64800 non-null  int64 
 13  appointment_weekday  64800 non-null  object
dtypes: int64(9), object(5)
memory usage: 6.9+ MB


In [129]:
for col in ['ScheduledDay','AppointmentDay']:
    df[col] = pd.to_datetime(df[col])

In [130]:
# Separate features and target
X = df.drop('No-show',axis=1)
y = df['No-show']

In [131]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64800 entries, 0 to 64799
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype              
---  ------               --------------  -----              
 0   Gender               64800 non-null  object             
 1   ScheduledDay         64800 non-null  datetime64[ns, UTC]
 2   AppointmentDay       64800 non-null  datetime64[ns, UTC]
 3   Age                  64800 non-null  int64              
 4   Neighbourhood        64800 non-null  object             
 5   Scholarship          64800 non-null  int64              
 6   Hipertension         64800 non-null  int64              
 7   Diabetes             64800 non-null  int64              
 8   Alcoholism           64800 non-null  int64              
 9   Handcap              64800 non-null  int64              
 10  SMS_received         64800 non-null  int64              
 11  waiting_days         64800 non-null  int64              
 12  appointment_weekda

In [132]:
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [133]:
# Column groups
categorical_binary = ['Gender']               # Label / Ordinal
categorical_onehot = ['appointment_weekday']  # One-Hot
categorical_target = ['Neighbourhood']        # Target Encoding
numeric_features = [
    'Age', 'Scholarship', 'Hipertension', 'Diabetes',
    'Alcoholism', 'Handcap', 'SMS_received', 'waiting_days'
]

# week-day order
weekday_order = [['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']]

# Encoders
gender_encoder = OrdinalEncoder()  # F→0, M→1
weekday_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
target_encoder = ce.TargetEncoder(cols=['Neighbourhood'])
scaler = StandardScaler()

# Build the Transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('gender', gender_encoder, categorical_binary),
        ('weekday', weekday_encoder, categorical_onehot),
        ('neighbourhood', target_encoder, categorical_target),
        ('numeric', scaler, numeric_features)
    ]
)


In [134]:
# fit and transform on training data
X_train_transformed = preprocessor.fit_transform(X_train, y_train)
X_test_transformed = preprocessor.transform(X_test)

feature_names = preprocessor.get_feature_names_out()


In [135]:
X_train_encoded = pd.DataFrame(
    X_train_transformed,
    columns=feature_names,
    index=X_train.index
)

In [136]:
X_train_encoded

Unnamed: 0,gender__Gender,weekday__appointment_weekday_Friday,weekday__appointment_weekday_Monday,weekday__appointment_weekday_Saturday,weekday__appointment_weekday_Thursday,weekday__appointment_weekday_Tuesday,weekday__appointment_weekday_Wednesday,neighbourhood__Neighbourhood,numeric__Age,numeric__Scholarship,numeric__Hipertension,numeric__Diabetes,numeric__Alcoholism,numeric__Handcap,numeric__SMS_received,numeric__waiting_days
59701,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.267845,-0.158472,-0.325852,-0.518474,-0.284747,-0.162019,-0.129946,0.934364,-0.765879
9130,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.339623,-0.969456,-0.325852,-0.518474,-0.284747,-0.162019,-0.129946,-1.070247,-0.524667
15468,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.404000,-1.600221,-0.325852,-0.518474,-0.284747,-0.162019,-0.129946,0.934364,0.440180
38313,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.303030,-0.563964,3.068877,-0.518474,-0.284747,-0.162019,-0.129946,0.934364,-0.223152
55284,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.304452,-1.239784,-0.325852,-0.518474,-0.284747,-0.162019,-0.129946,0.934364,-0.102546
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40689,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.260834,-0.879346,-0.325852,-0.518474,-0.284747,-0.162019,-0.129946,0.934364,1.947754
15252,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.316539,-1.059565,-0.325852,-0.518474,-0.284747,-0.162019,-0.129946,0.934364,1.103513
10923,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.213699,0.427239,-0.325852,-0.518474,-0.284747,-0.162019,-0.129946,-1.070247,-0.886485
62404,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.296185,0.697567,-0.325852,-0.518474,-0.284747,-0.162019,-0.129946,0.934364,1.103513


In [137]:
X_test_encoded = pd.DataFrame(
    X_test_transformed,
    columns=feature_names,
    index=X_test.index
)

In [138]:
X_test_encoded

Unnamed: 0,gender__Gender,weekday__appointment_weekday_Friday,weekday__appointment_weekday_Monday,weekday__appointment_weekday_Saturday,weekday__appointment_weekday_Thursday,weekday__appointment_weekday_Tuesday,weekday__appointment_weekday_Wednesday,neighbourhood__Neighbourhood,numeric__Age,numeric__Scholarship,numeric__Hipertension,numeric__Diabetes,numeric__Alcoholism,numeric__Handcap,numeric__SMS_received,numeric__waiting_days
49421,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.251852,0.337130,3.068877,1.928736,-0.284747,-0.162019,-0.129946,0.934364,-0.464364
52547,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.282353,0.472294,-0.325852,-0.518474,-0.284747,-0.162019,-0.129946,0.934364,-0.705576
21546,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.267845,-0.428800,-0.325852,-0.518474,-0.284747,-0.162019,-0.129946,-1.070247,0.741695
27087,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.390244,-1.645276,-0.325852,-0.518474,-0.284747,-0.162019,-0.129946,0.934364,-0.705576
9094,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.304452,-0.789237,-0.325852,-0.518474,-0.284747,-0.162019,-0.129946,0.934364,-0.705576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12733,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.364472,-0.293636,3.068877,-0.518474,-0.284747,-0.162019,-0.129946,0.934364,-0.102546
50082,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.260834,-0.203526,-0.325852,-0.518474,-0.284747,-0.162019,-0.129946,0.934364,-0.584970
34247,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.251852,0.922840,-0.325852,1.928736,-0.284747,-0.162019,-0.129946,-1.070247,-0.826182
27893,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.287025,0.922840,-0.325852,1.928736,-0.284747,-0.162019,-0.129946,-1.070247,-0.645273


In [152]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.02,
        scale_pos_weight=(y_train.value_counts()[0] / y_train.value_counts()[1]),  # handle imbalance
        random_state=42,
        eval_metric='auc'
    ))
])


In [153]:
model.fit(X_train, y_train)
print("Training Accuracy:", model.score(X_train, y_train))
print("Test Accuracy:", model.score(X_test, y_test))


Training Accuracy: 0.6074266975308642
Test Accuracy: 0.5908179012345679


In [143]:
y.value_counts(normalize=True)


No-show
0    0.708966
1    0.291034
Name: proportion, dtype: float64

In [None]:


feature_names = model.named_steps['preprocessor'].get_feature_names_out()
importances = model.named_steps['classifier'].feature_importances_

feat_imp = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feat_imp.sort_values(by='Importance', ascending=False).head(10)


Unnamed: 0,Feature,Importance
8,numeric__Age,0.3467
7,neighbourhood__Neighbourhood,0.283522
15,numeric__waiting_days,0.227947
0,gender__Gender,0.027563
14,numeric__SMS_received,0.015896
10,numeric__Hipertension,0.011303
9,numeric__Scholarship,0.011294
5,weekday__appointment_weekday_Tuesday,0.010882
2,weekday__appointment_weekday_Monday,0.010845
1,weekday__appointment_weekday_Friday,0.010612


In [145]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print("Cross-Validation Accuracy: %.3f ± %.3f" % (scores.mean(), scores.std()))


Cross-Validation Accuracy: 0.664 ± 0.006
