In [69]:
import tarfile 
import pandas as pd 
import os 
import numpy as np 
import lzma
from sklearn import preprocessing, pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier 
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression

In [50]:
results_dir = "../results"

In [51]:
filepath = "data/appointments.tar.xz"
output_dir = "data/extracted_data/appointments"
os.makedirs(output_dir, exist_ok=True)

with lzma.open(filepath, "rb") as xz_file:
    with tarfile.open(fileobj=xz_file, mode="r") as tar:
        for member in tar.getmembers():
            if member.name.endswith(".txt"):
                extracted_file = tar.extractfile(member)
                if extracted_file is not None:
                    output_path = os.path.join(output_dir, os.path.basename(member.name))
                    with open(output_path, "wb") as output_file:
                        output_file.write(extracted_file.read())
                    print(f"File extracted and saved to: {output_path}")

appointments_path = "data/extracted_data/appointments/appointments.txt"
participants_path = "data/extracted_data/appointments/participants.txt"
appointments_df = pd.read_csv(appointments_path, sep=r'\s+')
participants_df = pd.read_csv(participants_path, sep=r'\s+')

File extracted and saved to: data/extracted_data/appointments\description.txt
File extracted and saved to: data/extracted_data/appointments\participants.txt
File extracted and saved to: data/extracted_data/appointments\appointments.txt


In [52]:
appointments_df.head()#

Unnamed: 0,participant,sms_received,advance,day,month,weekday,status
0,2987249982,NO,0,29,4,Fri,fullfilled
1,5589977766,NO,0,29,4,Fri,fullfilled
2,4262962299,NO,0,29,4,Fri,fullfilled
3,8679512131,NO,0,29,4,Fri,fullfilled
4,8841186448,NO,0,29,4,Fri,fullfilled


In [53]:
appointments_df.isna().sum()

participant     0
sms_received    0
advance         0
day             0
month           0
weekday         0
status          0
dtype: int64

In [54]:
#merge code
combined_df = pd.merge(appointments_df, participants_df, on="participant", how="inner")
combined_df = combined_df[combined_df["count"] >= 5]

In [55]:
combined_df["status"]

21           no-show
27        fullfilled
37        fullfilled
47        fullfilled
52           no-show
             ...    
110498    fullfilled
110500    fullfilled
110502    fullfilled
110503    fullfilled
110505    fullfilled
Name: status, Length: 18838, dtype: object

In [56]:
# combined_df["sex"] = combined_df["sex"].replace({"M": 1, "F":0})
# combined_df["sms_received"] = combined_df["sms_received"].replace({"YES":1, "NO":0})
# combined_df["status"] = combined_df["status"].replace({"fulfilled":1, "no-show":0})

In [57]:
#dynamically handlde categorical values and encoding them using sk-learn 
#nominal data scaled using standard scaler
# categorical_columns = ["sex", "sms_received", "status","hipertension","diabetes","alcoholism","weekday"]
# nominal_columns =  ["age", "advance", "day", "month", "count" ]
# combined_df[categorical_columns] = combined_df[categorical_columns].apply(preprocessing.LabelEncoder().fit_transform)
# for col in nominal_columns: 
#     combined_df[col] = preprocessing.StandardScaler().fit_transform(combined_df[[col]])
# combined_df.head()

In [58]:
combined_df["status"] = combined_df["status"].replace({"fullfilled":0, "no-show":1})
categorical_features = ['sms_received', 'weekday', 'sex', 'hipertension', 'diabetes', 'alcoholism']
numerical_features = ['advance', 'day', 'month', 'age', 'count']

categorical_transformer = preprocessing.OneHotEncoder(handle_unknown='ignore')
numerical_transformer = preprocessing.StandardScaler() 

# 4. Create the ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features)
    ])

preprocessing_pipeline = pipeline.Pipeline(steps=[
    ('preprocessor', preprocessor)
])
df_transformed_array = preprocessing_pipeline.fit_transform(combined_df)
feature_names = (
    preprocessing_pipeline.named_steps['preprocessor']
    .named_transformers_['cat']
    .get_feature_names_out(categorical_features)
)
all_feature_names = list(feature_names) + numerical_features 
df_transformed_scaled = pd.DataFrame(df_transformed_array, columns=all_feature_names)
df_transformed_scaled["status"] = combined_df["status"].reset_index(drop=True)


  combined_df["status"] = combined_df["status"].replace({"fullfilled":0, "no-show":1})


In [59]:
rng = np.random.RandomState(31)
cv = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=rng)  

In [60]:
y = df_transformed_scaled["status"]
x = df_transformed_scaled.drop(columns=["status"])

In [61]:
#iniitalizing cv and rng 
#for an 80-20 distribution 
rf = RandomForestClassifier(1000, max_depth=4, random_state=rng, criterion="gini", max_features=4)

#fitting the feeatures and target variable to the model, selected scoring criteria and implemeneting CV, this automatically does the test trian split 
scores_rf = model_selection.cross_validate(rf, x, y, scoring = ["f1", "accuracy"], cv=cv)

mean_accuracy = scores_rf["test_accuracy"].mean()
mean_f1_macro = scores_rf["test_f1"].mean()

print(f"Mean Accuracy (5-fold CV): {mean_accuracy:.4f}")
print(f"Mean F1 Score (5-fold CV): {mean_f1_macro:.4f}")

Mean Accuracy (5-fold CV): 0.7932
Mean F1 Score (5-fold CV): 0.0000


In [62]:
# rf = RandomForestClassifier(random_state=rng) 
# param_grid_rf = {
#     'n_estimators': [100, 200],        
#     'max_depth': [3, 4, 5],              
#     'criterion': ["gini", "entropy"],      
#     'max_features': [4, 6]       
# }

# scoring_metrics = {
#     'F1_Score': 'f1',
#     'Accuracy': 'accuracy'
# }
# refit_metric = "F1_Score"

# grid_search_rf = model_selection.GridSearchCV(
#     estimator=rf,
#     param_grid=param_grid_rf,
#     scoring=scoring_metrics,
#     cv=cv,
#     refit=refit_metric, 
#     n_jobs=-1
# )

# print(f"Starting Grid Search for Random Forest. Testing {len(param_grid_rf['n_estimators']) * len(param_grid_rf['max_depth']) * len(param_grid_rf['criterion']) * len(param_grid_rf['max_features'])} combinations...")


# grid_search_rf.fit(x, y) 

# print(f"\n--- Random Forest Grid Search Results ---")

# best_f1_score_rf = grid_search_rf.best_score_
# print(f"Best Model Selected using {refit_metric}: {grid_search_rf.best_params_}")
# print(f"Best {refit_metric} achieved (5-fold CV mean): {best_f1_score_rf:.4f}")


# results_df = pd.DataFrame(grid_search_rf.cv_results_)
# best_index = grid_search_rf.best_index_

# mean_accuracy_best = results_df.loc[best_index, 'mean_test_Accuracy']
# mean_f1_best = results_df.loc[best_index, 'mean_test_F1_Score']

# print(f"Mean Accuracy of the Best RF Model (5-fold CV mean): {mean_accuracy_best:.4f}")
# print(f"Mean F1 Score of the Best RF Model (5-fold CV mean): {mean_f1_best:.4f}")

In [63]:
gb = GradientBoostingClassifier(n_estimators=200, learning_rate=1.0, max_depth=4, random_state=rng)
scores_gb = model_selection.cross_validate(gb, x, y, scoring = ["f1", "accuracy"], cv=cv)

mean_accuracy_gb = scores_gb["test_accuracy"].mean()
mean_f1_macro_gb = scores_gb["test_f1"].mean()
print(f"Mean Accuracy (10-fold CV) - GB: {mean_accuracy_gb:.4f}")
print(f"Mean F1 Score (10-fold CV) - GB: {mean_f1_macro_gb:.4f}")

Mean Accuracy (10-fold CV) - GB: 0.7876
Mean F1 Score (10-fold CV) - GB: 0.3738


In [64]:
# param_grid_gb = {
#     'n_estimators': [100, 200],         
#     'learning_rate': [0.1, 0.5, 1.0],      
#     'max_depth': [4],                 
# }


# gb = GradientBoostingClassifier(random_state=rng)

# grid_search_gb = model_selection.GridSearchCV(
#     estimator=gb,
#     param_grid=param_grid_gb,
#     scoring=['f1',"accuracy"], 
#     cv=cv,
#     n_jobs=-1 
# )


# grid_search_gb.fit(x, y)


# best_f1_score_gb = grid_search_gb.best_score_
# print(f"Best F1 Score found by Grid Search - GB: {best_f1_score_gb:.4f}")
# best_params_gb = grid_search_gb.best_params_
# print(f"Best parameters - GB: {best_params_gb}")

In [65]:
knn = KNeighborsClassifier(n_neighbors=5) 

# Perform cross-validation
scores_knn = model_selection.cross_validate(knn, x, y, scoring = ["f1", "accuracy"], cv=cv)

# Calculate and print the mean scores
mean_accuracy_knn = scores_knn["test_accuracy"].mean()
mean_f1_macro_knn = scores_knn["test_f1"].mean()

print(f"Mean Accuracy (10-fold CV) - KNN: {mean_accuracy_knn:.4f}")
print(f"Mean F1 Score (10-fold CV) - KNN: {mean_f1_macro_knn:.4f}")

Mean Accuracy (10-fold CV) - KNN: 0.7734
Mean F1 Score (10-fold CV) - KNN: 0.2782


In [None]:
lr = LogisticRegression(solver='liblinear', random_state=rng, max_iter=1000, C=1.0) 

# Perform cross-validation
scores_lr = model_selection.cross_validate(lr, x, y, scoring = ["f1", "accuracy"], cv=cv)

# Calculate and print the mean scores
mean_accuracy_lr = scores_lr["test_accuracy"].mean()
mean_f1_macro_lr = scores_lr["test_f1"].mean()

print(f"Mean Accuracy (10-fold CV) - LR: {mean_accuracy_lr:.4f}")
print(f"Mean F1 Score (10-fold CV) - LR: {mean_f1_macro_lr:.4f}")

Mean Accuracy (10-fold CV) - LR: 0.7891
Mean F1 Score (10-fold CV) - LR: 0.0385


In [None]:
from sklearn.ensemble import StackingClassifier

estimators = list(zip("rf_gb_lr", [rf, gb, lr]))
s = StackingClassifier(estimators, final_estimator=GradientBoostingClassifier(random_state=rng, n_estimators=100, learning_rate=0.5, max_depth=4))
scores = model_selection.cross_val_score(s, x, y, scoring="f1", cv=cv).mean()

In [80]:
scores

np.float64(0.14654588616542488)

Tried using hard voting, results were lower compared to soft

In [81]:
from sklearn.ensemble import VotingClassifier
v = VotingClassifier(estimators, voting="soft")
v_scores = model_selection.cross_val_score(v, x, y, scoring="f1", cv = cv).mean()
v_scores

np.float64(0.08881125210832329)

21        False
27        False
37         True
47        False
52        False
          ...  
110498     True
110500    False
110502    False
110503    False
110505     True
Name: count, Length: 18838, dtype: bool

In [93]:
from sklearn.compose import ColumnTransformer
categorical_features = ['sms_received', 'weekday', 'sex', 'hipertension', 'diabetes', 'alcoholism']
numerical_features = ['advance', 'day', 'month', 'age', 'count']

categorical_transformer = preprocessing.OneHotEncoder(handle_unknown='ignore')
numerical_transformer = preprocessing.StandardScaler() 

# 4. Create the ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features)
    ])

preprocessing_pipeline = pipeline.Pipeline(steps=[
    ('preprocessor', preprocessor)
    ])
rf = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=rng, criterion="gini")
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.5, max_depth=4, random_state=rng)

# ---------------------------------------------------------
# STRATEGY 1: Check Correlation first
# ---------------------------------------------------------
print("--- Step 1: Checking Model Correlation ---")

# We need pipelines to run cross_val_predict on raw X
pipe_rf = pipeline.Pipeline([('prep', preprocessor), ('clf', rf)])
pipe_gb = pipeline.Pipeline([('prep', preprocessor), ('clf', gb)])

# Get out-of-fold probability predictions
print("Generating predictions to check correlation (this takes a moment)...")
pred_rf = model_selection.cross_val_predict(pipe_rf, x, y, cv=5, method='predict_proba')[:, 1]
pred_gb = model_selection.cross_val_predict(pipe_gb, x, y, cv=5, method='predict_proba')[:, 1]

correlation = np.corrcoef(pred_rf, pred_gb)[0, 1]
print(f"Correlation between RF and GB predictions: {correlation:.4f}")
if correlation > 0.90:
        print(">> WARNING: High correlation. Ensembling might have diminishing returns.")
else:
    print(">> Good diversity. Ensembling should help.")

--- Step 1: Checking Model Correlation ---
Generating predictions to check correlation (this takes a moment)...


ValueError: A given column is not a column of the dataframe