Required Packages for Analysis

In [3]:
# Importing relevant packages
import os # For reading the optional dataset-path override from the environment

import numpy as np # 
import pandas as pd # For dataframes
from IPython.display import display # For cleaner printing of data
from sklearn.ensemble import GradientBoostingClassifier # For Gradient Boost
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix # For metrics
from sklearn.model_selection import GridSearchCV # For hyperparameter tuning
from sklearn.model_selection import train_test_split # For splitting the dataset


Part 1: Data Preparation

In [5]:
# Importing Dataset
# The CSV location can be overridden with the MEDICAL_DATA_PATH environment
# variable so the notebook is not tied to a single machine; when the variable
# is not set, the original local path is used unchanged.
file_location = os.environ.get(
    "MEDICAL_DATA_PATH",
    "C:/Users/bconn/OneDrive/Documents/WGUCoursework/Data/medical_clean.csv",
)
data = pd.read_csv(file_location)  # Raw frame exactly as read from disk
df = data.copy()  # Working copy; the cells below modify `df`, never `data`

In [6]:
# Initial profiling of data
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; passing it to display() would add a stray "None" to the output.
df.info()
print ('\n')
# Summary statistics for the numeric columns
display(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 50 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CaseOrder           10000 non-null  int64  
 1   Customer_id         10000 non-null  object 
 2   Interaction         10000 non-null  object 
 3   UID                 10000 non-null  object 
 4   City                10000 non-null  object 
 5   State               10000 non-null  object 
 6   County              10000 non-null  object 
 7   Zip                 10000 non-null  int64  
 8   Lat                 10000 non-null  float64
 9   Lng                 10000 non-null  float64
 10  Population          10000 non-null  int64  
 11  Area                10000 non-null  object 
 12  TimeZone            10000 non-null  object 
 13  Job                 10000 non-null  object 
 14  Children            10000 non-null  int64  
 15  Age                 10000 non-null  int64  
 16  Incom

None





Unnamed: 0,CaseOrder,Zip,Lat,Lng,Population,Children,Age,Income,VitD_levels,Doc_visits,...,TotalCharge,Additional_charges,Item1,Item2,Item3,Item4,Item5,Item6,Item7,Item8
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,50159.3239,38.751099,-91.24308,9965.2538,2.0972,53.5117,40490.49516,17.964262,5.0122,...,5312.172769,12934.528587,3.5188,3.5067,3.5111,3.5151,3.4969,3.5225,3.494,3.5097
std,2886.89568,27469.588208,5.403085,15.205998,14824.758614,2.163659,20.638538,28521.153293,2.017231,1.045734,...,2180.393838,6542.601544,1.031966,1.034825,1.032755,1.036282,1.030192,1.032376,1.021405,1.042312
min,1.0,610.0,17.96719,-174.2097,0.0,0.0,18.0,154.08,9.806483,1.0,...,1938.312067,3125.703,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,2500.75,27592.0,35.25512,-97.352982,694.75,0.0,36.0,19598.775,16.626439,4.0,...,3179.374015,7986.487755,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
50%,5000.5,50207.0,39.419355,-88.39723,2769.0,1.0,53.0,33768.42,17.951122,5.0,...,5213.952,11573.977735,4.0,3.0,4.0,4.0,3.0,4.0,3.0,3.0
75%,7500.25,72411.75,42.044175,-80.43805,13945.0,3.0,71.0,54296.4025,19.347963,6.0,...,7459.69975,15626.49,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
max,10000.0,99929.0,70.56099,-65.29017,122814.0,10.0,89.0,207249.1,26.394449,9.0,...,9180.728,30566.07,8.0,7.0,8.0,7.0,7.0,7.0,7.0,7.0


In [7]:
# Keeping only columns related to health conditions, personal decisions, or
# treatment received; everything named below is discarded.
columns_to_remove = [
    'CaseOrder', 'Zip', 'Customer_id', 'Interaction', 'UID', 'City', 'State',
    'County', 'Lat', 'Lng', 'Population', 'Area', 'TimeZone', 'Job',
    'Children', 'Age', 'Income', 'Marital', 'Gender', 'Initial_days',
    'TotalCharge', 'Additional_charges', 'Item1', 'Item2', 'Item3', 'Item4',
    'Item5', 'Item6', 'Item7', 'Item8',
]
df = df.drop(columns=columns_to_remove)

In [8]:
# Normalising header names: lower-case every name and swap spaces for underscores
df.columns = [header.lower().replace(" ", "_") for header in df.columns]

In [9]:
# Counting missing entries in each column
null_counts = df.isna().sum()
print(null_counts)

readmis               0
vitd_levels           0
doc_visits            0
full_meals_eaten      0
vitd_supp             0
soft_drink            0
initial_admin         0
highblood             0
stroke                0
complication_risk     0
overweight            0
arthritis             0
diabetes              0
hyperlipidemia        0
backpain              0
anxiety               0
allergic_rhinitis     0
reflux_esophagitis    0
asthma                0
services              0
dtype: int64


In [10]:
# Profiling Data before encoding values
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; passing it to display() would add a stray "None" to the output.
df.info()
print ('\n')
# Count / unique / top / freq for every text (object) column
display(df.describe(include=['object']))
print ('\n')
# Listing the distinct labels of the three multi-level categorical columns
print("Reasons Initially Admitted: ", df['initial_admin'].unique())
print("Level of Complication Risks: ", df['complication_risk'].unique())
print("Primary Services Received while Admitted: ", df['services'].unique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   readmis             10000 non-null  object 
 1   vitd_levels         10000 non-null  float64
 2   doc_visits          10000 non-null  int64  
 3   full_meals_eaten    10000 non-null  int64  
 4   vitd_supp           10000 non-null  int64  
 5   soft_drink          10000 non-null  object 
 6   initial_admin       10000 non-null  object 
 7   highblood           10000 non-null  object 
 8   stroke              10000 non-null  object 
 9   complication_risk   10000 non-null  object 
 10  overweight          10000 non-null  object 
 11  arthritis           10000 non-null  object 
 12  diabetes            10000 non-null  object 
 13  hyperlipidemia      10000 non-null  object 
 14  backpain            10000 non-null  object 
 15  anxiety             10000 non-null  object 
 16  aller

None





Unnamed: 0,readmis,soft_drink,initial_admin,highblood,stroke,complication_risk,overweight,arthritis,diabetes,hyperlipidemia,backpain,anxiety,allergic_rhinitis,reflux_esophagitis,asthma,services
count,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000
unique,2,2,3,2,2,3,2,2,2,2,2,2,2,2,2,4
top,No,No,Emergency Admission,No,No,Medium,Yes,No,No,No,No,No,No,No,No,Blood Work
freq,6331,7425,5060,5910,8007,4517,7094,6426,7262,6628,5886,6785,6059,5865,7107,5265




Reasons Initially Admitted:  ['Emergency Admission' 'Elective Admission' 'Observation Admission']
Level of Complication Risks:  ['Medium' 'High' 'Low']
Primary Services Received while Admitted:  ['Blood Work' 'Intravenous' 'CT Scan' 'MRI']


In [11]:
# Encoding data
#
# Every categorical column is converted to numbers: Yes/No columns become 0/1,
# complication_risk becomes an ordered 0/1/2 scale, and initial_admin/services
# are one-hot encoded with their first level dropped.
#
# Series.map() silently turns any label that is missing from its mapping into
# NaN. That also happens when this cell is run a second time on data that is
# already encoded. To avoid corrupting `df`, every column is checked against
# its mapping first, and nothing is overwritten unless all columns pass.

# Changing binary columns to numerical values
cols_to_binary = ['readmis', 'soft_drink', 'highblood', 'stroke', 'overweight', 'arthritis', 
'diabetes', 'hyperlipidemia', 'backpain', 'anxiety', 'allergic_rhinitis', 'reflux_esophagitis', 'asthma']
binary_mapping = {'No': 0, 'Yes': 1}

# Ordinal Encoding (Complication Risks)
complication_risk_mapping = {'Low' : 0, 'Medium': 1, 'High': 2}

# One lookup table per column that is encoded with Series.map()
column_mappings = {binary_col: binary_mapping for binary_col in cols_to_binary}
column_mappings['complication_risk'] = complication_risk_mapping

# Pass 1: validate every column before touching the frame
for col, mapping in column_mappings.items():
    unexpected = df.loc[~df[col].isin(list(mapping)), col].unique()
    if len(unexpected) > 0:
        raise ValueError(
            f"Column '{col}' holds values outside {list(mapping)}: {list(unexpected)}. "
            "If the data is already encoded, reload it before re-running this cell."
        )

# Pass 2: apply the mappings now that every label is known to be covered
for col, mapping in column_mappings.items():
    df[col] = df[col].map(mapping)

# One-Hot Encoding (Initial Admission, Primary Services)
df = pd.get_dummies(df, columns=["initial_admin", "services"], drop_first=True)

## Converting new boolean columns to 0 and 1
boolean_cols = ['initial_admin_Emergency Admission', 'initial_admin_Observation Admission',
             'services_CT Scan', 'services_Intravenous', 'services_MRI']
df[boolean_cols] = df[boolean_cols].astype(int)

In [12]:
# Listing the column headers of the encoded frame
encoded_headers = df.columns
print(encoded_headers)

Index(['readmis', 'vitd_levels', 'doc_visits', 'full_meals_eaten', 'vitd_supp',
       'soft_drink', 'highblood', 'stroke', 'complication_risk', 'overweight',
       'arthritis', 'diabetes', 'hyperlipidemia', 'backpain', 'anxiety',
       'allergic_rhinitis', 'reflux_esophagitis', 'asthma',
       'initial_admin_Emergency Admission',
       'initial_admin_Observation Admission', 'services_CT Scan',
       'services_Intravenous', 'services_MRI'],
      dtype='object')


In [13]:
# Sanity checks on the encoded frame

## Binary Columns: each one should now contain only 0 and 1
for col in cols_to_binary:
    distinct_values = df[col].unique()
    print(f"Unique values in {col}: {distinct_values}")

## One-hot Encoding: the indicator columns should appear among the headers
print("\n")
current_headers = df.columns
display(current_headers)

## Ordinal Encoding: complication_risk should contain only 0, 1 and 2
print("\n")
risk_levels = df['complication_risk'].unique()
display(risk_levels)

## Confirming no NaN Values: grand total of missing cells across the frame
print("\nNumber of NaN Values:")
total_missing = df.isna().sum().sum()
display(total_missing)

Unique values in readmis: [0 1]
Unique values in soft_drink: [0 1]
Unique values in highblood: [1 0]
Unique values in stroke: [0 1]
Unique values in overweight: [0 1]
Unique values in arthritis: [1 0]
Unique values in diabetes: [1 0]
Unique values in hyperlipidemia: [0 1]
Unique values in backpain: [1 0]
Unique values in anxiety: [1 0]
Unique values in allergic_rhinitis: [1 0]
Unique values in reflux_esophagitis: [0 1]
Unique values in asthma: [1 0]




Index(['readmis', 'vitd_levels', 'doc_visits', 'full_meals_eaten', 'vitd_supp',
       'soft_drink', 'highblood', 'stroke', 'complication_risk', 'overweight',
       'arthritis', 'diabetes', 'hyperlipidemia', 'backpain', 'anxiety',
       'allergic_rhinitis', 'reflux_esophagitis', 'asthma',
       'initial_admin_Emergency Admission',
       'initial_admin_Observation Admission', 'services_CT Scan',
       'services_Intravenous', 'services_MRI'],
      dtype='object')





array([1, 2, 0], dtype=int64)


Number of NaN Values:


0

In [14]:
# Final Profiling
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; passing it to display() would add a stray "None" to the output.
df.info()
# First five rows of the fully encoded frame
display(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 23 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   readmis                              10000 non-null  int64  
 1   vitd_levels                          10000 non-null  float64
 2   doc_visits                           10000 non-null  int64  
 3   full_meals_eaten                     10000 non-null  int64  
 4   vitd_supp                            10000 non-null  int64  
 5   soft_drink                           10000 non-null  int64  
 6   highblood                            10000 non-null  int64  
 7   stroke                               10000 non-null  int64  
 8   complication_risk                    10000 non-null  int64  
 9   overweight                           10000 non-null  int64  
 10  arthritis                            10000 non-null  int64  
 11  diabetes                     

None

Unnamed: 0,readmis,vitd_levels,doc_visits,full_meals_eaten,vitd_supp,soft_drink,highblood,stroke,complication_risk,overweight,...,backpain,anxiety,allergic_rhinitis,reflux_esophagitis,asthma,initial_admin_Emergency Admission,initial_admin_Observation Admission,services_CT Scan,services_Intravenous,services_MRI
0,0,19.141466,6,0,0,0,1,0,1,0,...,1,1,1,0,1,1,0,0,0,0
1,0,18.940352,4,2,1,0,1,0,2,1,...,0,0,0,1,0,1,0,0,1,0
2,0,18.057507,4,1,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,0,16.576858,4,1,0,0,0,1,1,0,...,0,0,0,1,1,0,0,0,0,0
4,0,17.439069,5,0,2,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0


In [15]:
# Exporting encoded dataset
# index=False keeps the row index out of the file, consistent with the
# train/validation/test exports further down; without it the index is written
# as an extra unnamed first column.
df.to_csv("encoded_data.csv", index=False)

Part 2: Splitting and Initial Model

In [17]:
# Building the Train / Validation / Test partitions

# The target is 'readmis'; the features are every remaining column
y = df['readmis']
X = df.drop('readmis', axis=1)

# First pass: hold out 20% of the rows, stratified on the target
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Second pass: cut the hold-out in half to obtain the Validation and Test sets
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

In [18]:
# Validating proper split: shape of each feature matrix, then each target vector
named_splits = (
    ("X_train", X_train),
    ("X_val", X_val),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_val", y_val),
    ("y_test", y_test),
)
for split_name, split_part in named_splits:
    print(f"{split_name} Shape: {split_part.shape}")

# Validating stratification: class proportions should match across the splits
named_targets = (
    ("Training", y_train),
    ("Validation", y_val),
    ("Test", y_test),
)
for set_label, target in named_targets:
    print(f"\nDistribution in {set_label} Set:")
    print(target.value_counts(normalize=True))


X_train Shape: (8000, 22)
X_val Shape: (1000, 22)
X_test Shape: (1000, 22)
y_train Shape: (8000,)
y_val Shape: (1000,)
y_test Shape: (1000,)

Distribution in Training Set:
readmis
0    0.633125
1    0.366875
Name: proportion, dtype: float64

Distribution in Validation Set:
readmis
0    0.633
1    0.367
Name: proportion, dtype: float64

Distribution in Test Set:
readmis
0    0.633
1    0.367
Name: proportion, dtype: float64


In [19]:
# Writing each partition to its own CSV file

## Gluing the features and target of every split back together, column-wise
train_data, val_data, test_data = (
    pd.concat(list(feature_target_pair), axis=1)
    for feature_target_pair in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

## Saving the three frames without their row index
export_targets = (
    (train_data, "train_data.csv"),
    (val_data, "val_data.csv"),
    (test_data, "test_data.csv"),
)
for split_frame, csv_name in export_targets:
    split_frame.to_csv(csv_name, index=False)


In [20]:
# Baseline Gradient Boosting classifier (before any tuning)

## Settings used for the first fit
baseline_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 3,
    "random_state": 42,
}

## Fitting on the training split (fit() returns the estimator itself)
gbc = GradientBoostingClassifier(**baseline_params).fit(X_train, y_train)

## Scoring the validation split: hard labels, plus the probability column at index 1
y_pred = gbc.predict(X_val)
y_proba = gbc.predict_proba(X_val)[:, 1]

In [21]:
# Reporting validation metrics for the baseline model

## Label-based scores compare y_val with the predicted classes;
## AUC-ROC is computed from the predicted probabilities instead
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
auc = roc_auc_score(y_val, y_proba)
conf_matrix = confusion_matrix(y_val, y_pred)

## Printing every scalar score to three decimals, then the confusion matrix
baseline_scores = (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("AUC-ROC", auc),
)
for score_name, score_value in baseline_scores:
    print(f"{score_name}: {score_value:.3f}")
print("\nConfusion Matrix:")
print(conf_matrix)

Accuracy: 0.630
Precision: 0.385
Recall: 0.014
F1 Score: 0.026
AUC-ROC: 0.516

Confusion Matrix:
[[625   8]
 [362   5]]


In [22]:
# Hyperparameter tuning with a 5-fold cross-validated grid search

## Candidate values: 4 x 4 x 4 = 64 combinations
param_grid = dict(
    n_estimators=[50, 75, 100, 200],
    learning_rate=[0.001, 0.01, 0.1, 0.2],
    max_depth=[3, 5, 7, 9],
)

## Base estimator and search configuration (candidates are ranked by ROC AUC)
gbm = GradientBoostingClassifier(random_state=42)
search_settings = {
    "estimator": gbm,
    "param_grid": param_grid,
    "scoring": 'roc_auc',
    "cv": 5,
    "verbose": 1,
    "n_jobs": -1,
}
grid_search = GridSearchCV(**search_settings)

## Running the search on the training split and reporting the winning combination
grid_search.fit(X_train, y_train)
print("\n GridSearch Complete")
print("Best hyperparameters:", grid_search.best_params_)

Fitting 5 folds for each of 64 candidates, totalling 320 fits

 GridSearch Complete
Best hyperparameters: {'learning_rate': 0.2, 'max_depth': 9, 'n_estimators': 200}


In [23]:
# Retraining the model using the optimized hyperparameters
# The settings are taken from grid_search.best_params_ instead of being typed
# in by hand, so this cell always matches what the grid search selected, even
# if the parameter grid or the data changes. best_params_ only contains keys
# from the parameter grid, so random_state is passed separately.
best_model_retrained = GradientBoostingClassifier(
    random_state=42,
    **grid_search.best_params_
)

# Fitting on the training split; as the last expression, the fitted estimator
# is also shown as the cell's output
best_model_retrained.fit(X_train, y_train)

In [24]:
# Scoring the retrained model on the validation split:
# the probability column at index 1, and the hard class labels
y_val_proba = best_model_retrained.predict_proba(X_val)[:, 1]
y_val_pred = best_model_retrained.predict(X_val)


In [25]:
# Reporting validation metrics for the retrained model

## Label-based scores compare y_val with the predicted classes;
## AUC-ROC is computed from the predicted probabilities instead
best_accuracy = accuracy_score(y_val, y_val_pred)
best_precision = precision_score(y_val, y_val_pred)
best_recall = recall_score(y_val, y_val_pred)
best_f1 = f1_score(y_val, y_val_pred)
best_auc = roc_auc_score(y_val, y_val_proba)
best_conf_matrix = confusion_matrix(y_val, y_val_pred)

## Printing every scalar score to three decimals, then the confusion matrix
tuned_scores = (
    ("Accuracy", best_accuracy),
    ("Precision", best_precision),
    ("Recall", best_recall),
    ("F1 Score", best_f1),
    ("AUC-ROC", best_auc),
)
for score_name, score_value in tuned_scores:
    print(f"{score_name}: {score_value:.3f}")
print("\nConfusion Matrix:")
print(best_conf_matrix)

Accuracy: 0.582
Precision: 0.404
Recall: 0.292
F1 Score: 0.339
AUC-ROC: 0.516

Confusion Matrix:
[[475 158]
 [260 107]]


In [26]:
# Scoring the retrained model on the Test split

# Final predictions: the probability column at index 1, and the hard class labels
y_test_proba = best_model_retrained.predict_proba(X_test)[:, 1]
y_test_pred = best_model_retrained.predict(X_test)

In [27]:
## Test-set metrics: label-based scores compare y_test with the predicted
## classes; AUC-ROC is computed from the predicted probabilities instead
final_accuracy = accuracy_score(y_test, y_test_pred)
final_precision = precision_score(y_test, y_test_pred)
final_recall = recall_score(y_test, y_test_pred)
final_f1 = f1_score(y_test, y_test_pred)
final_auc = roc_auc_score(y_test, y_test_proba)
final_conf_matrix = confusion_matrix(y_test, y_test_pred)

## Printing every scalar score to three decimals, then the confusion matrix
final_scores = (
    ("Accuracy", final_accuracy),
    ("Precision", final_precision),
    ("Recall", final_recall),
    ("F1 Score", final_f1),
    ("AUC-ROC", final_auc),
)
for score_name, score_value in final_scores:
    print(f"{score_name}: {score_value:.3f}")
print("\nConfusion Matrix:")
print(final_conf_matrix)

Accuracy: 0.553
Precision: 0.336
Recall: 0.223
F1 Score: 0.268
AUC-ROC: 0.487

Confusion Matrix:
[[471 162]
 [285  82]]
