In [None]:
# Import necessary libraries

import sklearn
import xgboost
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTENC
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from xgboost import XGBClassifier

In [None]:
# Read the full dataset from the Excel workbook into a DataFrame
dataset_path = '/content/va_dataset_full.xlsx'
df = pd.read_excel(dataset_path)

In [None]:
def _count_duplicates(frame):
    """Print and return the number of fully duplicated rows in `frame`."""
    n_dupes = frame.duplicated().sum()
    print(f"Number of duplicate rows: {n_dupes}")
    return n_dupes


# Report the duplicate count, drop the duplicated rows, then report again
# to confirm that none remain
duplicates = _count_duplicates(df)
df = df.drop_duplicates()
duplicates = _count_duplicates(df)

Number of duplicate rows: 1503
Number of duplicate rows: 0


In [None]:
# Partition the column names by dtype: int64/float64 columns count as
# numerical, object columns count as categorical
_numeric_dtypes = ['int64', 'float64']
numerical_cols = df.select_dtypes(include=_numeric_dtypes).columns
categorical_cols = df.select_dtypes(include=['object']).columns

In [None]:
# Report the frame's dimensions, then preview its first rows
dataset_shape = df.shape
print("Dataset shape: ", dataset_shape, "\n")
df.head()

Dataset shape:  (6338, 21) 



Unnamed: 0,age,had_diabetes,had_heart_disease,had_hypertension,had_obesity,had_stroke,had_blue_lips,had_ankle_swelling,had_puffiness,had_diff_breathing,...,fast_breathing,had_wheezed,had_chest_pain,chest_pain_duration,physical_action_painful,pain_location,urine_stop,had_lost_consciousness,had_confusion,heart_disease
0,76,No,Yes,No,No,No,No,No,No,Yes,...,Yes,No,No,Don't Know,No,Don't Know,No,No,No,1
1,50,No,No,Yes,No,Yes,No,No,No,Yes,...,Don't Know,No,No,Don't Know,No,Don't Know,No,Yes,No,1
2,61,Yes,Yes,Yes,No,No,No,Yes,No,Yes,...,Yes,No,No,Don't Know,No,Don't Know,Yes,No,No,1
3,78,No,No,No,No,No,No,No,No,Yes,...,No,No,No,Don't Know,No,Don't Know,No,Yes,No,1
4,70,No,Yes,Yes,No,No,No,No,No,Yes,...,Yes,Yes,Yes,0.5-24 hours,Yes,Upper/middle chest,No,No,No,1


In [None]:
# Summarise column dtypes and non-null counts before any cleaning is applied
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6338 entries, 0 to 7840
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   age                      6338 non-null   object
 1   had_diabetes             6334 non-null   object
 2   had_heart_disease        6335 non-null   object
 3   had_hypertension         6335 non-null   object
 4   had_obesity              6333 non-null   object
 5   had_stroke               6333 non-null   object
 6   had_blue_lips            6337 non-null   object
 7   had_ankle_swelling       6335 non-null   object
 8   had_puffiness            6338 non-null   object
 9   had_diff_breathing       6338 non-null   object
 10  breathing_on_off         6338 non-null   object
 11  fast_breathing           6326 non-null   object
 12  had_wheezed              6325 non-null   object
 13  had_chest_pain           6328 non-null   object
 14  chest_pain_duration      6338 non-null   obje

In [None]:
# List the distinct values found in every categorical (object-typed) column
for col_name in categorical_cols:
    distinct_values = df[col_name].unique()
    print('Column:', col_name)
    print(distinct_values)
    print()

Column: age
[76 50 61 78 70 82 54 62 52 45 60 "Don't Know" 69 75 49 43 71 33 59 64 90
 55 65 81 42 80 84 53 72 74 36 73 57 48 51 67 34 38 79 58 66 40 87 35 68
 92 77 56 63 85 89 41 22 37 30 96 25 91 102 46 32 44 47 17 28 39 83 19 86
 24 95 26 88 93 98 16 20 18 23 94 15 31 29 21 13 27 12 14 97 100 99 11]

Column: had_diabetes
['No' 'Yes' "Don't Know" nan 'Refused to Answer']

Column: had_heart_disease
['Yes' 'No' "Don't Know" 'Refused to Answer' nan]

Column: had_hypertension
['No' 'Yes' "Don't Know" 'Refused to Answer' nan]

Column: had_obesity
['No' 'Yes' "Don't Know" nan 'Refused to Answer']

Column: had_stroke
['No' 'Yes' "Don't Know" nan]

Column: had_blue_lips
['No' 'Yes' "Don't Know" nan]

Column: had_ankle_swelling
['No' 'Yes' "Don't Know" nan]

Column: had_puffiness
['No' 'Yes' "Don't Know" 'Refused to Answer']

Column: had_diff_breathing
['Yes' 'No' "Don't Know"]

Column: breathing_on_off
['Continuous' 'On and Off' "Don't Know"]

Column: fast_breathing
['Yes' "Don't Know" 'No'

In [None]:
# Count the rows in each class of the target to expose any class imbalance
target_distribution = df['heart_disease'].value_counts()

# Heading and counts are emitted on consecutive lines
print("Class Distribution in Target Variable (y):", target_distribution, sep="\n")

Class Distribution in Target Variable (y):
heart_disease
0    5345
1     993
Name: count, dtype: int64


In [None]:
# Count missing values per column and summarise the result.
# The summary line is derived from the counts so it can never contradict
# the table printed above it.
null_counts = df.isnull().sum()
print(null_counts)

total_nulls = int(null_counts.sum())
if total_nulls == 0:
    print("\nThere are no null values in this dataset")
else:
    affected_columns = int((null_counts > 0).sum())
    print(f"\nThere are {total_nulls} null values across {affected_columns} columns in this dataset")

age                         0
had_diabetes                4
had_heart_disease           3
had_hypertension            3
had_obesity                 5
had_stroke                  5
had_blue_lips               1
had_ankle_swelling          3
had_puffiness               0
had_diff_breathing          0
breathing_on_off            0
fast_breathing             12
had_wheezed                13
had_chest_pain             10
chest_pain_duration         0
physical_action_painful     0
pain_location               0
urine_stop                  5
had_lost_consciousness      1
had_confusion               3
heart_disease               0
dtype: int64

There are no null values in this dataset


In [None]:
# Impute missing categorical values with each column's most frequent value.
#
# The filled Series is assigned back to df[col] instead of calling
# fillna(..., inplace=True) on df[col]: the in-place call operates on an
# intermediate object, raises a FutureWarning, and will no longer update df
# in pandas 3.0.
#
# NOTE(review): the mode is computed on the full frame, before the train/test
# split, so test rows influence the imputed value. Confirm this is acceptable.
for col in categorical_cols:
    if not df[col].isnull().any():
        continue  # nothing to impute in this column

    modes = df[col].mode()
    if modes.empty:
        continue  # column is entirely null, so there is no mode to fill with

    mode_value = modes.iloc[0]
    df[col] = df[col].fillna(mode_value)

print("\nNull values in categorical columns replaced with the most frequent class")



Null values in categorical columns replaced with the most frequent class


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mode_value, inplace=True)


In [None]:
# Encode unknown ages ("Don't Know") with the sentinel -1, then cast the
# whole 'age' column to integers in one chained expression
df['age'] = df['age'].replace("Don't Know", '-1').astype(int)

In [None]:
# Re-check dtypes and non-null counts after imputation and the 'age' cast
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6338 entries, 0 to 7840
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   age                      6338 non-null   int64 
 1   had_diabetes             6338 non-null   object
 2   had_heart_disease        6338 non-null   object
 3   had_hypertension         6338 non-null   object
 4   had_obesity              6338 non-null   object
 5   had_stroke               6338 non-null   object
 6   had_blue_lips            6338 non-null   object
 7   had_ankle_swelling       6338 non-null   object
 8   had_puffiness            6338 non-null   object
 9   had_diff_breathing       6338 non-null   object
 10  breathing_on_off         6338 non-null   object
 11  fast_breathing           6338 non-null   object
 12  had_wheezed              6338 non-null   object
 13  had_chest_pain           6338 non-null   object
 14  chest_pain_duration      6338 non-null   obje

In [None]:
# Separate the target column (y) from the predictor columns (X)
y = df['heart_disease']
X = df.drop(columns='heart_disease')

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Recompute the dtype-based column groups on the training predictors:
# int64/float64 columns are numerical, object columns are categorical
_numeric_dtypes = ['int64', 'float64']
numerical_cols = X_train.select_dtypes(include=_numeric_dtypes).columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

In [None]:
# Oversample the minority class with SMOTENC until it is half the size of the
# majority class (a 1:2 minority:majority ratio).
class_counts = y_train.value_counts()
majority_class_count = class_counts.max()  # rows in the most frequent class
minority_class_count = class_counts.min()  # rows in the least frequent class

# Look the class labels up from the data rather than assuming that label 0 is
# the majority and label 1 the minority.
majority_label = class_counts.idxmax()
minority_label = class_counts.idxmin()

# Target minority size for the 1:2 ratio. An over-sampler cannot shrink a
# class, so never request fewer rows than the minority class already has.
required_minority_samples = max(minority_class_count, majority_class_count // 2)

# Per-class target sizes: the majority is left as is, the minority is grown
sampling_strategy = {
    majority_label: majority_class_count,
    minority_label: required_minority_samples,
}

# SMOTENC needs the positional indices of the categorical columns
categorical_indices = [X_train.columns.get_loc(col) for col in categorical_cols]

# Apply SMOTENC with the custom sampling strategy
smote_nc = SMOTENC(
    categorical_features=categorical_indices,
    sampling_strategy=sampling_strategy,
    random_state=42,
)
# NOTE(review): the later cells visible in this notebook fit on X_train / y_train,
# not on these resampled outputs - confirm whether the resampled data should be
# used for training.
X_train_resampled, y_train_resampled = smote_nc.fit_resample(X_train, y_train)

# Check the new distribution
balanced_distribution = pd.Series(y_train_resampled).value_counts()
print("\nClass Distribution After Applying SMOTE (1:2 ratio):")
print(balanced_distribution)


Class Distribution After Applying SMOTE (1:2 ratio):
heart_disease
0    4276
1    2138
Name: count, dtype: int64


In [None]:
# # Get categorical column indices
# categorical_indices = [X_train.columns.get_loc(col) for col in categorical_cols]

# # Apply SMOTENC to handle imbalanced classes
# smote_nc = SMOTENC(categorical_features=categorical_indices, random_state=42)
# X_train, y_train = smote_nc.fit_resample(X_train, y_train)

# # Check the new distribution
# balanced_distribution = pd.Series(y_train).value_counts()
# print("\nClass Distribution After Applying SMOTE:")
# print(balanced_distribution)

ValueError: could not convert string to float: 'No'

In [None]:
# One-hot encode the categorical predictors. The encoder learns its categories
# from the training split only and is then reused unchanged on the test split.
if len(categorical_cols) == 0:
    # Nothing to encode: just renumber both indexes from zero
    X_train = X_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
else:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoder.fit(X_train[categorical_cols])
    encoded_names = encoder.get_feature_names_out(categorical_cols)

    # Encoded blocks as DataFrames with a fresh 0..n-1 index
    X_train_encoded = pd.DataFrame(encoder.transform(X_train[categorical_cols]), columns=encoded_names)
    X_test_encoded = pd.DataFrame(encoder.transform(X_test[categorical_cols]), columns=encoded_names)

    # Swap the raw categorical columns for their encoded counterparts; the index
    # reset makes the remaining columns line up row-for-row with the encoded block
    train_rest = X_train.drop(columns=categorical_cols).reset_index(drop=True)
    test_rest = X_test.drop(columns=categorical_cols).reset_index(drop=True)
    X_train = pd.concat([train_rest, X_train_encoded], axis=1)
    X_test = pd.concat([test_rest, X_test_encoded], axis=1)

X_train.head()

Unnamed: 0,age,had_diabetes_Don't Know,had_diabetes_No,had_diabetes_Yes,had_heart_disease_Don't Know,had_heart_disease_No,had_heart_disease_Refused to Answer,had_heart_disease_Yes,had_hypertension_Don't Know,had_hypertension_No,...,urine_stop_Don't Know,urine_stop_No,urine_stop_Refused to Answer,urine_stop_Yes,had_lost_consciousness_Don't Know,had_lost_consciousness_No,had_lost_consciousness_Yes,had_confusion_Don't Know,had_confusion_No,had_confusion_Yes
0,51,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,62,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,16,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,22,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,65,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [None]:
# # Initialize Random Forest with class weight balancing
# rf_model = RandomForestClassifier(
#     random_state=42,  # Ensures reproducibility
#     n_jobs=-1,        # Use all available processors
#     criterion='entropy',  # Use 'entropy' criterion
#     max_depth=30,          # Set the maximum depth of the tree
#     min_samples_leaf=1,    # Minimum number of samples required to be at a leaf node
#     min_samples_split=10,  # Minimum number of samples required to split an internal node
#     n_estimators=350      # Number of trees in the forest
# )

# # Train the model
# rf_model.fit(X_train, y_train)

# # Perform cross-validation
# cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5)  # cv=5 means 5-fold cross-validation

In [None]:
# # Make predictions
# rf_predictions = rf_model.predict(X_test)

# # Evaluate the model
# print("Random Forest Classifier Performance:\n")
# print(f"Mean CV Score: {np.mean(cv_scores):.2f}")
# print(f"Accuracy: {accuracy_score(y_test, rf_predictions):.2f}")
# print("\nClassification Report:")
# print(classification_report(y_test, rf_predictions))

# # ROC-AUC score (for binary classification)
# if len(set(y_test)) == 2:  # Check if it's binary classification
#     rf_roc_auc = roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1])
#     print(f"ROC-AUC Score: {rf_roc_auc:.2f}")
# else:
#     print("ROC-AUC is only applicable for binary classification.")

# print("\nConfusion Matrix:")
# print(confusion_matrix(y_test, rf_predictions))

In [None]:
# Base estimator whose hyperparameters are tuned below
rf_model = RandomForestClassifier(random_state=42, n_jobs=-1)

# Search space: four settings are varied over two values each (16 candidates);
# criterion, max_features and bootstrap are pinned to a single value
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    criterion=['gini'],
    max_features=['sqrt'],
    bootstrap=[True],
)

# Exhaustive search with 5-fold cross-validation, scored by accuracy;
# verbose=10 prints progress for the individual fits
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=10,
    n_jobs=-1,
)

# Run the search on the training data
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [None]:
# Keep a handle on the best estimator found by the grid search
best_rf_model = grid_search.best_estimator_

# Report the winning parameter combination and its cross-validation score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.4f}".format(grid_search.best_score_))

# Optional follow-up: evaluate the tuned model on the held-out test set
# y_pred = best_rf_model.predict(X_test)

Best parameters found:  {'bootstrap': True, 'criterion': 'gini', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Best cross-validation score: 0.8717


In [None]:
# Settings for the final forest, plus class re-weighting and an impurity
# threshold for splits
rf_settings = {
    'random_state': 42,
    'n_jobs': -1,
    'criterion': 'gini',
    'max_depth': 20,
    'min_samples_leaf': 1,
    'min_samples_split': 5,
    'max_features': 'sqrt',
    'n_estimators': 200,
    'bootstrap': True,
    'class_weight': 'balanced',      # re-weight classes to counter the imbalance
    'min_impurity_decrease': 0.01,   # only split when impurity drops by at least this much
}
rf_model = RandomForestClassifier(**rf_settings)

# Fit on the full training split
rf_model.fit(X_train, y_train)

# 5-fold cross-validation of the same configuration on the training split
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5)

In [None]:
# Score the fitted forest on the held-out split and gather the metrics first
rf_predictions = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_report = classification_report(y_test, rf_predictions)
rf_confusion = confusion_matrix(y_test, rf_predictions)

# Print the summary
print("Random Forest Classifier Performance:\n")
print(f"Mean CV Score: {np.mean(cv_scores):.2f}")
print(f"Accuracy: {rf_accuracy:.2f}")
print("\nClassification Report:")
print(rf_report)

# ROC-AUC is computed from positive-class probabilities, binary targets only
if len(set(y_test)) != 2:
    print("ROC-AUC is only applicable for binary classification.")
else:
    rf_roc_auc = roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1])
    print(f"ROC-AUC Score: {rf_roc_auc:.2f}")

print("\nConfusion Matrix:")
print(rf_confusion)

Random Forest Classifier Performance:

Mean CV Score: 0.72
Accuracy: 0.72

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.71      0.81      1069
           1       0.33      0.76      0.46       199

    accuracy                           0.72      1268
   macro avg       0.64      0.74      0.64      1268
weighted avg       0.85      0.72      0.76      1268

ROC-AUC Score: 0.82

Confusion Matrix:
[[763 306]
 [ 47 152]]


In [None]:
# Show the installed versions of the two modelling libraries
for library in (sklearn, xgboost):
    print(library.__version__)

1.6.1
2.1.3


In [None]:
# # Define the model
# xgb_model = xgb.XGBClassifier(random_state=42, n_jobs=-1)

# # Define the parameter grid
# param_grid = {
#     'n_estimators': [100, 200, 300],            # Number of boosting rounds (trees)
#     'max_depth': [3, 5, 7],                     # Maximum depth of each tree
#     'learning_rate': [0.01, 0.1, 0.2],          # Step size at each iteration
#     'subsample': [0.8, 1.0],                    # Fraction of samples used for training each tree
#     'colsample_bytree': [0.8, 1.0],              # Fraction of features used for each tree
#     'gamma': [0, 0.1, 0.2],                     # Minimum loss reduction required to make a further partition
#     'reg_alpha': [0, 0.1, 1],                   # L1 regularization term
#     'reg_lambda': [0, 0.1, 1],                  # L2 regularization term
#     'scale_pos_weight': [1, 10]                 # For handling class imbalance (if needed)
# }

# # Set up GridSearchCV with 5-fold cross-validation
# grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid,
#                            cv=5, n_jobs=-1, verbose=2, scoring='accuracy')

In [None]:
def _sanitize_feature_names(columns):
    """Return the column names as strings with '[', ']', '<' and '>' each replaced by '_'."""
    replacements = str.maketrans({char: '_' for char in '[]<>'})
    return [str(col).translate(replacements) for col in columns]


# Make the feature names valid for XGBoost by replacing bracket and comparison
# characters. Each frame is sanitised from its OWN columns: building the test
# names from X_train.columns would silently relabel X_test if the two frames
# ever differed in column order or content.
X_train.columns = _sanitize_feature_names(X_train.columns)
X_test.columns = _sanitize_feature_names(X_test.columns)

In [None]:
# Weight the positive class by the negative/positive ratio of the training
# labels so that the classes are actually balanced (scale_pos_weight=1 applies
# no re-weighting at all). Falls back to 1 if there are no positive rows.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
positive_class_weight = n_negative / n_positive if n_positive else 1

# Define the model with the recommended parameters
xgb_model = xgb.XGBClassifier(
    n_estimators=100,  # Moderate number of trees
    max_depth=5,        # Balanced tree depth
    learning_rate=0.1,  # Standard learning rate
    subsample=0.8,      # Prevent overfitting with a bit of randomness
    colsample_bytree=0.8,  # Use 80% of features for each tree
    gamma=0.1,          # Moderate regularization to control complexity
    reg_alpha=0,        # No L1 regularization (unless overfitting occurs)
    reg_lambda=1,       # Standard L2 regularization
    scale_pos_weight=positive_class_weight,  # negatives / positives in y_train
    random_state=42,
    n_jobs=-1
)

# Fit the model on the training data. The trailing semicolon suppresses the
# estimator repr as cell output.
# NOTE(review): rendering that repr presumably is what raised the
# '__sklearn_tags__' AttributeError with sklearn 1.6.1 + xgboost 2.1.3 -
# confirm, and consider aligning the two library versions.
xgb_model.fit(X_train, y_train);

AttributeError: 'super' object has no attribute '__sklearn_tags__'

AttributeError: 'super' object has no attribute '__sklearn_tags__'

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0.1, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=100, n_jobs=-1,
              num_parallel_tree=None, random_state=42, ...)

In [None]:
# Hard class predictions for accuracy, the classification report and the
# confusion matrix
xgb_predictions = xgb_model.predict(X_test)

# Positive-class probabilities for ROC-AUC. Hard 0/1 labels would collapse the
# ROC curve to a single operating point; this matches how the Random Forest
# cell computes its ROC-AUC.
xgb_probabilities = xgb_model.predict_proba(X_test)[:, 1]

# 5-fold cross-validation of the same configuration on the training split
cv_scores = cross_val_score(xgb_model, X_train, y_train, cv=5)

# Evaluate the model
print("XGBoost Classifier Performance:\n")
print(f"Mean CV Score: {np.mean(cv_scores):.2f}")
print(f"Accuracy: {accuracy_score(y_test, xgb_predictions):.2f}")
print("\nClassification Report:")
print(classification_report(y_test, xgb_predictions))

# ROC-AUC Score (from probabilities, not from thresholded labels)
roc_auc = roc_auc_score(y_test, xgb_probabilities)
print(f"\nROC-AUC Score: {roc_auc:.2f}")

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, xgb_predictions)
print("\nConfusion Matrix:")
print(conf_matrix)

ValueError: feature_names mismatch: ['age', "had_diabetes_Don't Know", 'had_diabetes_No', 'had_diabetes_Yes', "had_heart_disease_Don't Know", 'had_heart_disease_No', 'had_heart_disease_Refused to Answer', 'had_heart_disease_Yes', "had_hypertension_Don't Know", 'had_hypertension_No', 'had_hypertension_Refused to Answer', 'had_hypertension_Yes', "had_obesity_Don't Know", 'had_obesity_No', 'had_obesity_Refused to Answer', 'had_obesity_Yes', "had_stroke_Don't Know", 'had_stroke_No', 'had_stroke_Yes', "had_blue_lips_Don't Know", 'had_blue_lips_No', 'had_blue_lips_Yes', "had_ankle_swelling_Don't Know", 'had_ankle_swelling_No', 'had_ankle_swelling_Yes', "had_puffiness_Don't Know", 'had_puffiness_No', 'had_puffiness_Refused to Answer', 'had_puffiness_Yes', "had_diff_breathing_Don't Know", 'had_diff_breathing_No', 'had_diff_breathing_Yes', 'breathing_on_off_Continuous', "breathing_on_off_Don't Know", 'breathing_on_off_On and Off', "fast_breathing_Don't Know", 'fast_breathing_No', 'fast_breathing_Yes', "had_wheezed_Don't Know", 'had_wheezed_No', 'had_wheezed_Yes', "had_chest_pain_Don't Know", 'had_chest_pain_No', 'had_chest_pain_Yes', 'chest_pain_duration_0.5-24 hours', 'chest_pain_duration__30 minutes', 'chest_pain_duration__24 hr', "chest_pain_duration_Don't Know", 'chest_pain_duration_Refused to Answer', "physical_action_painful_Don't Know", 'physical_action_painful_No', 'physical_action_painful_Yes', "pain_location_Don't Know", 'pain_location_Left Arm', 'pain_location_Lower chest', 'pain_location_Other', 'pain_location_Refused to Answer', 'pain_location_Upper/middle chest', "urine_stop_Don't Know", 'urine_stop_No', 'urine_stop_Refused to Answer', 'urine_stop_Yes', "had_lost_consciousness_Don't Know", 'had_lost_consciousness_No', 'had_lost_consciousness_Yes', "had_confusion_Don't Know", 'had_confusion_No', 'had_confusion_Yes'] ['age', "had_diabetes_Don't Know", 'had_diabetes_No', 'had_diabetes_Yes', "had_heart_disease_Don't Know", 'had_heart_disease_No', 
'had_heart_disease_Refused to Answer', 'had_heart_disease_Yes', "had_hypertension_Don't Know", 'had_hypertension_No', 'had_hypertension_Refused to Answer', 'had_hypertension_Yes', "had_obesity_Don't Know", 'had_obesity_No', 'had_obesity_Refused to Answer', 'had_obesity_Yes', "had_stroke_Don't Know", 'had_stroke_No', 'had_stroke_Yes', "had_blue_lips_Don't Know", 'had_blue_lips_No', 'had_blue_lips_Yes', "had_ankle_swelling_Don't Know", 'had_ankle_swelling_No', 'had_ankle_swelling_Yes', "had_puffiness_Don't Know", 'had_puffiness_No', 'had_puffiness_Refused to Answer', 'had_puffiness_Yes', "had_diff_breathing_Don't Know", 'had_diff_breathing_No', 'had_diff_breathing_Yes', 'breathing_on_off_Continuous', "breathing_on_off_Don't Know", 'breathing_on_off_On and Off', "fast_breathing_Don't Know", 'fast_breathing_No', 'fast_breathing_Yes', "had_wheezed_Don't Know", 'had_wheezed_No', 'had_wheezed_Yes', "had_chest_pain_Don't Know", 'had_chest_pain_No', 'had_chest_pain_Yes', 'chest_pain_duration_0.5-24 hours', 'chest_pain_duration_<30 minutes', 'chest_pain_duration_>24 hr', "chest_pain_duration_Don't Know", 'chest_pain_duration_Refused to Answer', "physical_action_painful_Don't Know", 'physical_action_painful_No', 'physical_action_painful_Yes', "pain_location_Don't Know", 'pain_location_Left Arm', 'pain_location_Lower chest', 'pain_location_Other', 'pain_location_Refused to Answer', 'pain_location_Upper/middle chest', "urine_stop_Don't Know", 'urine_stop_No', 'urine_stop_Refused to Answer', 'urine_stop_Yes', "had_lost_consciousness_Don't Know", 'had_lost_consciousness_No', 'had_lost_consciousness_Yes', "had_confusion_Don't Know", 'had_confusion_No', 'had_confusion_Yes']
expected chest_pain_duration__30 minutes, chest_pain_duration__24 hr in input data
training data did not have the following fields: chest_pain_duration_>24 hr, chest_pain_duration_<30 minutes