In [None]:
!pip install scikeras


Collecting scikeras
  Downloading scikeras-0.12.0-py3-none-any.whl (27 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.12.0


In [124]:
# Necessary Imports
import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from scikeras.wrappers import KerasClassifier
from keras import layers, models, optimizers, losses, metrics
from keras.models import Sequential
from keras.layers import Conv2D, GlobalAveragePooling2D, Dense, MaxPooling1D, Dropout, BatchNormalization, Conv1D, Flatten
from keras.losses import SparseCategoricalCrossentropy
from keras.optimizers import Adam
from keras.callbacks import TensorBoard
from keras.utils import to_categorical

In [125]:
# Load dataset
df = pd.read_csv('dataset.csv')
print("Dataset Size:", df.shape)

# Label encoder
label_encoder = LabelEncoder()

# Replace 'Unknown' with NaN
df['Genetic Disorder'].replace('Unknown', np.nan, inplace=True)
df['Disorder Subclass'].replace('Unknown', np.nan, inplace=True)

df.dropna(inplace=True)




Dataset Size: (22083, 33)


In [126]:
categorical_unordered_columns = [
    'Genes in mother\'s side', 'Inherited from father', 'Maternal gene', 'Paternal gene',
    'Gender', 'Birth asphyxia',
    'Autopsy shows birth defect (if applicable)', 'Place of birth',
    'Folic acid details (peri-conceptional)', 'H/O serious maternal illness',
    'H/O radiation exposure (xUnknownray)', 'H/O substance abuse', 'Status',
    'Assisted conception IVF/ART', 'History of anomalies in previous pregnancies',
    'Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4', 'Symptom 5'
]

quantitative_with_unknowns_or_ordered_columns = ['Patient Age', "Mother's age", "Father's age", 'Respiratory Rate (breaths/min)',
    'Heart Rate (rates/min)', 'Follow-up', 'No. of previous abortion',
    'Birth defects', 'White Blood cell count (thousand per microliter)', 'Blood test result']


df_encoded=df.copy()
for column in quantitative_with_unknowns_or_ordered_columns:
    df_encoded[column] = label_encoder.fit_transform(df_encoded[column].astype(str))
df_encoded=pd.get_dummies(df_encoded, columns=categorical_unordered_columns, drop_first=False)

Run this following cell if you want to use features selection:

In [127]:
# With feature selection

df=df[["Symptom 5", "Symptom 4", "Symptom 3", "Symptom 2", "Genes in mother's side", "Inherited from father","Genetic Disorder","Disorder Subclass"]]

df_encoded=df.copy()
df_encoded=pd.get_dummies(df_encoded, columns=["Symptom 5", "Symptom 4", "Symptom 3", "Symptom 2", "Genes in mother's side", "Inherited from father"], drop_first=False)

In [128]:
# Extract features and target
X = df_encoded.drop(columns=['Genetic Disorder', 'Disorder Subclass'], axis=1)
y = df_encoded[['Genetic Disorder','Disorder Subclass']]

In [129]:
# Check if the target variable has more than 2 classes
if y['Genetic Disorder'].nunique() > 2:
    label_encoder_gd = LabelEncoder()
    y['Genetic Disorder'] = label_encoder_gd.fit_transform(y['Genetic Disorder'])

if y['Disorder Subclass'].nunique() > 2:
    label_encoder_ds = LabelEncoder()
    y['Disorder Subclass'] = label_encoder_ds.fit_transform(y['Disorder Subclass'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['Genetic Disorder'] = label_encoder_gd.fit_transform(y['Genetic Disorder'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['Disorder Subclass'] = label_encoder_ds.fit_transform(y['Disorder Subclass'])


In [130]:
# Scale features

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Tensorboard

log_dir_gd = "logs/fit/train_gd"
tensorboard_callback_gd = TensorBoard(log_dir=log_dir_gd, histogram_freq=1)

log_dir_ds= "logs/fit/train_ds"
tensorboard_callback_ds = TensorBoard(log_dir=log_dir_ds, histogram_freq=1)

In [131]:
# Split the data into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=1)  # 75% for training

# One-Hot Encode the Target Variable
y_train_categorical_gd = to_categorical(label_encoder.fit_transform(y_train['Genetic Disorder']))
y_train_categorical_ds = to_categorical(label_encoder.fit_transform(y_train['Disorder Subclass']))

# Hyperparameter search space
param_dist = {
    'batch_size': [16, 32, 64],
    'epochs': [5, 10, 15],
}

def create_nn_model(input_dim, output_dim):
    model = Sequential()
    model.add(Dense(64, input_dim=input_dim, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(output_dim, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Build and train the model for Genetic Disorder
model_gd = KerasClassifier(build_fn=create_nn_model, input_dim=X_train.shape[1], output_dim=y_train_categorical_gd.shape[1])

random_search_gd = RandomizedSearchCV(model_gd, param_distributions=param_dist, n_iter=3, cv=3, verbose=1)
random_search_gd.fit(X_train, y_train_categorical_gd)

# Get the best hyperparameters for Genetic Disorder
best_params_gd = random_search_gd.best_params_

# Build and train the model for Disorder Subclass
model_ds = KerasClassifier(build_fn=create_nn_model, input_dim=X_train.shape[1], output_dim=y_train_categorical_ds.shape[1])

random_search_ds = RandomizedSearchCV(model_ds, param_distributions=param_dist, n_iter=3, cv=3, verbose=1)
random_search_ds.fit(X_train, y_train_categorical_ds)

# Get the best hyperparameters for Disorder Subclass
best_params_ds = random_search_ds.best_params_

# Now you can use the best hyperparameters to train your final models
model_gd_final = create_nn_model(input_dim=X_train.shape[1], output_dim=y_train_categorical_gd.shape[1])
model_gd_final.fit(X_train, y_train_categorical_gd, epochs=best_params_gd['epochs'], batch_size=best_params_gd['batch_size'],callbacks=[tensorboard_callback_gd])

model_ds_final = create_nn_model(input_dim=X_train.shape[1], output_dim=y_train_categorical_ds.shape[1])
model_ds_final.fit(X_train, y_train_categorical_ds, epochs=best_params_ds['epochs'], batch_size=best_params_ds['batch_size'],callbacks=[tensorboard_callback_gd])


Fitting 3 folds for each of 3 candidates, totalling 9 fits
Epoch 1/5


  X, y = self._initialize(X, y)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  X, y = self._initialize(X, y)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5


  X, y = self._initialize(X, y)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5


  X, y = self._initialize(X, y)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5


  X, y = self._initialize(X, y)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5


  X, y = self._initialize(X, y)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/15


  X, y = self._initialize(X, y)


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


  X, y = self._initialize(X, y)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15


  X, y = self._initialize(X, y)


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/5


  X, y = self._initialize(X, y)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Epoch 1/15


  X, y = self._initialize(X, y)


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15


  X, y = self._initialize(X, y)


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15


  X, y = self._initialize(X, y)


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/5


  X, y = self._initialize(X, y)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5


  X, y = self._initialize(X, y)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5


  X, y = self._initialize(X, y)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/10


  X, y = self._initialize(X, y)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10


  X, y = self._initialize(X, y)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10


  X, y = self._initialize(X, y)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/5


  X, y = self._initialize(X, y)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x78e08c9bffa0>

In [132]:
from sklearn.metrics import classification_report, balanced_accuracy_score, f1_score

# Convert true labels to strings if they are not already
y_test_gd = y_test['Genetic Disorder']

# Predictions for Genetic Disorder
y_pred_gd = model_gd_final.predict(X_test)
y_pred_gd_categorical = y_pred_gd.argmax(axis=1)

# Print Outputs for Genetic Disorder
print('\nGenetic Disorder')
print(classification_report(y_test_gd, y_pred_gd_categorical))
print('\nBalanced accuracy:')
print(balanced_accuracy_score(y_test_gd, y_pred_gd_categorical))
print('\nMacro f1:')
print(f1_score(y_test_gd, y_pred_gd_categorical, average='macro'))
print('\nWeighted f1:')
print(f1_score(y_test_gd, y_pred_gd_categorical, average='weighted'))



Genetic Disorder
              precision    recall  f1-score   support

           0       0.61      0.83      0.71      1397
           1       0.44      0.32      0.37       273
           2       0.55      0.33      0.41      1038

    accuracy                           0.59      2708
   macro avg       0.53      0.49      0.50      2708
weighted avg       0.57      0.59      0.56      2708


Balanced accuracy:
0.49226857779040206

Macro f1:
0.4951387359342956

Weighted f1:
0.558515134485125


In [133]:

# Convert true labels to strings if they are not already
y_test_ds = y_test['Disorder Subclass']

# Predictions for Genetic Disorder
y_pred_ds = model_ds_final.predict(X_test)
y_pred_ds_categorical = y_pred_ds.argmax(axis=1)

# Print Outputs for Genetic Disorder
print('\nDisorder Subclass')
print(classification_report(y_test_ds, y_pred_ds_categorical))
print('\nBalanced accuracy:')
print(balanced_accuracy_score(y_test_ds, y_pred_ds_categorical))
print('\nMacro f1:')
print(f1_score(y_test_ds, y_pred_ds_categorical, average='macro'))
print('\nWeighted f1:')
print(f1_score(y_test_ds, y_pred_ds_categorical, average='weighted'))


Disorder Subclass
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        20
           1       0.00      0.00      0.00        13
           2       0.43      0.46      0.44       444
           3       0.38      0.28      0.32       240
           4       0.36      0.32      0.34       184
           5       0.00      0.00      0.00        86
           6       0.39      0.62      0.48       703
           7       0.34      0.36      0.35       608
           8       0.33      0.10      0.15       410

    accuracy                           0.38      2708
   macro avg       0.25      0.24      0.23      2708
weighted avg       0.36      0.38      0.35      2708


Balanced accuracy:
0.23838735760932556

Macro f1:
0.23207266265383975

Weighted f1:
0.3507185673971928


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Run the following cells to test with classes balanced

In [141]:
# Get the class labels and their counts
class_labels_gd = y['Genetic Disorder']
class_counts_gd = class_labels_gd.value_counts()
class_labels_ds = y['Disorder Subclass']
class_counts_ds = class_labels_ds.value_counts()

# Order class counts by class labels
sorted_class_counts_gd = class_counts_gd.sort_index()
sorted_class_counts_ds = class_counts_ds.sort_index()

# Calculate class weights for Genetic Disorder
total_samples_gd = len(class_labels_gd)
num_classes_gd = len(sorted_class_counts_gd)
class_weights_gd = {label: total_samples_gd / (num_classes_gd * count) for label, count in sorted_class_counts_gd.items()}
print("Class Weights Genetic Disorder:", class_weights_gd)

# Calculate class weights for Disorder Subclass
total_samples_ds = len(class_labels_ds)
num_classes_ds = len(sorted_class_counts_ds)
class_weights_ds = {label: total_samples_ds / (num_classes_ds * count) for label, count in sorted_class_counts_ds.items()}
print("Class Weights Disorder Subclass:", class_weights_ds)


Class Weights Genetic Disorder: {0: 0.6509757241279803, 1: 3.2049369561356773, 2: 0.8681868475489488}
Class Weights Disorder Subclass: {0: 15.07685881370092, 1: 22.035409035409035, 2: 0.6375905317081788, 3: 1.2130805942058211, 4: 1.6329171190734708, 5: 3.416051485898164, 6: 0.42819180487341924, 7: 0.5049665631383083, 8: 0.7845157363936707}


In [142]:
# Build and train the model for Genetic Disorder with class weights
model_gd_final = create_nn_model(input_dim=X_train.shape[1], output_dim=y_train_categorical_gd.shape[1])
model_gd_final.fit(X_train, y_train_categorical_gd, epochs=best_params_gd['epochs'], batch_size=best_params_gd['batch_size'], class_weight=class_weights_gd)

# Build and train the model for Disorder Subclass with class weights
model_ds_final = create_nn_model(input_dim=X_train.shape[1], output_dim=y_train_categorical_ds.shape[1])
model_ds_final.fit(X_train, y_train_categorical_ds, epochs=best_params_ds['epochs'], batch_size=best_params_ds['batch_size'], class_weight=class_weights_ds)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x78e0806fd360>

In [145]:
# Convert true labels to strings if they are not already
y_test_gd = y_test['Genetic Disorder']

# Predictions for Genetic Disorder
y_pred_gd = model_gd_final.predict(X_test)
y_pred_gd_categorical = y_pred_gd.argmax(axis=1)

# Print Outputs for Genetic Disorder
print('\nGenetic Disorder')
print(classification_report(y_test_gd, y_pred_gd_categorical))
print('\nBalanced accuracy:')
print(balanced_accuracy_score(y_test_gd, y_pred_gd_categorical))
print('\nMacro f1:')
print(f1_score(y_test_gd, y_pred_gd_categorical, average='macro'))
print('\nWeighted f1:')
print(f1_score(y_test_gd, y_pred_gd_categorical, average='weighted'))


Genetic Disorder
              precision    recall  f1-score   support

           0       0.68      0.67      0.68      1397
           1       0.28      0.83      0.42       273
           2       0.61      0.31      0.41      1038

    accuracy                           0.55      2708
   macro avg       0.52      0.60      0.50      2708
weighted avg       0.61      0.55      0.55      2708


Balanced accuracy:
0.6021172754594083

Macro f1:
0.5011513599501859

Weighted f1:
0.5470313960088686


In [146]:
# Convert true labels to strings if they are not already
y_test_ds = y_test['Disorder Subclass']

# Predictions for Genetic Disorder
y_pred_ds = model_ds_final.predict(X_test)
y_pred_ds_categorical = y_pred_ds.argmax(axis=1)

# Print Outputs for Genetic Disorder
print('\nDisorder Subclass')
print(classification_report(y_test_ds, y_pred_ds_categorical))
print('\nBalanced accuracy:')
print(balanced_accuracy_score(y_test_ds, y_pred_ds_categorical))
print('\nMacro f1:')
print(f1_score(y_test_ds, y_pred_ds_categorical, average='macro'))
print('\nWeighted f1:')
print(f1_score(y_test_ds, y_pred_ds_categorical, average='weighted'))


Disorder Subclass
              precision    recall  f1-score   support

           0       0.11      0.35      0.17        20
           1       0.06      0.62      0.10        13
           2       0.38      0.45      0.42       444
           3       0.24      0.12      0.16       240
           4       0.23      0.26      0.24       184
           5       0.17      0.53      0.26        86
           6       0.42      0.29      0.34       703
           7       0.40      0.33      0.36       608
           8       0.36      0.34      0.35       410

    accuracy                           0.33      2708
   macro avg       0.26      0.37      0.27      2708
weighted avg       0.36      0.33      0.33      2708


Balanced accuracy:
0.3658757110643376

Macro f1:
0.26689726119565643

Weighted f1:
0.3315668834412152


In [147]:
# Apply metrics
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score

accuracy_gd = accuracy_score(y_test_gd, y_pred_gd_categorical)
precision_gd = precision_score(y_test_gd, y_pred_gd_categorical, average='weighted')
recall_gd = recall_score(y_test_gd, y_pred_gd_categorical, average='weighted')
f1_gd = f1_score(y_test_gd, y_pred_gd_categorical, average='weighted')
balanced_acc_gd = balanced_accuracy_score(y_test_gd, y_pred_gd_categorical)


accuracy_ds = accuracy_score(y_test_ds, y_pred_ds_categorical)
precision_ds = precision_score(y_test_ds, y_pred_ds_categorical, average='weighted')
recall_ds= recall_score(y_test_ds, y_pred_ds_categorical, average='weighted')
f1_ds = f1_score(y_test_ds, y_pred_ds_categorical, average='weighted')
balanced_acc_ds = balanced_accuracy_score(y_test_ds, y_pred_ds_categorical)


# Metrics for Genetic Disorder
print("Metrics for Genetic Disorder:")
print("Accuracy: {:.3f}".format(accuracy_gd))
print("Precision: {:.3f}".format(precision_gd))
print("Recall: {:.3f}".format(recall_gd))
print("F1 Score: {:.3f}".format(f1_gd))
print("Balanced Accuracy: {:.3f}".format(balanced_acc_gd))

# Metrics for Disorder Subclass
print("\nMetrics for Disorder Subclass:")
print("Accuracy: {:.3f}".format(accuracy_ds))
print("Precision: {:.3f}".format(precision_ds))
print("Recall: {:.3f}".format(recall_ds))
print("F1 Score: {:.3f}".format(f1_ds))
print("Balanced Accuracy: {:.3f}".format(balanced_acc_ds))

Metrics for Genetic Disorder:
Accuracy: 0.548
Precision: 0.612
Recall: 0.548
F1 Score: 0.547
Balanced Accuracy: 0.602

Metrics for Disorder Subclass:
Accuracy: 0.326
Precision: 0.359
Recall: 0.326
F1 Score: 0.332
Balanced Accuracy: 0.366
