In [54]:
# Import necessary libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.svm import SVC, SVR
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report

In [55]:
# Load dataset
df = pd.read_csv('dataset.csv')
print("Dataset Size:", df.shape)

# Label encoder
label_encoder = LabelEncoder()

# Replace 'Unknown' with NaN
df['Genetic Disorder'].replace('Unknown', np.nan, inplace=True)
df['Disorder Subclass'].replace('Unknown', np.nan, inplace=True)

df.dropna(inplace=True)

Dataset Size: (22083, 33)


In [56]:
categorical_unordered_columns = [
    'Genes in mother\'s side', 'Inherited from father', 'Maternal gene', 'Paternal gene',
    'Gender', 'Birth asphyxia',
    'Autopsy shows birth defect (if applicable)', 'Place of birth',
    'Folic acid details (peri-conceptional)', 'H/O serious maternal illness',
    'H/O radiation exposure (xUnknownray)', 'H/O substance abuse', 'Status',
    'Assisted conception IVF/ART', 'History of anomalies in previous pregnancies',
    'Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4', 'Symptom 5'
]

quantitative_with_unknowns_or_ordered_columns = ['Patient Age', "Mother's age", "Father's age", 'Respiratory Rate (breaths/min)',
    'Heart Rate (rates/min)', 'Follow-up', 'No. of previous abortion',
    'Birth defects', 'White Blood cell count (thousand per microliter)', 'Blood test result']


df_encoded=df.copy()
for column in quantitative_with_unknowns_or_ordered_columns:
    df_encoded[column] = label_encoder.fit_transform(df_encoded[column].astype(str))
df_encoded=pd.get_dummies(df_encoded, columns=categorical_unordered_columns, drop_first=False)

Run this following cell if you want to use features selection:

In [45]:
# With feature selection


df=df[["Symptom 5", "Symptom 4", "Symptom 3", "Symptom 2", "Genes in mother's side", "Inherited from father","Genetic Disorder","Disorder Subclass"]]

df_encoded=df.copy()
df_encoded=pd.get_dummies(df_encoded, columns=["Symptom 5", "Symptom 4", "Symptom 3", "Symptom 2", "Genes in mother's side", "Inherited from father"], drop_first=False)

In [57]:
# Extract features and target
X = df_encoded.drop(columns=['Genetic Disorder', 'Disorder Subclass'], axis=1)
y = df_encoded[['Genetic Disorder','Disorder Subclass']]

In [58]:
# Check if the target variable has more than 2 classes
if y['Genetic Disorder'].nunique() > 2:
    label_encoder_gd = LabelEncoder()
    y['Genetic Disorder'] = label_encoder_gd.fit_transform(y['Genetic Disorder'])

if y['Disorder Subclass'].nunique() > 2:
    label_encoder_ds = LabelEncoder()
    y['Disorder Subclass'] = label_encoder_ds.fit_transform(y['Disorder Subclass'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['Genetic Disorder'] = label_encoder_gd.fit_transform(y['Genetic Disorder'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['Disorder Subclass'] = label_encoder_ds.fit_transform(y['Disorder Subclass'])


In [59]:
# Scale features

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [60]:
# Split the data into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=1)  # 75% for training


class weights manually balanced

In [None]:
# Get the class labels and their counts
class_labels_gd = y['Genetic Disorder']
class_counts_gd = class_labels_gd.value_counts()
class_labels_ds = y['Disorder Subclass']
class_counts_ds = class_labels_ds.value_counts()

# Order class counts by class labels
sorted_class_counts_gd = class_counts_gd.sort_index()
sorted_class_counts_ds = class_counts_ds.sort_index()

# Calculate class weights for Genetic Disorder
total_samples_gd = len(class_labels_gd)
num_classes_gd = len(sorted_class_counts_gd)
class_weights_gd = {label: total_samples_gd / (num_classes_gd * count) for label, count in sorted_class_counts_gd.items()}
print("Class Weights Genetic Disorder:", class_weights_gd)

# Calculate class weights for Disorder Subclass
total_samples_ds = len(class_labels_ds)
num_classes_ds = len(sorted_class_counts_ds)
class_weights_ds = {label: total_samples_ds / (num_classes_ds * count) for label, count in sorted_class_counts_ds.items()}
print("Class Weights Disorder Subclass:", class_weights_ds)

class weights automatically balanced:

In [61]:
# Select and train model of classifier
svm_classifier = SVC(kernel='linear', random_state=1, class_weight='balanced')

# Create MultiOutputClassifier
clf = MultiOutputClassifier(svm_classifier)

# Training Genetic Disorder
clf.fit(X_train, y_train[['Genetic Disorder']])
y_pred_gd = clf.predict(X_test)

# Training Disorder Subclass
clf.fit(X_train, y_train[['Disorder Subclass']])
y_pred_ds = clf.predict(X_test)


In [62]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score

# Metrics for Genetic Disorder
accuracy_gd = accuracy_score(y_test['Genetic Disorder'], y_pred_gd)
precision_gd = precision_score(y_test['Genetic Disorder'], y_pred_gd, average='weighted')
recall_gd = recall_score(y_test['Genetic Disorder'], y_pred_gd, average='weighted')
f1_gd = f1_score(y_test['Genetic Disorder'], y_pred_gd, average='weighted')
balanced_acc_gd = balanced_accuracy_score(y_test['Genetic Disorder'], y_pred_gd)

# Metrics for Disorder Subclass
accuracy_ds = accuracy_score(y_test['Disorder Subclass'], y_pred_ds)
precision_ds = precision_score(y_test['Disorder Subclass'], y_pred_ds, average='weighted')
recall_ds = recall_score(y_test['Disorder Subclass'], y_pred_ds, average='weighted')
f1_ds = f1_score(y_test['Disorder Subclass'], y_pred_ds, average='weighted')
balanced_acc_ds = balanced_accuracy_score(y_test['Disorder Subclass'], y_pred_ds)

# Print metrics for Genetic Disorder
print("Metrics for Genetic Disorder:")
print("Accuracy: {:.3f}".format(accuracy_gd))
print("Precision: {:.3f}".format(precision_gd))
print("Recall: {:.3f}".format(recall_gd))
print("F1 Score: {:.3f}".format(f1_gd))
print("Balanced Accuracy: {:.3f}".format(balanced_acc_gd))

# Print metrics for Disorder Subclass
print("\nMetrics for Disorder Subclass:")
print("Accuracy: {:.3f}".format(accuracy_ds))
print("Precision: {:.3f}".format(precision_ds))
print("Recall: {:.3f}".format(recall_ds))
print("F1 Score: {:.3f}".format(f1_ds))
print("Balanced Accuracy: {:.3f}".format(balanced_acc_ds))


Metrics for Genetic Disorder:
Accuracy: 0.516
Precision: 0.266
Recall: 0.516
F1 Score: 0.351
Balanced Accuracy: 0.333

Metrics for Disorder Subclass:
Accuracy: 0.421
Precision: 0.393
Recall: 0.421
F1 Score: 0.387
Balanced Accuracy: 0.251


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [63]:
from sklearn.metrics import classification_report, balanced_accuracy_score, f1_score

# Print Outputs for Disorder Subclass
print('\nDisorder Subclass')
print(classification_report(y_test['Disorder Subclass'], y_pred_ds))
print('\nBalanced accuracy:')
print(balanced_accuracy_score(y_test['Disorder Subclass'], y_pred_ds))
print('\nMacro f1:')
print(f1_score(y_test['Disorder Subclass'], y_pred_ds, average='macro'))
print('\nWeighted f1:')
print(f1_score(y_test['Disorder Subclass'], y_pred_ds, average='weighted'))



Disorder Subclass
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        20
           1       0.00      0.00      0.00        13
           2       0.41      0.68      0.51       444
           3       0.27      0.03      0.06       240
           4       0.49      0.22      0.31       184
           5       0.00      0.00      0.00        86
           6       0.44      0.58      0.50       703
           7       0.40      0.38      0.39       608
           8       0.42      0.36      0.39       410

    accuracy                           0.42      2708
   macro avg       0.27      0.25      0.24      2708
weighted avg       0.39      0.42      0.39      2708


Balanced accuracy:
0.2510783796158719

Macro f1:
0.2398498480353163

Weighted f1:
0.3871360837967648


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [65]:
# Print Outputs for Genetic Disorder
print('\nGenetic Disorder')
print(classification_report(y_test['Genetic Disorder'], y_pred_gd))
print('\nBalanced accuracy:')
print(balanced_accuracy_score(y_test['Genetic Disorder'], y_pred_gd))
print('\nMacro f1:')
print(f1_score(y_test['Genetic Disorder'], y_pred_gd, average='macro'))
print('\nWeighted f1:')
print(f1_score(y_test['Genetic Disorder'], y_pred_gd, average='weighted'))


Genetic Disorder
              precision    recall  f1-score   support

           0       0.52      1.00      0.68      1397
           1       0.00      0.00      0.00       273
           2       0.00      0.00      0.00      1038

    accuracy                           0.52      2708
   macro avg       0.17      0.33      0.23      2708
weighted avg       0.27      0.52      0.35      2708


Balanced accuracy:
0.3333333333333333

Macro f1:
0.22687779131140884

Weighted f1:
0.35112438086636427


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
