In [None]:
# Import necessary libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.svm import SVC, SVR
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [None]:
# Load dataset
# The raw frame is kept untouched so that re-running this cell always starts
# from the same data.
dataset_raw = pd.read_csv('dataset.csv')
print("Dataset Size:", dataset_raw.shape)

# Replace 'Unknown' with NaN in the two target columns so that dropna() below
# discards those rows together with every other incomplete row.
# The result is assigned back instead of calling
# `dataset[col].replace(..., inplace=True)`: an in-place method on a column
# selection is chained assignment, which warns on pandas 2.x and no longer
# updates the frame once Copy-on-Write is active.
target_columns = ['Genetic Disorder', 'Disorder Subclass']
dataset = dataset_raw.copy()
dataset[target_columns] = dataset[target_columns].replace('Unknown', np.nan)
dataset = dataset.dropna()

# Allocate features and labels
features = dataset.iloc[:, :-2]  # every column except the last two
y = dataset.iloc[:, -2:]         # labels (last two columns)

# One-hot encode the categorical features.
# drop_first drops one dummy column per categorical feature to avoid
# multicollinearity.
X = pd.get_dummies(features, drop_first=True)

# The targets are deliberately left as two label columns.
# The previous guard `if y.nunique().any() > 2:` compared a single boolean with 2,
# so it was always False and the one-hot encoding of y never ran. It is removed
# rather than "corrected" because the cells below slice y by position into two
# label columns (y.iloc[:, -2] and y.iloc[:, -1]); dummy-encoding y would break them.

# NOTE(review): the scaler is fitted on all rows before the train/validation/test
# split, so statistics of the held-out rows leak into training. Fit it on the
# training split only if an unbiased estimate is needed.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

Without removing the 'Unknown' class

In [None]:
# Split the data into training, validation and test sets

X_train, X_temp, y_train, y_temp = train_test_split(
    X_scaled, y, train_size=0.5, random_state=1
)  # training: 50%
X_validation, X_test, y_validation, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=1
)  # validation: 25%, test: 25%

# Select and train the classifier.
# MultiOutputClassifier fits one copy of the linear SVM per target column.
svm_classifier = SVC(kernel='linear', random_state=1)
clf = MultiOutputClassifier(svm_classifier)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_validation)

# Targets are addressed by position: the second-to-last column feeds the *_gd
# variables, the last column feeds the *_ds variables.
y_validation_gd = y_validation.iloc[:, -2]
y_validation_ds = y_validation.iloc[:, -1]

y_pred_gd = y_pred[:, -2]
y_pred_ds = y_pred[:, -1]

# Apply metrics (precision / recall / F1 are support-weighted averages over classes)

accuracy_gd = accuracy_score(y_validation_gd, y_pred_gd)
precision_gd = precision_score(y_validation_gd, y_pred_gd, average='weighted')
recall_gd = recall_score(y_validation_gd, y_pred_gd, average='weighted')
f1_gd = f1_score(y_validation_gd, y_pred_gd, average='weighted')

accuracy_ds = accuracy_score(y_validation_ds, y_pred_ds)
precision_ds = precision_score(y_validation_ds, y_pred_ds, average='weighted')
recall_ds = recall_score(y_validation_ds, y_pred_ds, average='weighted')
f1_ds = f1_score(y_validation_ds, y_pred_ds, average='weighted')

# The heading uses the target's real name, "Genetic Disorder"
# (it previously read "Generative Disorder").
print("Metrics for the Genetic Disorder:")
print("Accuracy:", accuracy_gd)
print("Precision:", precision_gd)
print("Recall:", recall_gd)
print("F1 Score:", f1_gd)

print("\nMetrics for the Disorder Subclass:")
print("Accuracy:", accuracy_ds)
print("Precision:", precision_ds)
print("Recall:", recall_ds)
print("F1 Score:", f1_ds)

With the 'Unknown' class removed

In [None]:
# Split the data into training, validation and test sets

X_train, X_temp, y_train, y_temp = train_test_split(
    X_scaled, y, train_size=0.5, random_state=1
)  # training: 50%
X_validation, X_test, y_validation, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=1
)  # validation: 25%, test: 25%

# Select and train the classifier.
# MultiOutputClassifier fits one copy of the linear SVM per target column.
# NOTE(review): this cell uses class_weight='balanced' with the same split and
# seed as the later class-balancing cell, so both report identical metrics.
# Confirm which configuration was intended here.
svm_classifier = SVC(kernel='linear', random_state=1, class_weight='balanced')
clf = MultiOutputClassifier(svm_classifier)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_validation)

# Targets are addressed by position: the second-to-last column feeds the *_gd
# variables, the last column feeds the *_ds variables.
y_validation_gd = y_validation.iloc[:, -2]
y_validation_ds = y_validation.iloc[:, -1]

y_pred_gd = y_pred[:, -2]
y_pred_ds = y_pred[:, -1]

# Apply metrics (precision / recall / F1 are support-weighted averages over classes)

accuracy_gd = accuracy_score(y_validation_gd, y_pred_gd)
precision_gd = precision_score(y_validation_gd, y_pred_gd, average='weighted')
recall_gd = recall_score(y_validation_gd, y_pred_gd, average='weighted')
f1_gd = f1_score(y_validation_gd, y_pred_gd, average='weighted')

accuracy_ds = accuracy_score(y_validation_ds, y_pred_ds)
precision_ds = precision_score(y_validation_ds, y_pred_ds, average='weighted')
recall_ds = recall_score(y_validation_ds, y_pred_ds, average='weighted')
f1_ds = f1_score(y_validation_ds, y_pred_ds, average='weighted')

# The heading uses the target's real name, "Genetic Disorder"
# (it previously read "Generative Disorder").
print("Metrics for the Genetic Disorder:")
print("Accuracy:", accuracy_gd)
print("Precision:", precision_gd)
print("Recall:", recall_gd)
print("F1 Score:", f1_gd)

print("\nMetrics for the Disorder Subclass:")
print("Accuracy:", accuracy_ds)
print("Precision:", precision_ds)
print("Recall:", recall_ds)
print("F1 Score:", f1_ds)

Class Distribution

In [None]:
# Genetic Disorder

# First target: the second-to-last column of y
genetic_disorder_labels = y.iloc[:, -2]

# Count the number of occurrences of each class
class_counts_gd = genetic_disorder_labels.value_counts()

# Calculate the percentage of each class
total_samples = len(genetic_disorder_labels)
class_percentages = (class_counts_gd / total_samples) * 100

# Plot the bar chart (one bar per class)
plt.figure(figsize=(10, 6))
plt.bar(class_counts_gd.index, class_counts_gd, color='pink', alpha=0.7)

# Add the percentage on top of each bar
for label, count, percentage in zip(class_counts_gd.index, class_counts_gd, class_percentages):
    plt.text(label, count, f'{percentage:.2f}%', ha='center', va='bottom')

# The title uses the target's real name (it previously read "Generative Disorder")
plt.title('Class Distribution - Genetic Disorder')
plt.xlabel('Class Labels')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.show()

In [None]:
# Disorder Subclass

# Second target: the last column of y
class_labels_last_column = y.iloc[:, -1]

# Occurrences per class and each class's share of all samples, in percent
class_counts_ds = class_labels_last_column.value_counts()
total_samples = len(class_labels_last_column)
class_percentages = (class_counts_ds / total_samples) * 100

# Bar chart of the class counts, drawn through an explicit Axes object
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(class_counts_ds.index, class_counts_ds, color='blue', alpha=0.7)

# Annotate every bar with its percentage
for label, count in class_counts_ds.items():
    percentage = class_percentages.loc[label]
    ax.text(label, count, f'{percentage:.2f}%', ha='center', va='bottom')

ax.set_title('Class Distribution - Disorder Subclass')
ax.set_xlabel('Class Labels')
ax.set_ylabel('Count')
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.show()

With the classes balanced automatically by the classifier parameter (class_weight='balanced')

In [None]:
# Split the data into training, validation and test sets

X_train, X_temp, y_train, y_temp = train_test_split(
    X_scaled, y, train_size=0.5, random_state=1
)  # training: 50%
X_validation, X_test, y_validation, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=1
)  # validation: 25%, test: 25%

# Select and train the classifier.
# class_weight='balanced' lets the SVM weight each class automatically instead
# of treating all classes equally; MultiOutputClassifier fits one copy of the
# SVM per target column.
svm_classifier = SVC(kernel='linear', random_state=1, class_weight='balanced')
clf = MultiOutputClassifier(svm_classifier)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_validation)

# Targets are addressed by position: the second-to-last column feeds the *_gd
# variables, the last column feeds the *_ds variables.
y_validation_gd = y_validation.iloc[:, -2]
y_validation_ds = y_validation.iloc[:, -1]

y_pred_gd = y_pred[:, -2]
y_pred_ds = y_pred[:, -1]

# Apply metrics (precision / recall / F1 are support-weighted averages over classes)

accuracy_gd = accuracy_score(y_validation_gd, y_pred_gd)
precision_gd = precision_score(y_validation_gd, y_pred_gd, average='weighted')
recall_gd = recall_score(y_validation_gd, y_pred_gd, average='weighted')
f1_gd = f1_score(y_validation_gd, y_pred_gd, average='weighted')

accuracy_ds = accuracy_score(y_validation_ds, y_pred_ds)
precision_ds = precision_score(y_validation_ds, y_pred_ds, average='weighted')
recall_ds = recall_score(y_validation_ds, y_pred_ds, average='weighted')
f1_ds = f1_score(y_validation_ds, y_pred_ds, average='weighted')

# The heading uses the target's real name, "Genetic Disorder"
# (it previously read "Generative Disorder").
print("Metrics for the Genetic Disorder:")
print("Accuracy:", accuracy_gd)
print("Precision:", precision_gd)
print("Recall:", recall_gd)
print("F1 Score:", f1_gd)

print("\nMetrics for the Disorder Subclass:")
print("Accuracy:", accuracy_ds)
print("Precision:", precision_ds)
print("Recall:", recall_ds)
print("F1 Score:", f1_ds)

With the 