RANDOM FOREST CLASSIFIER

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Load the dataset and create the shared helper objects used by the cells below.
df = pd.read_csv('dataset.csv')

# A single encoder instance; the encoding cells re-fit it once per column.
label_encoder = LabelEncoder()

# Default (unweighted) forest; the "BALANCE DATA" cells rebind `rf` when they are run.
# A fixed random_state makes repeated runs grow the same trees.
rf = RandomForestClassifier(random_state=42)

REMOVE UNKNOWNS FROM TARGETS VS FROM ALL COLUMNS (RUN ONE OF THE CELLS)

Without unknown targets (possible to apply just to one of the columns if only one of them is the target)

In [3]:
# Treat 'Unknown' in either target column as a missing value.
# The result is assigned back to `df` rather than calling
# `df[col].replace(..., inplace=True)`: that form mutates a temporary Series
# (chained assignment) and no longer updates `df` under pandas Copy-on-Write.
target_columns = ['Genetic Disorder', 'Disorder Subclass']
df[target_columns] = df[target_columns].replace('Unknown', np.nan)

# Drop every row that still contains a NaN (checked across all columns, as before).
df = df.dropna()

Without unknowns in any column

In [None]:
# Turn every 'Unknown' cell (in any column) into NaN, then discard all rows
# that contain at least one NaN.
df = df.replace(to_replace='Unknown', value=np.nan).dropna(axis=0, how='any')

USE JUST LABEL ENCODER OR BOTH LABEL ENCODER AND ONE-HOT ENCODER (RUN ONE OF THE CELLS)

Just label encoder

In [4]:
# Every feature column; in this variant all of them are label-encoded.
all_x_columns = [
    'Patient Age',
    "Genes in mother's side",
    'Inherited from father',
    'Maternal gene',
    'Paternal gene',
    'Blood cell count (mcL)',
    "Mother's age",
    "Father's age",
    'Status',
    'Respiratory Rate (breaths/min)',
    'Heart Rate (rates/min)',
    'Follow-up',
    'Gender',
    'Birth asphyxia',
    'Autopsy shows birth defect (if applicable)',
    'Place of birth',
    'Folic acid details (peri-conceptional)',
    'H/O serious maternal illness',
    'H/O radiation exposure (x-ray)',
    'H/O substance abuse',
    'Assisted conception IVF/ART',
    'History of anomalies in previous pregnancies',
    'No. of previous abortion',
    'Birth defects',
    'White Blood cell count (thousand per microliter)',
    'Blood test result',
    *[f'Symptom {number}' for number in range(1, 6)],  # 'Symptom 1' ... 'Symptom 5'
]

# Encode on a copy so that `df` keeps the original values.
df_encoded = df.copy()
for column in all_x_columns:
    # Values are cast to str before the shared encoder is re-fitted on this column,
    # so each distinct string gets one integer code.
    column_as_text = df_encoded[column].astype(str)
    df_encoded[column] = label_encoder.fit_transform(column_as_text)

Label encoder and one-hot encoder

In [None]:
# Columns that are one-hot encoded in this variant.
categorical_unordered_columns = [
    "Genes in mother's side",
    'Inherited from father',
    'Maternal gene',
    'Paternal gene',
    'Gender',
    'Birth asphyxia',
    'Autopsy shows birth defect (if applicable)',
    'Place of birth',
    'Folic acid details (peri-conceptional)',
    'H/O serious maternal illness',
    'H/O radiation exposure (x-ray)',
    'H/O substance abuse',
    'Status',
    'Assisted conception IVF/ART',
    'History of anomalies in previous pregnancies',
    *[f'Symptom {number}' for number in range(1, 6)],  # 'Symptom 1' ... 'Symptom 5'
]

# Columns that are label-encoded in this variant.
quantitative_with_unknowns_or_ordered_columns = [
    'Patient Age',
    "Mother's age",
    "Father's age",
    'Respiratory Rate (breaths/min)',
    'Heart Rate (rates/min)',
    'Follow-up',
    'No. of previous abortion',
    'Birth defects',
    'White Blood cell count (thousand per microliter)',
    'Blood test result',
]

# Encode on a copy so that `df` keeps the original values.
df_encoded = df.copy()
for column in quantitative_with_unknowns_or_ordered_columns:
    # Values are cast to str before the shared encoder is re-fitted on this column.
    column_as_text = df_encoded[column].astype(str)
    df_encoded[column] = label_encoder.fit_transform(column_as_text)

# Expand each listed categorical column into one indicator column per category
# (no category is dropped).
df_encoded = pd.get_dummies(
    df_encoded,
    columns=categorical_unordered_columns,
    drop_first=False,
)

With Feature Selection

In [None]:
# With feature selection: keep six feature columns plus the two targets.
selected_features = [
    "Symptom 5",
    "Symptom 4",
    "Symptom 3",
    "Symptom 2",
    "Genes in mother's side",
    "Inherited from father",
]
target_columns = ["Genetic Disorder", "Disorder Subclass"]

# Note: this rebinds `df` itself to the reduced frame.
df = df[selected_features + target_columns]

# One-hot encode every selected feature on a copy; the targets are left as they are.
df_encoded = pd.get_dummies(df.copy(), columns=selected_features, drop_first=False)

TRAIN MODEL TO PREDICT GENETIC DISORDER VS DISORDER SUBCLASS (RUN ONE OF THE CELLS)

Genetic Disorder

In [5]:
# Features are every encoded column except the two targets;
# the label for this variant is the 'Genetic Disorder' column.
target_columns = ['Genetic Disorder', 'Disorder Subclass']
X = df_encoded.drop(columns=target_columns)
y = df_encoded['Genetic Disorder']

Disorder subclass

In [None]:
# Features are every encoded column except the two targets;
# the label for this variant is the 'Disorder Subclass' column.
target_columns = ['Genetic Disorder', 'Disorder Subclass']
X = df_encoded.drop(columns=target_columns)
y = df_encoded['Disorder Subclass']

Both

In [None]:
# Features are every encoded column except the two targets;
# for the multi-output variant `y` is a two-column frame holding both targets.
target_columns = ['Genetic Disorder', 'Disorder Subclass']
X = df_encoded.drop(columns=target_columns)
y = df_encoded[target_columns]

SCALE FEATURES (POSSIBLE TO SKIP)

In [6]:
# Standardise the feature matrix; `X` is rebound to the transformed values.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split
# done in the "TRAIN MODEL" cell, so test rows contribute to the fitted statistics.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

BALANCE DATA - ONE TARGET (POSSIBLE TO SKIP)

In [7]:
from sklearn.utils.class_weight import compute_class_weight

# "balanced" class weights computed from the label distribution of `y`.
# The sorted unique labels are computed once and reused for both the weight
# computation and the label -> weight mapping, so the two always line up.
classes = np.unique(y)
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=y,
)
class_weights_dict = dict(zip(classes, class_weights))

# Classifier with weights; a fixed random_state makes repeated runs reproducible.
rf = RandomForestClassifier(class_weight=class_weights_dict, random_state=42)

{'Mitochondrial genetic inheritance disorders': 0.6509757241279803, 'Multifactorial genetic inheritance disorders': 3.2049369561356773, 'Single-gene inheritance diseases': 0.8681868475489488}


BALANCE DATA - MULTITARGET (POSSIBLE TO SKIP)

In [None]:
#----------------------------------------MULTIOUTPUT------------------------------------------------------

from sklearn.utils.class_weight import compute_class_weight


def _balanced_class_weights(target):
    """Return a ``{label: weight}`` dict of "balanced" class weights for one target column."""
    labels = np.unique(target)
    weights = compute_class_weight(class_weight="balanced", classes=labels, y=target)
    return dict(zip(labels, weights))


# GENETIC DISORDER
class_weights_gd = _balanced_class_weights(y['Genetic Disorder'])

# DISORDER SUBCLASS
class_weights_ds = _balanced_class_weights(y['Disorder Subclass'])

# One dict per output, listed in the same order as the two target columns are
# accessed above (Genetic Disorder first, Disorder Subclass second).
class_weights = [class_weights_gd, class_weights_ds]

# Classifier with weights; a fixed random_state makes repeated runs reproducible.
rf = RandomForestClassifier(class_weight=class_weights, random_state=42)

TUNING (POSSIBLE TO SKIP)

In [None]:
# Number of trees in random forest: 200, 400, ..., 2000
n_estimators = list(range(200, 2001, 200))
# Number of features to consider at every split.
# 'auto' is no longer accepted by scikit-learn (removed in 1.3) and, for classifiers,
# was only an alias of 'sqrt'; 'log2' is used as the second distinct option instead.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree: 10, 20, ..., 110, plus None (unlimited depth)
max_depth = list(range(10, 111, 10))
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
parameters = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}

In [None]:

# `rf` is rebound from a plain forest to a search object wrapping it. If this cell is
# run a second time `rf` is already a search, so unwrap it first instead of nesting
# one search inside another.
base_estimator = rf.estimator if isinstance(rf, RandomizedSearchCV) else rf

rf = RandomizedSearchCV(
    estimator=base_estimator,
    param_distributions=parameters,
    n_iter=100,
    cv=3,
    verbose=2,
    refit=True,  # after the search, refit the best candidate so `rf` can predict
    scoring='balanced_accuracy',
    random_state=42,  # sample the same candidates on every run
)

TRAIN MODEL

In [8]:
# Use 75% of the rows for training and the rest for evaluation, stratified on `y`.
# A fixed random_state makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, stratify=y, random_state=42
)
rf.fit(X_train, y_train)

# Predictions for the held-out rows, consumed by the score-printing cells.
y_pred = rf.predict(X_test)

PRINT SCORES MULTIOUTPUT VS ONE TARGET (RUN ONE OF THE CELLS)

Multioutput

In [None]:
# Print the same set of metrics for each of the two targets. Columns of `y_test`
# and `y_pred` are addressed from the end: -2 is reported as Genetic Disorder,
# -1 as Disorder Subclass.
for heading, position in (('\nGenetic Disorder', -2), ('\nDisorder Subclass', -1)):
    actual = y_test.iloc[:, position]
    predicted = y_pred[:, position]

    print(heading)
    print(classification_report(actual, predicted))
    print('\nBalanced accuracy:')
    print(balanced_accuracy_score(actual, predicted))
    for label, averaging in (('\nMacro f1:', 'macro'), ('\nWeighted f1:', 'weighted')):
        print(label)
        print(f1_score(actual, predicted, average=averaging))

    # Blank line between the two reports, matching the original layout.
    if position == -2:
        print('', end='')

One target

In [9]:
# Title the report with the target that was actually trained on. `y_test` keeps the
# name of the selected column when `y` is a pandas Series; fall back to the original
# fixed title when no name is available.
target_name = getattr(y_test, 'name', None) or 'Genetic Disorder'

print('\n' + str(target_name))
print(classification_report(y_test, y_pred))
print('\nBalanced accuracy:')
print(balanced_accuracy_score(y_test, y_pred))
print('\nMacro f1:')
print(f1_score(y_test, y_pred, average='macro'))
print('\nWeighted f1:')
print(f1_score(y_test, y_pred, average='weighted'))


Genetic Disorder
                                              precision    recall  f1-score   support

 Mitochondrial genetic inheritance disorders       0.59      0.87      0.71      2311
Multifactorial genetic inheritance disorders       0.56      0.18      0.27       469
            Single-gene inheritance diseases       0.56      0.31      0.40      1732

                                    accuracy                           0.59      4512
                                   macro avg       0.57      0.45      0.46      4512
                                weighted avg       0.58      0.59      0.54      4512


Balanced accuracy:
0.4546137124661332

Macro f1:
0.45930000087295636

Weighted f1:
0.5445297424891591
