LOGISTIC REGRESSION CLASSIFIER

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV

In [2]:
#dataset
df = pd.read_csv('dataset.csv')

label_encoder = LabelEncoder()
lrc = LogisticRegression()

REMOVE UNKNOWNS FROM TARGETS VS FROM ALL COLUMNS (RUN ONE OF THE CELLS)

Without unknown targets (possible to apply just to one of the columns if only one of them is the target)

In [3]:
# Replace 'Unknown' with NaN
df['Genetic Disorder'].replace('Unknown', np.nan, inplace=True)
df['Disorder Subclass'].replace('Unknown', np.nan, inplace=True)

df.dropna(inplace=True)

Without unknown in any column

In [None]:
# Replace 'Unknown' with NaN
df.replace('Unknown', np.nan, inplace=True)

df.dropna(inplace=True)

USE JUST LABEL ENCODER OR BOTH LABE LENCODER AND ONE-HOT ENCODER (RUN ONE OF THE CELLS)

Just label encoder

In [4]:
all_x_columns= [
    'Patient Age', "Genes in mother's side", 'Inherited from father', 'Maternal gene', 'Paternal gene',
    'Blood cell count (mcL)', "Mother's age", "Father's age", 'Status', 'Respiratory Rate (breaths/min)',
    'Heart Rate (rates/min)', 'Follow-up', 'Gender', 'Birth asphyxia',
    'Autopsy shows birth defect (if applicable)', 'Place of birth', 'Folic acid details (peri-conceptional)',
    'H/O serious maternal illness', 'H/O radiation exposure (x-ray)', 'H/O substance abuse',
    'Assisted conception IVF/ART', 'History of anomalies in previous pregnancies', 'No. of previous abortion',
    'Birth defects', 'White Blood cell count (thousand per microliter)', 'Blood test result',
    'Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4', 'Symptom 5'
]

df_encoded=df.copy()
for column in all_x_columns:
    df_encoded[column] = label_encoder.fit_transform(df_encoded[column].astype(str))

Label encoder and one-hot encoder

In [None]:
categorical_unordered_columns = [
    'Genes in mother\'s side', 'Inherited from father', 'Maternal gene', 'Paternal gene',
    'Gender', 'Birth asphyxia',
    'Autopsy shows birth defect (if applicable)', 'Place of birth',
    'Folic acid details (peri-conceptional)', 'H/O serious maternal illness',
    'H/O radiation exposure (x-ray)', 'H/O substance abuse', 'Status',
    'Assisted conception IVF/ART', 'History of anomalies in previous pregnancies',
    'Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4', 'Symptom 5'
]

quantitative_with_unknowns_or_ordered_columns = ['Patient Age', "Mother's age", "Father's age", 'Respiratory Rate (breaths/min)',
    'Heart Rate (rates/min)', 'Follow-up', 'No. of previous abortion',
    'Birth defects', 'White Blood cell count (thousand per microliter)', 'Blood test result']


df_encoded=df.copy()
for column in quantitative_with_unknowns_or_ordered_columns:
    df_encoded[column] = label_encoder.fit_transform(df_encoded[column].astype(str))
df_encoded=pd.get_dummies(df_encoded, columns=categorical_unordered_columns, drop_first=False)

With Feature Selection

In [None]:
# With feature selection

df=df[["Symptom 5", "Symptom 4", "Symptom 3", "Symptom 2", "Genes in mother's side", "Inherited from father","Genetic Disorder","Disorder Subclass"]]

df_encoded=df.copy()
df_encoded=pd.get_dummies(df_encoded, columns=["Symptom 5", "Symptom 4", "Symptom 3", "Symptom 2", "Genes in mother's side", "Inherited from father"], drop_first=False)

TRAIN MODEL TO PREDICT GENETIC DISORDER VS DISORDER SUBCLASS (RUN ONE OF THE CELLS)

Genetic Disorder

In [5]:
# Extract features and target
X = df_encoded.drop(columns=['Genetic Disorder', 'Disorder Subclass'], axis=1)
y = df_encoded['Genetic Disorder']

Disorder subclass

In [None]:
# Extract features and target
X = df_encoded.drop(columns=['Genetic Disorder', 'Disorder Subclass'], axis=1)
y = df_encoded['Disorder Subclass']

Both

In [None]:
# Extract features and target
X = df_encoded.drop(columns=['Genetic Disorder', 'Disorder Subclass'], axis=1)
y = df_encoded[['Genetic Disorder','Disorder Subclass']]

SCALE FEATURES (POSSIBLE TO SKIP)

In [6]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

BALANCE DATA - ONE TARGET (POSSIBLE TO SKIP)

In [9]:
class_counts = y.value_counts()

# Calculate class weights
total_samples = len(y)
class_weights = {label: 1/count for label, count in class_counts.items()}

# Classifier with weights
lrc = LogisticRegression(class_weight=class_weights)

BALANCE DATA - MULTITARGET (POSSIBLE TO SKIP)

In [8]:
#----------------------------------------MULTIOUTPUT------------------------------------------------------

# GENETIC DISORDER
class_counts_gd = y['Genetic Disorder'].value_counts()

# Calculate class weights
total_samples_gd = len(y['Genetic Disorder'])
class_weights_gd = {label: 1/count for label, count in class_counts_gd.items()}


# DISORDER SUBCLASS
class_counts_ds = y['Disorder Subclass'].value_counts()

# Calculate class weights
total_samples_ds = len(y['Disorder Subclass'])
class_weights_ds = {label: 1/count for label, count in class_counts_ds.items()}


class_weights=[class_weights_gd, class_weights_ds]

# Classifier with weights
lrc = LogisticRegression(class_weight=class_weights)

0         Mitochondrial genetic inheritance disorders
2        Multifactorial genetic inheritance disorders
3         Mitochondrial genetic inheritance disorders
4        Multifactorial genetic inheritance disorders
5                    Single-gene inheritance diseases
                             ...                     
22077     Mitochondrial genetic inheritance disorders
22078     Mitochondrial genetic inheritance disorders
22079    Multifactorial genetic inheritance disorders
22080     Mitochondrial genetic inheritance disorders
22082    Multifactorial genetic inheritance disorders
Name: Genetic Disorder, Length: 18047, dtype: object


KeyError: 'Genetic Disorder'

TUNING (POSSIBLE TO SKIP)

In [None]:
parameters = [    
    {'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
    'max_iter' : [100, 1000,2500, 5000]
    }
]

In [None]:

lrc = RandomizedSearchCV(estimator = lrc, param_distributions = parameters, n_iter = 100, cv = 3, verbose=2, refit=True, scoring='balanced_accuracy')

TRAIN MODEL

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, stratify=y) # training as 50%
lrc.fit(X_train, y_train)

y_pred = lrc.predict(X_test) 


Genetic Disorder


IndexingError: Too many indexers

PRINT SCORES ONE TARGET VS MULTIOUTPUT (RUN ONE OF THE CELLS)

Multioutput

In [None]:
print('\nGenetic Disorder')
print(classification_report(y_test.iloc[:,-2], y_pred[:,-2]))
print('\nBalanced accuracy:')
print(balanced_accuracy_score(y_test.iloc[:,-2], y_pred[:,-2]))
print('\nMacro f1:')
print(f1_score(y_test.iloc[:,-2], y_pred[:,-2], average='macro'))
print('\nWeighted f1:')
print(f1_score(y_test.iloc[:,-2], y_pred[:,-2], average='weighted'))

print('\nDisorder Subclass')
print(classification_report(y_test.iloc[:,-1], y_pred[:,-1]))
print('\nBalanced accuracy:')
print(balanced_accuracy_score(y_test.iloc[:,-1], y_pred[:,-1]))
print('\nMacro f1:')
print(f1_score(y_test.iloc[:,-1], y_pred[:,-1], average='macro'))
print('\nWeighted f1:')
print(f1_score(y_test.iloc[:,-1], y_pred[:,-1], average='weighted'))

One target

In [None]:
print('\nGenetic Disorder')
print(classification_report(y_test, y_pred))
print('\nBalanced accuracy:')
print(balanced_accuracy_score(y_test, y_pred))
print('\nMacro f1:')
print(f1_score(y_test, y_pred, average='macro'))
print('\nWeighted f1:')
print(f1_score(y_test, y_pred, average='weighted'))