In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd  # type: ignore

import numpy as np # type: ignore

from imblearn.combine import SMOTETomek # type: ignore
from imblearn.under_sampling import TomekLinks # type: ignore

from sklearn.model_selection import train_test_split # type: ignore
from sklearn.preprocessing import StandardScaler # type: ignore

import torch # type: ignore
import torch.nn as nn # type: ignore
import torch.nn.functional as F 
import torch.optim as optim # type: ignore

from sklearn.metrics import f1_score # type: ignore

import optuna # type: ignore

Logistic regression model with no data feature scaling or data imbalance handling

In [2]:
asd_sm = pd.read_csv('asd_sm.csv')
kd_sm = pd.read_csv('kd_sm.csv')
normal_sm = pd.read_csv('normal_sm.csv')
rhd_sm = pd.read_csv('rhd_sm.csv')

normal_sm['state'] = 0
asd_sm['state'] = 1
kd_sm['state'] = 2
rhd_sm['state'] = 3

combined_df = pd.concat([normal_sm, asd_sm, kd_sm, rhd_sm], axis=0)

print('normal_sm:', round(combined_df['state'].value_counts()[0]/len(combined_df) * 100,2), '% of the dataset')
print('asd_sm:', round(combined_df['state'].value_counts()[1]/len(combined_df) * 100,2), '% of the dataset')
print('kd_sm:', round(combined_df['state'].value_counts()[2]/len(combined_df) * 100,2), '% of the dataset')
print('rhd_sm:', round(combined_df['state'].value_counts()[3]/len(combined_df) * 100,2), '% of the dataset')

raw_data = combined_df.copy()
raw_data.drop_duplicates(inplace=True)

normal_sm: 96.83 % of the dataset
asd_sm: 1.43 % of the dataset
kd_sm: 0.64 % of the dataset
rhd_sm: 1.1 % of the dataset


In [3]:
X = raw_data.drop('state', axis=1)
y = raw_data['state']
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size = 0.25, random_state = 42)

class LogisticRegressionModel(nn.Module):
    def __init__(self, n_features, n_classes):
        super().__init__()
        self.input_layer = nn.Linear(n_features, n_classes)

    def forward(self, x):
        return self.input_layer(x) 

X_train = torch.FloatTensor(X_train.to_numpy())
y_train = torch.LongTensor(y_train.to_numpy().astype(int))  

X_validation = torch.FloatTensor(X_validation.to_numpy())
y_validation = torch.LongTensor(y_validation.to_numpy().astype(int))  

model = LogisticRegressionModel(n_features=170, n_classes=4)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
epoch_count = 1000

for epoch in range(epoch_count):
    model.train()
    optimizer.zero_grad()
    y_train_prediction = model(X_train)
    training_loss = criterion(y_train_prediction, y_train)
    training_loss.backward()
    optimizer.step()

model.eval()
y_train_prediction = torch.argmax(y_train_prediction, dim=1)

with torch.no_grad():
    y_validation_prediction = model(X_validation)
    y_validation_prediction = torch.argmax(y_validation_prediction, dim=1)
    
Training_F1 = f1_score(y_train.detach().numpy(), y_train_prediction.detach().numpy(), average='macro')
Validation_F1 = f1_score(y_validation.detach().numpy(), y_validation_prediction.detach().numpy(), average='macro')


print(f'Model With No Data Scaling or Adjustments -> Training F1 Score: {Training_F1} | Validation F1 Score: {Validation_F1}')

Model With No Data Scaling or Adjustments -> Training F1 Score: 0.4393785730496816 | Validation F1 Score: 0.36660724503361686


Test Set Validation

In [4]:
# Test_Set = pd.read_csv('test_all.csv').copy()

# X = Test_Set.drop('state', axis=1)
# y = Test_Set['state']

# mapping = {
#     'normal': 0,
#     'asd': 1,
#     'kd': 2,
#     'rhd': 3
# }

# y = y.replace(mapping)

# X_test = torch.FloatTensor(X.to_numpy())
# y_test = torch.LongTensor(y.to_numpy().astype(int)) 

# with torch.no_grad():
#     y_test_prediction = model(X_test)
#     y_test_prediction = torch.argmax(y_test_prediction, dim=1)
    

# Test_F1 = f1_score(y_test.detach().numpy(), y_test_prediction.detach().numpy(), average='macro')

# print(f'Test Set F1 Score: {Test_F1}')

# y_test_prediction = pd.DataFrame(y_test_prediction, columns=['prediction'])

# # Now saving it to a CSV file
# y_test_prediction.to_csv('Logistic_Regression_Answer.csv', index=False)

Test Set F1 Score: 0.38886370524134256


Logistic regression model with data feature scaling and data imbalance handling

In [5]:
X = raw_data.drop('state', axis=1)
y = raw_data['state']
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size = 0.25, random_state = 42)

def Standard_Scaler (df, col_names):
    features = df[col_names]
    scaler = StandardScaler().fit(features.values)
    features = scaler.transform(features.values)
    df[col_names] = features
    
    return df

col_names = X.columns

X_train = Standard_Scaler(X_train, col_names)
X_validation = Standard_Scaler(X_validation, col_names)

resample = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))

X_train_resampled, y_train_resampled = resample.fit_resample(X_train, y_train)

print('normal_sm:', round(y_train_resampled.value_counts()[0]/len(y_train_resampled) * 100,2), '% of the dataset')
print('asd_sm:', round(y_train_resampled.value_counts()[1]/len(y_train_resampled) * 100,2), '% of the dataset')
print('kd_sm:', round(y_train_resampled.value_counts()[2]/len(y_train_resampled) * 100,2), '% of the dataset')
print('rhd_sm:', round(y_train_resampled.value_counts()[3]/len(y_train_resampled) * 100,2), '% of the dataset')

normal_sm: 25.0 % of the dataset
asd_sm: 25.0 % of the dataset
kd_sm: 25.0 % of the dataset
rhd_sm: 25.0 % of the dataset


In [6]:
class LogisticRegressionModel(nn.Module):
    def __init__(self, n_features, n_classes):
        super().__init__()
        self.input_layer = nn.Linear(n_features, n_classes)

    def forward(self, x):
        return self.input_layer(x) 

X_train_resampled = torch.FloatTensor(X_train_resampled.to_numpy())
y_train_resampled = torch.LongTensor(y_train_resampled.to_numpy().astype(int))

X_validation = torch.FloatTensor(X_validation.to_numpy())
y_validation = torch.LongTensor(y_validation.to_numpy().astype(int))  

model = LogisticRegressionModel(n_features=170, n_classes=4)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
epoch_count = 1000

for epoch in range(epoch_count):
    model.train()
    optimizer.zero_grad()
    y_train_prediction = model(X_train_resampled)
    training_loss = criterion(y_train_prediction, y_train_resampled)
    training_loss.backward()
    optimizer.step()

model.eval()
y_train_prediction = torch.argmax(y_train_prediction, dim=1)

with torch.no_grad():
    y_validation_prediction = model(X_validation)
    y_validation_prediction = torch.argmax(y_validation_prediction, dim=1)
    
Training_F1 = f1_score(y_train_resampled.detach().numpy(), y_train_prediction.detach().numpy(), average='macro')
Validation_F1 = f1_score(y_validation.detach().numpy(), y_validation_prediction.detach().numpy(), average='macro')

print(f'Model With Data Scaling and Adjustments -> Training F1 Score: {Training_F1} | Validation F1 Score: {Validation_F1}')

Model With Data Scaling and Adjustments -> Training F1 Score: 0.8251489176416756 | Validation F1 Score: 0.26127799267646457


Logistic Regression Model With Optuna

In [7]:
X = raw_data.drop('state', axis=1)
y = raw_data['state']

X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size = 0.25, random_state = 42)

def Standard_Scaler (df, col_names):
    features = df[col_names]
    scaler = StandardScaler().fit(features.values)
    features = scaler.transform(features.values)
    df[col_names] = features
    
    return df

col_names = X.columns

X_train = Standard_Scaler(X_train, col_names)
X_validation = Standard_Scaler(X_validation, col_names)

resample = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))

X_train_resampled, y_train_resampled = resample.fit_resample(X_train, y_train)

print('normal_sm:', round(y_train_resampled.value_counts()[0]/len(y_train_resampled) * 100,2), '% of the dataset')
print('asd_sm:', round(y_train_resampled.value_counts()[1]/len(y_train_resampled) * 100,2), '% of the dataset')
print('kd_sm:', round(y_train_resampled.value_counts()[2]/len(y_train_resampled) * 100,2), '% of the dataset')
print('rhd_sm:', round(y_train_resampled.value_counts()[3]/len(y_train_resampled) * 100,2), '% of the dataset')

normal_sm: 25.0 % of the dataset
asd_sm: 25.0 % of the dataset
kd_sm: 25.0 % of the dataset
rhd_sm: 25.0 % of the dataset


Logistic Regression With Hyperparameter Tuning Using Optuna

In [8]:
class LogisticRegressionModel(nn.Module):
    def __init__(self, n_features, n_classes):
        super().__init__()
        self.input_layer = nn.Linear(n_features, n_classes)

    def forward(self, x):
        return self.input_layer(x) 
    

X_train_resampled_tensor = torch.FloatTensor(X_train_resampled.to_numpy())
y_train_resampled_tensor = torch.LongTensor(y_train_resampled.to_numpy().astype(int))  

X_validation_tensor = torch.FloatTensor(X_validation.to_numpy())
y_validation_tensor = torch.LongTensor(y_validation.to_numpy().astype(int))  

def create_objective(X_train, y_train, X_val, y_val):

    def objective(trial):

        optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "SGD"])
        lr = trial.suggest_float("lr", 1e-3, 1e-1, log=True)
        n_epochs = trial.suggest_int("n_epochs", 1000, 5000)

        model = LogisticRegressionModel(n_features=170, n_classes=4)
        criterion = nn.CrossEntropyLoss()
        
        if optimizer_name == "Adam":
            optimizer = optim.Adam(model.parameters(), lr=lr)
        else:
            optimizer = optim.SGD(model.parameters(), lr=lr)

        for epoch in range(n_epochs):
            model.train()
            optimizer.zero_grad()
            outputs = model(X_train)
            loss = criterion(outputs, y_train)
            loss.backward()
            optimizer.step()

        model.eval()

        with torch.no_grad():
            outputs = model(X_val)
            predictions = torch.argmax(outputs, dim=1)
            val_f1 = f1_score(y_val.cpu().numpy(), predictions.cpu().numpy(), average='macro')

        return val_f1
    
    return objective

# Now create the objective function using the closure
objective = create_objective(X_train_resampled_tensor, y_train_resampled_tensor, X_validation_tensor, y_validation_tensor)

# Create and run the Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2024-05-10 21:39:11,185] A new study created in memory with name: no-name-04a00203-229f-404a-853d-4699e9dd2bbe
[I 2024-05-10 21:39:22,214] Trial 0 finished with value: 0.28009480425865907 and parameters: {'optimizer': 'Adam', 'lr': 0.0017244961007919966, 'n_epochs': 1112}. Best is trial 0 with value: 0.28009480425865907.
[I 2024-05-10 21:40:04,608] Trial 1 finished with value: 0.29792750562941483 and parameters: {'optimizer': 'Adam', 'lr': 0.00648377787571706, 'n_epochs': 4285}. Best is trial 1 with value: 0.29792750562941483.
[I 2024-05-10 21:40:17,646] Trial 2 finished with value: 0.2951235166972829 and parameters: {'optimizer': 'Adam', 'lr': 0.021438172263667975, 'n_epochs': 1352}. Best is trial 1 with value: 0.29792750562941483.
[I 2024-05-10 21:40:45,599] Trial 3 finished with value: 0.28960071249117464 and parameters: {'optimizer': 'SGD', 'lr': 0.048977897777363576, 'n_epochs': 2952}. Best is trial 1 with value: 0.29792750562941483.
[I 2024-05-10 21:41:05,798] Trial 4 finished

Best trial:
  Value: 0.29904440011145234
  Params: 
    optimizer: Adam
    lr: 0.016035075295773086
    n_epochs: 4253
