In [7]:
import pandas as pd
import numpy as np

from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.ensemble import GradientBoostingClassifier

import optuna

Gradient Boosted Classifier with no data feature scaling or data imbalance handling

In [8]:
asd_sm = pd.read_csv('asd_sm.csv')
kd_sm = pd.read_csv('kd_sm.csv')
normal_sm = pd.read_csv('normal_sm.csv')
rhd_sm = pd.read_csv('rhd_sm.csv')

normal_sm['state'] = 0
asd_sm['state'] = 1
kd_sm['state'] = 2
rhd_sm['state'] = 3

combined_df = pd.concat([normal_sm, asd_sm, kd_sm, rhd_sm], axis=0)

print('normal_sm:', round(combined_df['state'].value_counts()[0]/len(combined_df) * 100,2), '% of the dataset')
print('asd_sm:', round(combined_df['state'].value_counts()[1]/len(combined_df) * 100,2), '% of the dataset')
print('kd_sm:', round(combined_df['state'].value_counts()[2]/len(combined_df) * 100,2), '% of the dataset')
print('rhd_sm:', round(combined_df['state'].value_counts()[3]/len(combined_df) * 100,2), '% of the dataset')

raw_data = combined_df.copy()
raw_data.drop_duplicates(inplace=True)

X = raw_data.drop('state', axis=1)
y = raw_data['state']

X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size = 0.25, random_state = 42)

normal_sm: 96.83 % of the dataset
asd_sm: 1.43 % of the dataset
kd_sm: 0.64 % of the dataset
rhd_sm: 1.1 % of the dataset


In [9]:
from sklearn.ensemble import GradientBoostingClassifier

clf_GBC = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
clf_GBC.fit(X_train, y_train)
training_output = clf_GBC.predict(X_train)
training_f1 = f1_score(y_train, training_output, average='macro')

Validation_Output = clf_GBC.predict(X_validation)
Validation_f1 = f1_score(y_validation, Validation_Output, average='macro')

print('Gradient Boosted Classifier with no data feature scaling or data imbalance handling')
print('-' * 50)
print(f'Gradient Boosted Classifier: Training F1 Score: {training_f1} | Validation F1 Score: {Validation_f1}')

Gradient Boosted Classifier with no data feature scaling or data imbalance handling
--------------------------------------------------
Gradient Boosted Classifier: Training F1 Score: 0.41018967518361493 | Validation F1 Score: 0.3734732918011756


In [10]:
# pd.set_option('future.no_silent_downcasting', True)
# Test_Set = pd.read_csv('test_all.csv').copy()

# X = Test_Set.drop('state', axis=1)
# y = Test_Set['state']

# mapping = {
#     'normal': 0,
#     'asd': 1,
#     'kd': 2,
#     'rhd': 3
# }

# y = y.replace(mapping)
# y.fillna(0, inplace=True)
# y = y.astype(int)

# def Standard_Scaler (df, col_names):
#     features = df[col_names]
#     scaler = StandardScaler().fit(features.values)
#     features = scaler.transform(features.values)
#     df[col_names] = features
    
#     return df

# col_names = X.columns

# X_test = Standard_Scaler(X, col_names)

# # Assuming clf_GBC is already trained and available
# test_output = clf_GBC.predict(X_test)
# test_f1 = f1_score(y, test_output, average='macro')

# print(f'Test Set F1 Score: {test_f1}')

# y_test_prediction = pd.DataFrame(test_output, columns=['prediction'])

# # Save predictions to a CSV file
# y_test_prediction.to_csv('Gradient_Boosting_Answer.csv', index=False)

Test Set F1 Score: 0.24600159052752496


Gradient Boosted Classifier with data feature scaling and data imbalance handling

In [11]:
X = raw_data.drop('state', axis=1)
y = raw_data['state']

X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size = 0.25, random_state = 42)

def Standard_Scaler (df, col_names):
    features = df[col_names]
    scaler = StandardScaler().fit(features.values)
    features = scaler.transform(features.values)
    df[col_names] = features
    
    return df

col_names = X.columns

X_train = Standard_Scaler(X_train, col_names)
X_validation = Standard_Scaler(X_validation, col_names)

resample = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))

X_train_resampled, y_train_resampled = resample.fit_resample(X_train, y_train)

print('normal_sm:', round(y_train_resampled.value_counts()[0]/len(y_train_resampled) * 100,2), '% of the dataset')
print('asd_sm:', round(y_train_resampled.value_counts()[1]/len(y_train_resampled) * 100,2), '% of the dataset')
print('kd_sm:', round(y_train_resampled.value_counts()[2]/len(y_train_resampled) * 100,2), '% of the dataset')
print('rhd_sm:', round(y_train_resampled.value_counts()[3]/len(y_train_resampled) * 100,2), '% of the dataset')

normal_sm: 25.0 % of the dataset
asd_sm: 25.0 % of the dataset
kd_sm: 25.0 % of the dataset
rhd_sm: 25.0 % of the dataset


In [12]:
from sklearn.ensemble import GradientBoostingClassifier

clf_GBC = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
clf_GBC.fit(X_train_resampled, y_train_resampled)
training_output = clf_GBC.predict(X_train_resampled)
training_f1 = f1_score(y_train_resampled, training_output, average='macro')

Validation_Output = clf_GBC.predict(X_validation)
Validation_f1 = f1_score(y_validation, Validation_Output, average='macro')

print('Gradient Boosted Classifier with data feature scaling and data imbalance handling')
print('-' * 50)
print(f'Gradient Boosted Classifier: Training F1 Score: {training_f1} | Validation F1 Score: {Validation_f1}')

Gradient Boosted Classifier with data feature scaling and data imbalance handling
--------------------------------------------------
Gradient Boosted Classifier: Training F1 Score: 0.9520082391905339 | Validation F1 Score: 0.16186286575895187


Gradient Boosted Classifier with data feature scaling, data imbalance handling, and hyperparameter tuning using Optuna

In [13]:
def create_objective(X_train, y_train, X_val, y_val):

    def objective(trial):

        n_estimators = trial.suggest_int("n_estimators", 1, 200)
        learning_rate = trial.suggest_float("learning_rate", 0.5, 1.5)
    
        clf_GBC = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=1, random_state=0)
        clf_GBC.fit(X_train, y_train)
        
        Validation_Output = clf_GBC.predict(X_val)
        Validation_f1 = f1_score(y_val, Validation_Output, average='macro')
        
        return Validation_f1
    
    return objective

# Now create the objective function using the closure
objective = create_objective(X_train_resampled, y_train_resampled, X_validation, y_validation)

# Create and run the Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5)

print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2024-05-10 22:35:58,636] A new study created in memory with name: no-name-0cb1ebd8-4ea6-4646-bfeb-92dba4e84dc3
[I 2024-05-10 22:46:52,333] Trial 0 finished with value: 0.16123998028452957 and parameters: {'n_estimators': 172, 'learning_rate': 1.3771050461802645}. Best is trial 0 with value: 0.16123998028452957.
[I 2024-05-10 22:49:13,230] Trial 1 finished with value: 0.223959791200548 and parameters: {'n_estimators': 39, 'learning_rate': 0.9797582810955014}. Best is trial 1 with value: 0.223959791200548.
[I 2024-05-10 23:00:15,666] Trial 2 finished with value: 0.1714985407424226 and parameters: {'n_estimators': 175, 'learning_rate': 1.036957476835771}. Best is trial 1 with value: 0.223959791200548.
[I 2024-05-10 23:03:18,840] Trial 3 finished with value: 0.21036931909628948 and parameters: {'n_estimators': 53, 'learning_rate': 1.4019199638421274}. Best is trial 1 with value: 0.223959791200548.
[I 2024-05-10 23:10:23,922] Trial 4 finished with value: 0.17300532855136752 and parameter

Best trial:
  Value: 0.223959791200548
  Params: 
    n_estimators: 39
    learning_rate: 0.9797582810955014
