In [6]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd  # type: ignore

import numpy as np # type: ignore

from imblearn.combine import SMOTETomek # type: ignore
from imblearn.under_sampling import TomekLinks # type: ignore

from sklearn.model_selection import train_test_split # type: ignore
from sklearn.preprocessing import StandardScaler # type: ignore

from sklearn.metrics import f1_score # type: ignore

import optuna # type: ignore

In [7]:
asd_sm = pd.read_csv('asd_sm.csv')
kd_sm = pd.read_csv('kd_sm.csv')
normal_sm = pd.read_csv('normal_sm.csv')
rhd_sm = pd.read_csv('rhd_sm.csv')

normal_sm['state'] = 0
asd_sm['state'] = 1
kd_sm['state'] = 2
rhd_sm['state'] = 3

combined_df = pd.concat([normal_sm, asd_sm, kd_sm, rhd_sm], axis=0)

print('normal_sm:', round(combined_df['state'].value_counts()[0]/len(combined_df) * 100,2), '% of the dataset')
print('asd_sm:', round(combined_df['state'].value_counts()[1]/len(combined_df) * 100,2), '% of the dataset')
print('kd_sm:', round(combined_df['state'].value_counts()[2]/len(combined_df) * 100,2), '% of the dataset')
print('rhd_sm:', round(combined_df['state'].value_counts()[3]/len(combined_df) * 100,2), '% of the dataset')

raw_data = combined_df.copy()
raw_data.drop_duplicates(inplace=True)

X = raw_data.drop('state', axis=1)
y = raw_data['state']

X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size = 0.25, random_state = 42)

normal_sm: 96.83 % of the dataset
asd_sm: 1.43 % of the dataset
kd_sm: 0.64 % of the dataset
rhd_sm: 1.1 % of the dataset


In [8]:
from sklearn.ensemble import RandomForestClassifier

clf_RFC = RandomForestClassifier(max_depth=10, random_state=42)
clf_RFC.fit(X_train, y_train)
training_output = clf_RFC.predict(X_train)
training_f1 = f1_score(y_train, training_output, average='macro')

Validation_Output = clf_RFC.predict(X_validation)
Validation_f1 = f1_score(y_validation, Validation_Output, average='macro')

print('Random Forest Classifier with no data feature scaling or data imbalance handling')
print('-' * 50)
print(f'Random Forest Classifier: Training F1 Score: {training_f1} | Validation F1 Score: {Validation_f1}')

Random Forest Classifier with no data feature scaling or data imbalance handling
--------------------------------------------------
Random Forest Classifier: Training F1 Score: 0.4536402836268748 | Validation F1 Score: 0.3949774123570443


Test Set Validation

In [9]:
# pd.set_option('future.no_silent_downcasting', True)
# Test_Set = pd.read_csv('test_all.csv').copy()

# X = Test_Set.drop('state', axis=1)
# y = Test_Set['state']

# mapping = {
#     'normal': 0,
#     'asd': 1,
#     'kd': 2,
#     'rhd': 3
# }

# y = y.replace(mapping)
# y.fillna(0, inplace=True)
# y = y.astype(int)

# def Standard_Scaler (df, col_names):
#     features = df[col_names]
#     scaler = StandardScaler().fit(features.values)
#     features = scaler.transform(features.values)
#     df[col_names] = features
    
#     return df

# col_names = X.columns

# X_test = Standard_Scaler(X, col_names)

# # Assuming clf_GBC is already trained and available
# test_output = clf_RFC.predict(X_test)
# test_f1 = f1_score(y, test_output, average='macro')

# print(f'Test Set F1 Score: {test_f1}')

# y_test_prediction = pd.DataFrame(test_output, columns=['prediction'])

# # Save predictions to a CSV file
# y_test_prediction.to_csv('Random_Forest_Answer.csv', index=False)

Test Set F1 Score: 0.24600159052752496


Random Forest Classifier with data feature scaling and data imbalance handling

In [10]:
X = raw_data.drop('state', axis=1)
y = raw_data['state']

X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size = 0.25, random_state = 42)

def Standard_Scaler (df, col_names):
    features = df[col_names]
    scaler = StandardScaler().fit(features.values)
    features = scaler.transform(features.values)
    df[col_names] = features
    
    return df

col_names = X.columns

X_train = Standard_Scaler(X_train, col_names)
X_validation = Standard_Scaler(X_validation, col_names)

resample = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))

X_train_resampled, y_train_resampled = resample.fit_resample(X_train, y_train)

print('normal_sm:', round(y_train_resampled.value_counts()[0]/len(y_train_resampled) * 100,2), '% of the dataset')
print('asd_sm:', round(y_train_resampled.value_counts()[1]/len(y_train_resampled) * 100,2), '% of the dataset')
print('kd_sm:', round(y_train_resampled.value_counts()[2]/len(y_train_resampled) * 100,2), '% of the dataset')
print('rhd_sm:', round(y_train_resampled.value_counts()[3]/len(y_train_resampled) * 100,2), '% of the dataset')

normal_sm: 25.0 % of the dataset
asd_sm: 25.0 % of the dataset
kd_sm: 25.0 % of the dataset
rhd_sm: 25.0 % of the dataset


In [11]:
clf_RFC = RandomForestClassifier(max_depth=10, random_state=42)
clf_RFC.fit(X_train_resampled, y_train_resampled)
training_output = clf_RFC.predict(X_train_resampled)
training_f1 = f1_score(y_train_resampled, training_output, average='macro')

Validation_Output = clf_RFC.predict(X_validation)
Validation_f1 = f1_score(y_validation, Validation_Output, average='macro')

print('Random Forest Classifier with data feature scaling and data imbalance handling')
print('-' * 50)
print(f'Random Forest Classifier: Training F1 Score: {training_f1} | Validation F1 Score: {Validation_f1}')

Random Forest Classifier with data feature scaling and data imbalance handling
--------------------------------------------------
Random Forest Classifier: Training F1 Score: 0.9723153565754665 | Validation F1 Score: 0.37463320497407626


Random Forest Classifier with data feature scaling, data imbalance handling, and hyperparameter tuning using Optuna

In [12]:
def create_objective(X_train, y_train, X_val, y_val):

    def objective(trial):

        n_estimators = trial.suggest_int("n_estimators", 1, 200)
        max_depth = trial.suggest_int("max_depth", 1, 25)
    
        clf_GBC = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
        clf_GBC.fit(X_train, y_train)
        
        Validation_Output = clf_GBC.predict(X_val)
        Validation_f1 = f1_score(y_val, Validation_Output, average='macro')
        
        return Validation_f1
    
    return objective

# Now create the objective function using the closure
objective = create_objective(X_train_resampled, y_train_resampled, X_validation, y_validation)

# Create and run the Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2024-05-10 22:32:38,509] A new study created in memory with name: no-name-0e45bc48-ab43-48fa-a9ce-522205d68df1
[I 2024-05-10 22:34:47,437] Trial 0 finished with value: 0.4001788460577376 and parameters: {'n_estimators': 186, 'max_depth': 23}. Best is trial 0 with value: 0.4001788460577376.
[I 2024-05-10 22:35:56,010] Trial 1 finished with value: 0.40385845871361675 and parameters: {'n_estimators': 100, 'max_depth': 23}. Best is trial 1 with value: 0.40385845871361675.
[I 2024-05-10 22:36:04,341] Trial 2 finished with value: 0.38587179779782577 and parameters: {'n_estimators': 12, 'max_depth': 22}. Best is trial 1 with value: 0.40385845871361675.
[I 2024-05-10 22:36:34,232] Trial 3 finished with value: 0.3410991035078023 and parameters: {'n_estimators': 79, 'max_depth': 9}. Best is trial 1 with value: 0.40385845871361675.
[I 2024-05-10 22:38:06,455] Trial 4 finished with value: 0.4056778026559231 and parameters: {'n_estimators': 141, 'max_depth': 20}. Best is trial 4 with value: 0.40

Best trial:
  Value: 0.4174892614495892
  Params: 
    n_estimators: 196
    max_depth: 18
