In [21]:
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import os, re, csv
from tqdm import tqdm

In [22]:
# Define directories
train_path = '../dataset/MICCAI_BraTS2020_TrainingData/'
val_path = '../dataset/MICCAI_BraTS2020_ValidationData/'
modality_keys = 'flair'
BATCH_SIZE = 4

In [23]:

def make_csv(y_pred_validation, modality_used):
    df = pd.read_csv(os.path.join(val_path, 'survival_evaluation.csv'))
    validation_ids = df['BraTS20ID'].values
    filename = f"./global_predictons/Light_GBM/{modality_used}_Light_GBM.csv"

    with open(filename, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["ID", "Days"])
        for id, day in zip(validation_ids, y_pred_validation):
            writer.writerow([id, day])

    print(f"CSV file '{filename}' created successfully.")

In [24]:


def preprocess_labels(csv_file_path):
    df = pd.read_csv(csv_file_path)
    df['Survival_days'] = df['Survival_days'].apply(lambda x: int(re.search(r'\d+', x).group()) if isinstance(x, str) else x)
    df['Survival_days'] = pd.to_numeric(df['Survival_days'], errors='coerce').dropna().astype(int)
    return df['Survival_days'].values, df['Brats20ID'].values

In [25]:
def create_data_list_val(data_dir, modality_keys):
    df = pd.read_csv(os.path.join(data_dir, 'survival_evaluation.csv'))
    patient_ids = df['BraTS20ID'].values
    data_list = []
    for patient in tqdm(patient_ids, desc="Creating validation data list"):
        patient_dir = os.path.join(data_dir, patient)
        if os.path.isdir(patient_dir):
            data_dict = {key: os.path.join(patient_dir, f"{patient}_{key}.nii") for key in modality_keys}
            data_list.append(data_dict)
    return data_list

In [26]:


# Load the saved features
train_features = np.load('./features/train/train_backbone_outputs.npy')
validation_features = np.load('./features/validation/validation_backbone_outputs.npy')


In [27]:

# Load the corresponding labels
def load_labels(file_path):
    return np.load(file_path)


In [28]:


train_labels, train_id = preprocess_labels(os.path.join(train_path, 'survival_info.csv')) 


In [37]:
def extract_number(string):
    # Split the string by underscore and get the last part
    num_str = string.split('_')[-1]
    # Convert the string to an integer to remove leading zeros
    num_int = int(num_str)
    return num_int


In [38]:
numbers = [extract_number(s) for s in train_id]


In [36]:
train_id.shape

(236,)

In [40]:
# List to keep track of missing numbers
missing_numbers = []

# Loop from 1 to 369
for i in range(1, 370):
    if i not in numbers:
        missing_numbers.append(i)

In [44]:
len(missing_numbers)

133

In [41]:
missing_numbers

[130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 146,
 148,
 149,
 150,
 155,
 156,
 171,
 172,
 176,
 193,
 197,
 198,
 200,
 202,
 205,
 207,
 208,
 209,
 211,
 212,
 213,
 232,
 234,
 237,
 243,
 244,
 245,
 248,
 256,
 257,
 258,
 259,
 260,
 261,
 262,
 263,
 264,
 265,
 266,
 267,
 268,
 269,
 270,
 271,
 272,
 273,
 274,
 275,
 276,
 277,
 278,
 279,
 280,
 281,
 282,
 283,
 284,
 285,
 286,
 287,
 288,
 289,
 290,
 291,
 292,
 293,
 294,
 295,
 296,
 297,
 298,
 299,
 300,
 301,
 302,
 303,
 304,
 305,
 306,
 307,
 308,
 309,
 310,
 311,
 312,
 313,
 314,
 315,
 316,
 317,
 318,
 319,
 320,
 321,
 322,
 323,
 324,
 325,
 326,
 327,
 328,
 329,
 330,
 331,
 332,
 333,
 334,
 335,
 336,
 343,
 344,
 349,
 350,
 352,
 361,
 362,
 364,
 365]

In [35]:
train_id[0]

'BraTS20_Training_001'

In [31]:
train_features.shape

(369, 64)

In [29]:


# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_features, train_labels, test_size=0.2, random_state=42)


ValueError: Found input variables with inconsistent numbers of samples: [369, 236]

In [None]:



# Create LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)


In [None]:
# Define LightGBM parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

# Train the model
print("Training LightGBM model...")
gbm = lgb.train(params, train_data, num_boost_round=1000, valid_sets=[train_data, val_data], early_stopping_rounds=50, verbose_eval=10)


In [None]:
# Predict and evaluate
print("Evaluating the model...")
y_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print(f'RMSE: {rmse}')


In [None]:




# Predict on the validation set
validation_pred = []
for i in tqdm(range(0, len(validation_features), BATCH_SIZE), desc="Predicting validation set"):
    batch = validation_features[i:i+BATCH_SIZE]
    batch_pred = gbm.predict(batch, num_iteration=gbm.best_iteration)
    validation_pred.extend(batch_pred)

make_csv(validation_pred, modality_keys)


In [None]:

# Save the trained model
gbm.save_model('./lightgbm_model.txt')
print("Model saved to ./lightgbm_model.txt")
