In [29]:
#importing libraries
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SequentialFeatureSelector
import joblib

In [30]:
# Load the training spreadsheet.
dataset = pd.read_excel('TrainDataset2023.xls')

# --- Data preprocessing ---
# Encode the ID column as integer labels (optional: ID is dropped again before
# the feature matrix is built).
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
dataset = dataset.assign(ID=label_encoder.fit_transform(dataset['ID']))
c = ['ID', 'pCR (outcome)']

# 999 is used as the missing-value marker: turn it into NaN, then fill every
# gap with the median of its column (medians are computed after the marker
# has been removed, so 999 never skews them).
# NOTE(review): both steps run over the whole frame, i.e. also over the
# encoded ID and the outcome columns -- confirm that is intended.
dataset = dataset.replace(999, np.nan)
dataset = dataset.fillna(dataset.median())

In [31]:
import numpy as np

def remove_outliers(df, threshold=3):
    """Drop every row that holds an outlier in at least one column.

    A value is an outlier when its absolute z-score (distance from the column
    mean, divided by the column's sample standard deviation) is not strictly
    below `threshold`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns all support ``mean()`` / ``std()``.
    threshold : float, default 3
        Z-score cut-off; rows are kept only if every checked column is
        strictly below it.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` that passed the check, with their original index
        labels and all original columns.
    """
    spread = df.std()
    # Columns with zero spread (constant) or undefined spread (fewer than two
    # rows) cannot contain outliers. Dividing by such a spread gives NaN
    # z-scores, and since `NaN < threshold` is False that would throw away
    # every row, so these columns are left out of the check.
    usable = spread.index[(spread > 0).to_numpy()]
    checked = df[usable]
    z_scores = ((checked - checked.mean()) / spread[usable]).abs()
    return df[(z_scores < threshold).all(axis=1)]

In [32]:
# Drop the rows that remove_outliers flags. The check runs over every column of
# the frame, which at this point still includes ID and the outcome columns.
# NOTE: this rebinds `dataset`, so re-running the cell filters the already
# filtered frame a second time.
dataset = remove_outliers(dataset)

In [33]:
# Work on a copy so that the correlation filtering does not modify `dataset`.
df1 = dataset.copy()
# Helper that finds the correlated features of a dataset
def correlation(data, threshold):
    """Map each column to the earlier columns it is strongly correlated with.

    The correlation matrix comes from ``data.corr()``. Only pairs below the
    diagonal are inspected, so for every strongly correlated pair it is the
    column that appears later in the frame that becomes a key.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to analyse.
    threshold : float
        A pair counts as correlated when the absolute coefficient is strictly
        greater than this value.

    Returns
    -------
    dict
        ``{column_name: set of earlier column names}``; columns without any
        strong partner are absent.
    """
    corr_matrix = data.corr()
    names = list(corr_matrix.columns)
    strength = corr_matrix.abs().to_numpy()

    col_corr = {}
    for row, later_name in enumerate(names):
        # Compare against the columns that precede this one only.
        partners = {
            names[col] for col in range(row) if strength[row, col] > threshold
        }
        if partners:
            col_corr.setdefault(later_name, set()).update(partners)

    return col_corr

# Only predictor columns take part in the correlation filter. With the
# identifier and the outcome columns included, a predictor could be discarded
# merely for tracking an outcome closely, or an outcome column could itself be
# flagged and then be missing when X and y are built.
non_feature_columns = ['ID', 'pCR (outcome)', 'RelapseFreeSurvival (outcome)']
corr_features = correlation(df1.drop(columns=non_feature_columns, errors='ignore'), 0.8)
print('correlated features: ', len(corr_features))

correlated features:  87


In [34]:
# The keys of corr_features are the columns to discard; remove them and show
# the shape of what is left.
redundant_columns = list(corr_features)
df_corr = df1.drop(columns=redundant_columns)
df_corr.shape

(313, 33)

In [35]:
# Quasi-constant feature check: fit a variance threshold of 0.01 on the
# correlation-filtered frame.
fr = VarianceThreshold(threshold=0.01)
fr.fit(df_corr)

# Boolean mask of the columns that passed the threshold; use it both to
# transform the data and to recover the surviving column names.
kept_mask = fr.get_support()
df_corr_quasi = fr.transform(df_corr)
features_columns = df_corr.columns[kept_mask]
print("How many are not quasi-constant features: ", kept_mask.sum())

How many are not quasi-constant features:  28


In [36]:
# Convert the array produced by the variance filter back into a DataFrame,
# labelling its columns with the names that survived the filter.
df_corr_quasi = pd.DataFrame(data=df_corr_quasi, columns=features_columns)

df_corr_quasi.shape

(313, 28)

In [37]:
# Independent copy of the filtered frame; X, y and new_X are built from it.
df_corr_quasi1 = df_corr_quasi.copy()

In [38]:
# Separate target and predictors: y is the relapse-free survival column, X is
# everything except the identifier and the two outcome columns.
columns_to_drop = ['ID', 'pCR (outcome)', 'RelapseFreeSurvival (outcome)']
y = df_corr_quasi1['RelapseFreeSurvival (outcome)']
X = df_corr_quasi1.drop(columns=columns_to_drop)

In [39]:
# Z-normalization: learn the scaling parameters of each predictor column,
# then apply them to the same data.
# NOTE(review): X_scaled is not referenced by any later cell shown in this
# notebook -- feature selection and the model are fitted on the unscaled
# X / new_X. Confirm whether this step is still wanted.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [40]:
#rfe = RFE(rf_regressor, n_features_to_select=12)
#rfe.fit(X, y)
#selected_features = X.columns[rfe.support_]

# Feature selection using the sequential feature selection method.
# The forest is seeded: without a fixed random_state each run grows different
# trees, so the selected subset (and every score computed from it) changed
# from one execution to the next. 10 matches the seed used in the other cells.
# NOTE(review): the selector is fitted on all rows of X / y, before the
# train/test split made in a later cell, so held-out rows still influence
# which features are kept.
selector = SequentialFeatureSelector(
    estimator=RandomForestRegressor(n_estimators=100, random_state=10),
    n_features_to_select='auto',
)
selector.fit(X, y)
selected_features = X.columns[selector.support_]

In [41]:
# Keep only the selected feature columns and list their names.
new_X = df_corr_quasi1.loc[:, selected_features]
new_X.columns

Index(['ER', 'PgR', 'HER2', 'TrippleNegative', 'ChemoGrade', 'LNStatus',
       'original_shape_LeastAxisLength', 'original_shape_SurfaceVolumeRatio',
       'original_firstorder_10Percentile', 'original_firstorder_90Percentile',
       'original_firstorder_InterquartileRange',
       'original_glszm_SmallAreaEmphasis'],
      dtype='object')

In [42]:
#xg_reg = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.6,subsample=0.01,max_depth=10, n_estimators=10)

# Hold out 10% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    new_X,
    y,
    test_size=0.1,
    random_state=10,
)

# Random forest regressor (100 trees, depth capped at 9, seeded), trained on
# the training part of the selected features only.
rf_regressor = RandomForestRegressor(
    n_estimators=100,
    max_depth=9,
    random_state=10,
)
rf_regressor.fit(X_train, y_train)

In [43]:
# Make predictions on the test set with the fitted random forest
y_pred = rf_regressor.predict(X_test)

In [44]:
# Evaluate the performance of the model: mean absolute error between the
# held-out targets and the predictions made for them.
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

Mean Absolute Error: 15.558959021247993


In [45]:
# 10-fold cross-validation (shuffled, fixed seed) of the same regressor on the
# selected features. The scoring name is 'neg_mean_absolute_error', so the
# returned values are negated errors.
# NOTE(review): the columns of new_X were chosen by a selector fitted on all
# rows, so each fold's validation rows already influenced the feature set --
# these scores are likely optimistic.
k_fold = KFold(n_splits=10, shuffle=True, random_state=10)
mae_scores = cross_val_score(rf_regressor, new_X, y, scoring='neg_mean_absolute_error', cv=k_fold)

In [46]:
# Report the error of every fold. The scores are negated MAE values
# (scoring='neg_mean_absolute_error'), hence the abs().
# The loop variable is deliberately not called `mae`: that name holds the
# hold-out test error computed in an earlier cell, and looping over it would
# overwrite that value with the last fold's negated score.
for fold_number, fold_score in enumerate(mae_scores, 1):
    print(f'Fold {fold_number}: Mean Absolute Error: {abs(fold_score)}')

Fold 1: Mean Absolute Error: 15.95452099047807
Fold 2: Mean Absolute Error: 22.744015008953035
Fold 3: Mean Absolute Error: 22.564765773286915
Fold 4: Mean Absolute Error: 21.658076177302416
Fold 5: Mean Absolute Error: 17.86325355765368
Fold 6: Mean Absolute Error: 17.77127889652033
Fold 7: Mean Absolute Error: 18.172086566143612
Fold 8: Mean Absolute Error: 22.148916568258475
Fold 9: Mean Absolute Error: 19.12241858102302
Fold 10: Mean Absolute Error: 22.582866497002716


In [47]:
# Average the per-fold scores; abs() turns the negated MAE into a positive error.
average_mae = abs(mae_scores.mean())
print(f'Average Mean Absolute Error across all folds: {average_mae}')

Average Mean Absolute Error across all folds: 20.058219861662227


In [28]:
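# Saving the model (currently disabled: the lines below are commented out).
# NOTE: the output recorded under this cell names 'rf_regressor1.joblib', which
# differs from the path in the commented-out call, so it was produced by a
# different version of this cell.

#model = RandomForestRegressor()
#joblib.dump(rf_regressor, 'rf_regressor.joblib')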

['rf_regressor1.joblib']