In [6]:
import pandas as pd
from scipy.io import arff
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix
from sklearn.feature_selection import RFECV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
import numpy as np
import joblib
import seaborn as sns
import matplotlib.pyplot as plt

In [9]:
# Load the ARFF file using scipy
data, meta = arff.loadarff('Survival Prediction after bone-marrow transplantation/bone-marrow.arff')
df = pd.DataFrame(data)

# Decode byte strings to normal strings (if necessary)
df = df.map(lambda x: x.decode() if isinstance(x, bytes) else x)

df

Unnamed: 0,Recipientgender,Stemcellsource,Donorage,Donorage35,IIIV,Gendermatch,DonorABO,RecipientABO,RecipientRh,ABOmatch,...,extcGvHD,CD34kgx10d6,CD3dCD34,CD3dkgx10d8,Rbodymass,ANCrecovery,PLTrecovery,time_to_aGvHD_III_IV,survival_time,survival_status
0,1,1,22.830137,0,1,0,1,1,1,0,...,1,7.20,1.338760,5.38,35.0,19.0,51.0,32.0,999.0,0.0
1,1,0,23.342466,0,1,0,-1,-1,1,0,...,1,4.50,11.078295,0.41,20.6,16.0,37.0,1000000.0,163.0,1.0
2,1,0,26.394521,0,1,0,-1,-1,1,0,...,1,7.94,19.013230,0.42,23.4,23.0,20.0,1000000.0,435.0,1.0
3,0,0,39.684932,1,1,0,1,2,1,1,...,?,4.25,29.481647,0.14,50.0,23.0,29.0,19.0,53.0,1.0
4,0,1,33.358904,0,0,0,1,2,0,1,...,1,51.85,3.972255,13.05,9.0,14.0,14.0,1000000.0,2043.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182,1,1,37.575342,1,1,0,1,1,0,0,...,1,11.08,2.522750,4.39,44.0,15.0,22.0,16.0,385.0,1.0
183,0,1,22.895890,0,0,0,1,0,1,1,...,1,4.64,1.038858,4.47,44.5,12.0,30.0,1000000.0,634.0,1.0
184,0,1,27.347945,0,1,0,1,-1,1,1,...,1,7.73,1.635559,4.73,33.0,16.0,16.0,1000000.0,1895.0,0.0
185,1,1,27.780822,0,1,0,1,0,1,1,...,0,15.41,8.077770,1.91,24.0,13.0,14.0,54.0,382.0,1.0


In [12]:
# Step 1: Import the pandas library
# Make sure pandas is imported in your notebook.
import pandas as pd

# Step 2: Ensure your DataFrame is ready
# This script assumes you have a DataFrame named 'df' in your environment.
# For example, you might have loaded it from a CSV like this:
# df = pd.read_csv('your_file.csv')

# Step 3: Define the name for the output Excel file
excel_file_name = 'my_dataset.xlsx'

# Step 4: Export your existing DataFrame to an Excel file
# The to_excel() function writes the contents of 'df' to an .xlsx file.
# We use 'index=False' to prevent pandas from writing the row numbers
# into the first column of the Excel sheet.
df.to_excel(excel_file_name, index=False, sheet_name='SalesData')

print(f"Success! Your DataFrame has been saved to the file: '{excel_file_name}'")
print("You can now download it from the Jupyter file browser.")

Success! Your DataFrame has been saved to the file: 'my_dataset.xlsx'
You can now download it from the Jupyter file browser.


In [11]:
# Export the DataFrame to an Excel file
# We use index=False to avoid writing the DataFrame index to the file
df.to_excel('my_data_export.xlsx', index=False, sheet_name='Employee Data')
print("DataFrame has been successfully exported to 'my_data_export.xlsx'")

DataFrame has been successfully exported to 'my_data_export.xlsx'


In [20]:
# Replace '?' with NaN
df.replace('?', pd.NA, inplace=True)

# Identify numerical and categorical features
numerical_features = ['Donorage', 'CD34kgx10d6', 'CD3dCD34', 'CD3dkgx10d8', 'Rbodymass', 'ANCrecovery', 'PLTrecovery', 'time_to_aGvHD_III_IV', 'survival_time']
categorical_features = ['ReAcipientgender', 'Stemcellsource', 'Donorage35', 'IIIV', 'Gendermatch', 'DonorABO', 'RecipientABO', 'RecipientRh', 'ABOmatch', 'CMVstatus', 'DonorCMV', 'RecipientCMV', 'Disease', 'Riskgroup', 'Txpostrelapse', 'Diseasegroup', 'HLAmatch', 'HLAmismatch', 'Antigen', 'Alel', 'HLAgrI', 'Recipientage', 'Recipientage10', 'Recipientageint', 'Relapse', 'aGvHDIIIIV', 'extcGvHD']

# Convert numerical columns to numeric types
df[numerical_features] = df[numerical_features].apply(pd.to_numeric)

# Print to verify columns
print(f"Numerical Features: {numerical_features}")
print(f"Categorical Features: {categorical_features}")
print(df.head())

Numerical Features: ['Donorage', 'CD34kgx10d6', 'CD3dCD34', 'CD3dkgx10d8', 'Rbodymass', 'ANCrecovery', 'PLTrecovery', 'time_to_aGvHD_III_IV', 'survival_time']
Categorical Features: ['Recipientgender', 'Stemcellsource', 'Donorage35', 'IIIV', 'Gendermatch', 'DonorABO', 'RecipientABO', 'RecipientRh', 'ABOmatch', 'CMVstatus', 'DonorCMV', 'RecipientCMV', 'Disease', 'Riskgroup', 'Txpostrelapse', 'Diseasegroup', 'HLAmatch', 'HLAmismatch', 'Antigen', 'Alel', 'HLAgrI', 'Recipientage', 'Recipientage10', 'Recipientageint', 'Relapse', 'aGvHDIIIIV', 'extcGvHD']
  Recipientgender Stemcellsource   Donorage Donorage35 IIIV Gendermatch  \
0               1              1  22.830137          0    1           0   
1               1              0  23.342466          0    1           0   
2               1              0  26.394521          0    1           0   
3               0              0  39.684932          1    1           0   
4               0              1  33.358904          0    0           

In [21]:
# Set the option to avoid the FutureWarning
pd.set_option('future.no_silent_downcasting', True)

# Replace '?' with NaN and then with np.nan
df.replace('?', np.nan, inplace=True)
df = df.replace({pd.NA: np.nan})

# Identify numerical and categorical features
numerical_features = ['Donorage', 'CD34kgx10d6', 'CD3dCD34', 'CD3dkgx10d8', 'Rbodymass', 'ANCrecovery', 'PLTrecovery', 'time_to_aGvHD_III_IV', 'survival_time']
categorical_features = ['Recipientgender', 'Stemcellsource', 'Donorage35', 'IIIV', 'Gendermatch', 'DonorABO', 'RecipientABO', 'RecipientRh', 'ABOmatch', 'CMVstatus', 'DonorCMV', 'RecipientCMV', 'Disease', 'Riskgroup', 'Txpostrelapse', 'Diseasegroup', 'HLAmatch', 'HLAmismatch', 'Antigen', 'Alel', 'HLAgrI', 'Recipientage', 'Recipientage10', 'Recipientageint', 'Relapse', 'aGvHDIIIIV', 'extcGvHD']

# Convert numerical columns to numeric types
df[numerical_features] = df[numerical_features].apply(pd.to_numeric)

# Define imputers
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Impute missing values for numerical features
df[numerical_features] = num_imputer.fit_transform(df[numerical_features])

# Impute missing values for categorical features
df[categorical_features] = cat_imputer.fit_transform(df[categorical_features])

# Print to verify imputed data
print(df.head())

  Recipientgender Stemcellsource   Donorage Donorage35 IIIV Gendermatch  \
0               1              1  22.830137          0    1           0   
1               1              0  23.342466          0    1           0   
2               1              0  26.394521          0    1           0   
3               0              0  39.684932          1    1           0   
4               0              1  33.358904          0    0           0   

  DonorABO RecipientABO RecipientRh ABOmatch  ... extcGvHD CD34kgx10d6  \
0        1            1           1        0  ...        1        7.20   
1       -1           -1           1        0  ...        1        4.50   
2       -1           -1           1        0  ...        1        7.94   
3        1            2           1        1  ...        1        4.25   
4        1            2           0        1  ...        1       51.85   

    CD3dCD34 CD3dkgx10d8 Rbodymass ANCrecovery PLTrecovery  \
0   1.338760        5.38      35.0        

In [22]:
# Encode categorical features
df = pd.get_dummies(df, columns=categorical_features, drop_first=True)

# Print to verify encoded data
print(df.head())

    Donorage  CD34kgx10d6   CD3dCD34  CD3dkgx10d8  Rbodymass  ANCrecovery  \
0  22.830137         7.20   1.338760         5.38       35.0         19.0   
1  23.342466         4.50  11.078295         0.41       20.6         16.0   
2  26.394521         7.94  19.013230         0.42       23.4         23.0   
3  39.684932         4.25  29.481647         0.14       50.0         23.0   
4  33.358904        51.85   3.972255        13.05        9.0         14.0   

   PLTrecovery  time_to_aGvHD_III_IV  survival_time  survival_status  ...  \
0         51.0                  32.0          999.0              0.0  ...   
1         37.0             1000000.0          163.0              1.0  ...   
2         20.0             1000000.0          435.0              1.0  ...   
3         29.0                  19.0           53.0              1.0  ...   
4         14.0             1000000.0         2043.0              0.0  ...   

   Recipientage_18.8  Recipientage_18.9  Recipientage_20.1  Recipientage_2

In [23]:
# Calculate correlation matrix
correlation_matrix = df.corr()

# Select upper triangle of correlation matrix
upper = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))

# Find index of feature columns with correlation greater than 0.7
to_drop = [column for column in upper.columns if any(upper[column] > 0.7)]

# Drop features
df.drop(columns=to_drop, inplace=True)

# Print to verify dropped features
print(df.head())

    Donorage  CD34kgx10d6   CD3dCD34  CD3dkgx10d8  Rbodymass  ANCrecovery  \
0  22.830137         7.20   1.338760         5.38       35.0         19.0   
1  23.342466         4.50  11.078295         0.41       20.6         16.0   
2  26.394521         7.94  19.013230         0.42       23.4         23.0   
3  39.684932         4.25  29.481647         0.14       50.0         23.0   
4  33.358904        51.85   3.972255        13.05        9.0         14.0   

   PLTrecovery  time_to_aGvHD_III_IV  survival_time  survival_status  ...  \
0         51.0                  32.0          999.0              0.0  ...   
1         37.0             1000000.0          163.0              1.0  ...   
2         20.0             1000000.0          435.0              1.0  ...   
3         29.0                  19.0           53.0              1.0  ...   
4         14.0             1000000.0         2043.0              0.0  ...   

   Recipientage_18.3  Recipientage_18.4  Recipientage_18.7  Recipientage_1

In [24]:
# Data scaling
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Display the DataFrame after scaling
print("DataFrame after pre-processing:")
print(df.head())

DataFrame after pre-processing:
   Donorage  CD34kgx10d6  CD3dCD34  CD3dkgx10d8  Rbodymass  ANCrecovery  \
0 -1.289981    -0.474500 -0.428480     0.167062  -0.041097    -0.165725   
1 -1.227878    -0.747563  0.602871    -1.141966  -0.779836    -0.165744   
2 -0.857918    -0.399661  1.443128    -1.139332  -0.636192    -0.165701   
3  0.753104    -0.772846  2.551663    -1.213080   0.728424    -0.165701   
4 -0.013717     4.041147 -0.149610     2.187231  -1.374932    -0.165756   

   PLTrecovery  time_to_aGvHD_III_IV  survival_time  survival_status  ...  \
0    -0.316161             -1.858056       0.071115              0.0  ...   
1    -0.316209              0.538196      -0.915531              1.0  ...   
2    -0.316269              0.538196      -0.594517              1.0  ...   
3    -0.316237             -1.858087      -1.045353              1.0  ...   
4    -0.316289              0.538196       1.303242              0.0  ...   

   Recipientage_18.3  Recipientage_18.4  Recipientage_

In [25]:
# Sample target variable
target = 'survival_status'

# Ensure the target variable is correctly identified
if target not in df.columns:
    raise ValueError("The target column 'survival_status' is not found in the dataset. Please ensure the target column exists.")

# Split the data into features and target
X = df.drop(columns=[target])
y = df[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest model and evaluate
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

# Feature importance using Random Forest
rf_importances = rf.feature_importances_
rf_features = pd.DataFrame({'Feature': X.columns, 'Importance': rf_importances})
rf_features = rf_features.sort_values(by='Importance', ascending=False)
print("Random Forest Feature Importances:")
print(rf_features)

# Recursive Feature Elimination with Cross-Validation for Random Forest
rfecv = RFECV(estimator=rf, step=1, cv=5, scoring='accuracy')
rfecv.fit(X_train, y_train)
print("Optimal number of features for Random Forest:", rfecv.n_features_)
print("Best features selected by Recursive Feature Elimination:")
print(X.columns[rfecv.support_])

# Train and evaluate additional models similarly and get feature importances
# XGBoost
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb))

Random Forest Accuracy: 0.9210526315789473
Random Forest Classification Report:
              precision    recall  f1-score   support

         0.0       0.88      1.00      0.94        22
         1.0       1.00      0.81      0.90        16

    accuracy                           0.92        38
   macro avg       0.94      0.91      0.92        38
weighted avg       0.93      0.92      0.92        38

Random Forest Feature Importances:
              Feature  Importance
8       survival_time    0.313613
6         PLTrecovery    0.065606
168         Relapse_1    0.049675
3         CD3dkgx10d8    0.048686
1         CD34kgx10d6    0.046670
..                ...         ...
96   Recipientage_8.0    0.000000
46   Recipientage_1.6    0.000000
91   Recipientage_7.4    0.000000
89   Recipientage_7.2    0.000000
63   Recipientage_3.9    0.000000

[170 rows x 2 columns]
Optimal number of features for Random Forest: 138
Best features selected by Recursive Feature Elimination:
Index(['Donorage', 

In [26]:
# Hyper-Parameter Tuning for KNN
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

grid_search_knn = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param_grid_knn, cv=5, scoring='accuracy', n_jobs=1)
grid_search_knn.fit(X_train, y_train)
print("Best parameters for KNN:", grid_search_knn.best_params_)
best_knn = grid_search_knn.best_estimator_
y_pred_best_knn = best_knn.predict(X_test)
print("Tuned KNN Accuracy:", accuracy_score(y_test, y_pred_best_knn))
print("Tuned KNN Classification Report:")
print(classification_report(y_test, y_pred_best_knn))

Best parameters for KNN: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}
Tuned KNN Accuracy: 0.7368421052631579
Tuned KNN Classification Report:
              precision    recall  f1-score   support

         0.0       0.73      0.86      0.79        22
         1.0       0.75      0.56      0.64        16

    accuracy                           0.74        38
   macro avg       0.74      0.71      0.72        38
weighted avg       0.74      0.74      0.73        38



In [27]:
# Additional metrics for the tuned model
roc_auc_best_knn = roc_auc_score(y_test, best_knn.predict_proba(X_test)[:, 1])
print("Tuned KNN AUC-ROC:", roc_auc_best_knn)
conf_matrix_best_knn = confusion_matrix(y_test, y_pred_best_knn)
print("Tuned KNN Confusion Matrix:")
print(conf_matrix_best_knn)

# Save the trained Random Forest model
joblib.dump(rf, 'rf_model.joblib')

Tuned KNN AUC-ROC: 0.8636363636363635
Tuned KNN Confusion Matrix:
[[19  3]
 [ 7  9]]


['rf_model.joblib']

In [28]:
#Define the preprocess function
def preprocess(input_df):
    # Identify numerical and categorical features
    numerical_features = ['Donorage', 'CD34kgx10d6', 'CD3dCD34', 'CD3dkgx10d8', 'Rbodymass', 'ANCrecovery', 'PLTrecovery', 'time_to_aGvHD_III_IV', 'survival_time']
    categorical_features = ['Recipientgender', 'Stemcellsource', 'Donorage35', 'IIIV', 'Gendermatch', 'DonorABO', 'RecipientABO', 'RecipientRh', 'ABOmatch', 'CMVstatus', 'DonorCMV', 'RecipientCMV', 'Disease', 'Riskgroup', 'Txpostrelapse', 'Diseasegroup', 'HLAmatch', 'HLAmismatch', 'Antigen', 'Alel', 'HLAgrI', 'Recipientage', 'Recipientage10', 'Recipientageint', 'Relapse', 'aGvHDIIIIV', 'extcGvHD']

    # Convert numerical columns to numeric types
    input_df[numerical_features] = input_df[numerical_features].apply(pd.to_numeric)

    # Define imputers
    num_imputer = SimpleImputer(strategy='mean')
    cat_imputer = SimpleImputer(strategy='most_frequent')

    # Impute missing values for numerical features
    input_df[numerical_features] = num_imputer.fit_transform(input_df[numerical_features])

    # Impute missing values for categorical features
    input_df[categorical_features] = cat_imputer.fit_transform(input_df[categorical_features])

    # Encode categorical features
    input_df = pd.get_dummies(input_df, columns=categorical_features, drop_first=True)

    # Data scaling
    scaler = StandardScaler()
    input_df[numerical_features] = scaler.transform(input_df[numerical_features])

    return input_df

# Load the trained Random Forest model
rf_model = joblib.load('rf_model.joblib')

In [1]:
import os

In [33]:
import sklearn
import joblib
print(sklearn.__version__)
print(joblib.__version__)

1.5.2
1.4.2


In [35]:
import joblib

try:
    model_pipeline = joblib.load('rf_model_pipeline.joblib')
    print("Pipeline loaded successfully.")
except Exception as e:
    print(f"An error occurred while loading the pipeline: {e}")

An error occurred while loading the pipeline: 123


In [39]:
import arff

In [44]:
import scipy

In [55]:
import pandas as pd
import scipy.io.arff
import joblib
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import arff

# Load the ARFF file directly with the file path as a string
data, meta = scipy.io.arff.loadarff('bone-marrow.arff')

# Convert ARFF data to a list
data_list = [list(record) for record in data]

attribute_names = meta.names()

# Create DataFrame from the ARFF file
df = pd.DataFrame(data, columns = attribute_names)

# Replace '?' with NaN
df.replace('?', pd.NA, inplace=True)

# Identify numerical and categorical features
numerical_features = ['Donorage', 'CD34kgx10d6', 'CD3dCD34', 'CD3dkgx10d8', 'Rbodymass', 'ANCrecovery', 'PLTrecovery', 'time_to_aGvHD_III_IV', 'survival_time']
categorical_features = ['Recipientgender', 'Stemcellsource', 'Donorage35', 'IIIV', 'Gendermatch', 'DonorABO', 'RecipientABO', 'RecipientRh', 'ABOmatch', 'CMVstatus', 'DonorCMV', 'RecipientCMV', 'Disease', 'Riskgroup', 'Txpostrelapse', 'Diseasegroup', 'HLAmatch', 'HLAmismatch', 'Antigen', 'Alel', 'HLAgrI', 'Recipientage', 'Recipientage10', 'Recipientageint', 'Relapse', 'aGvHDIIIIV', 'extcGvHD']

# Convert numerical columns to numeric types
df[numerical_features] = df[numerical_features].apply(pd.to_numeric)

# Define imputers
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Define the preprocessing pipeline
numerical_transformer = Pipeline(steps=[
    ('imputer', num_imputer),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', cat_imputer),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a pipeline with the preprocessor and the model
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Sample target variable
target = 'survival_status'

# Split the data into features and target
X = df.drop(columns=[target])
y = df[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model pipeline
model_pipeline.fit(X_train, y_train)

# Save the model pipeline
joblib.dump(model_pipeline, 'rf_model_pipeline.joblib')

# Verify the file is saved
import os
  # Ensure the file is saved in the directory

# Load the model pipeline
model_pipeline = joblib.load('rf_model_pipeline.joblib')

# Define the prediction function
def predict_survival(input_data):
    input_df = pd.DataFrame([input_data])
    prediction = model_pipeline.predict(input_df)
    survival_status = 'Survived' if prediction[0] == 1 else 'Did not survive'
    return survival_status

# Example usage of the predict_survival function
input_data = {
    'Donorage': 35,
    'Recipientgender': 1,
    'Stemcellsource': 1,
    'Donorage35': 0,
    'IIIV': 1,
    'Gendermatch': 1,
    'DonorABO': 1,
    'RecipientABO': 1,
    'RecipientRh': 1,
    'ABOmatch': 1,
    'CMVstatus': 1,
    'DonorCMV': 1,
    'RecipientCMV': 1,
    'Disease': 1,
    'Riskgroup': 1,
    'Txpostrelapse': 0,
    'Diseasegroup': 1,
    'HLAmatch': 1,
    'HLAmismatch': 0,
    'Antigen': 1,
    'Alel': 1,
    'HLAgrI': 1,
    'Recipientage': 30,
    'Recipientage10': 1,
    'Recipientageint': 3,
    'Relapse': 0,
    'aGvHDIIIIV': 0,
    'extcGvHD': 0,
    'CD34kgx10d6': 5,
    'CD3dCD34': 1.5,
    'CD3dkgx10d8': 5,
    'Rbodymass': 70,
    'ANCrecovery': 20,
    'PLTrecovery': 50,
    'time_to_aGvHD_III_IV': 100,
    'survival_time': 200
}

try:
    prediction = predict_survival(input_data)
    print(prediction)
except Exception as e:
    print(f"An error occurred: {e}")

Survived
