In [104]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, MinMaxScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import confusion_matrix

from scipy.io import arff
import warnings

# Load Data

In [104]:
warnings.filterwarnings('ignore')


data = arff.loadarff('bone-marrow.arff.txt')
df = pd.DataFrame(data[0])
print(df)

# Clean Data

In [104]:
df.drop(columns=['Disease'], inplace=True)


#Convert all columns to numeric, coerce errors to null values
for c in df.columns:
    df[c] = pd.to_numeric(df[c], errors='coerce')
    
#Make sure binary columns are encoded as 0 and 1
for c in df.columns[df.nunique()==2]:
    df[c] = (df[c]==1)*1.0

print('Count of unique values in each column:')
print(df.nunique())

Count of unique values in each column:
Recipientgender           2
Stemcellsource            2
Donorage                187
Donorage35                2
IIIV                      2
Gendermatch               2
DonorABO                  4
RecipientABO              4
RecipientRh               2
ABOmatch                  2
CMVstatus                 4
DonorCMV                  2
RecipientCMV              2
Riskgroup                 2
Txpostrelapse             2
Diseasegroup              2
HLAmatch                  4
HLAmismatch               2
Antigen                   4
Alel                      5
HLAgrI                    7
Recipientage            125
Recipientage10            2
Recipientageint           3
Relapse                   2
aGvHDIIIIV                2
extcGvHD                  2
CD34kgx10d6             183
CD3dCD34                182
CD3dkgx10d8             163
Rbodymass               130
ANCrecovery              18
PLTrecovery              50
time_to_aGvHD_III_IV     28
survival_

# Create Feature and Labels

In [105]:
# 2. Set target, survival_status,as y; features (dropping survival status and time) as X
y = df['survival_status']
x = df.drop(columns = ['survival_status', 'survival_time'])

## Seperate numeric columns from categorical columns

In [106]:
# 3. Define lists of numeric and categorical columns based on number of unique values
num_cols = x.columns[x.nunique() < 7]
cat_cols = x.columns[x.nunique() >= 7]

In [107]:
# 4. Print columns with missing values
nan_cols = x.isnull().sum

# Split Data

In [138]:
    # 5. Split data into train/test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= .2, random_state = 0)

# Create Pipelines 

In [139]:
# 6. Create categorical preprocessing pipeline
# Using mode to fill in missing values and OHE
cat_vals = Pipeline([("imputer", SimpleImputer(strategy= 'most_frequent')),
                     ('OHE', OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'))])

# 7. Create numerical preprocessing pipeline
# Using mean to fill in missing values and standard scaling of features
num_vals = Pipeline([('imputer', SimpleImputer(strategy= 'mean')), ('scaler', StandardScaler())])


## Preprocess categorical pipeline and numerical pipeline

In [140]:
# 8. Create column transformer that will preprocess the numerical and categorical features separately
preprocess = ColumnTransformer(transformers= [
                          ('cat_preprocess', cat_vals, cat_cols),
                          ('num_preprocess', num_vals, num_cols)
                          ]
)

# Build Pipeline Model - Logistic Regression

In [141]:
# 9. Create a pipeline with preprocess, PCA, and a logistic regresssion model
pipeline = Pipeline([('preprocess', preprocess), ('pca',PCA()), ('LR',LogisticRegression(random_state=0))])

# 10. Fit the pipeline on the training data
pipeline.fit(x_train, y_train)
#Predict the pipeline on the test data
pipeline.predict(x_test)
print(pipeline.score(x_test, y_test))


0.7894736842105263


# Build Pipeline Model - Random Forest Classifier

In [71]:
# 9. Create a pipeline with preprocess, PCA, and a logistic regresssion model
pipeline = Pipeline([('preprocess', preprocess), ('pca',PCA()), ('RFC',RandomForestClassifier(random_state=4))])

# 10. Fit the pipeline on the training data
pipeline.fit(x_train, y_train)
#Predict the pipeline on the test data
pipeline.predict(x_test)
print(pipeline.score(x_test, y_test))


0.5526315789473685


# Build Pipeline Model - Random Forest Regressor 

In [95]:
# 9. Create a pipeline with preprocess, PCA, and a logistic regresssion model
pipeline = Pipeline([('preprocess', preprocess), ('pca',PCA()), ('RFR',RandomForestRegressor(random_state=5))])

# 10. Fit the pipeline on the training data
pipeline.fit(x_train, y_train)
#Predict the pipeline on the test data
pipeline.predict(x_test)
print(pipeline.score(x_test, y_test))


-0.020226111111111056


# Tune Hyperparameters 

In [98]:
# 11. Define search space of hyperparameters
search_space = [{'RFR':[RandomForestRegressor()],
                 'RFR__max_depth': np.linspace(2,10,1).astype(int),
                'pca__n_components':np.linspace(30,37,1).astype(int),
                     'preprocess__num_preprocess__scaler':[MinMaxScaler(), RobustScaler(), StandardScaler()],
                'preprocess__cat_preprocess__imputer__strategy': ['mean', 'most_frequent', 'median', 'constant'],
                'preprocess__num_preprocess__imputer__strategy': ['mean', 'most_frequent', 'median', 'constant']}
                
                   ]

In [72]:
# # 11. Define search space of hyperparameters
# search_space = [{'RFC':[RandomForestClassifier()],
#                  'RFC__max_depth': np.linspace(2,10,1).astype(int),
#                 'pca__n_components':np.linspace(30,37,1).astype(int),
#                      'preprocess__num_preprocess__scaler':[MinMaxScaler(), RobustScaler(), StandardScaler()],
#                 'preprocess__cat_preprocess__imputer__strategy': ['mean', 'most_frequent', 'median', 'constant']}
#                    ]

In [142]:
# # 11. Define search space of hyperparameters
# search_space = [{'LR':[LogisticRegression()],
#                      'LR__random_state': np.linspace(0,3,1).astype(int),
#                      'LR__penalty': ['l1', 'l2', 'elasticnet', 'none'],
#                      'LR__C': np.logspace(-4, 2, 10),
#                 'pca__n_components':np.linspace(4,37,3).astype(int)
# #                      'preprocess__num_preprocess__scaler':[MinMaxScaler(), RobustScaler(), StandardScaler()],
# #                 'preprocess__cat_preprocess__imputer__strategy': ['mean', 'most_frequent', 'median', 'constant'],
# #                 'preprocess__num_preprocess__imputer__strategy': ['mean', 'most_frequent', 'median', 'constant']
#                 }
                
#                    ]


# Run Model

In [146]:
#12. Search over hyperparameters abolve to optimize pipeline and fit
gs = GridSearchCV(pipeline, search_space, cv=2)
gs.fit(x_train, y_train)

# 13. Save the best estimator from the gridsearch and print attributes and final accuracy on test set
best_model = gs.best_estimator_


# 14. Print attributes of best_model
print(best_model.named_steps['LR'])
print(best_model.named_steps['pca'])
print(best_model.named_steps['preprocess'])
# print(best_model.named_steps['RFR'])

# 15. Print final accuracy score 
print(best_model.score(x_test, y_test))


LogisticRegression(C=0.21544346900318823, random_state=0)
PCA(n_components=20)
ColumnTransformer(transformers=[('cat_preprocess',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('OHE',
                                                  OneHotEncoder(drop='first',
                                                                handle_unknown='ignore',
                                                                sparse_output=False))]),
                                 Index(['Donorage', 'HLAgrI', 'Recipientage', 'CD34kgx10d6', 'CD3dCD34',
       'CD3dkgx10d8', 'Rbodymass', 'ANCrecovery', 'PLTrecovery',
       'time_to_aGvHD_III_IV'],
      dtype...
                                                 ('scaler', StandardScaler())]),
                                 Index(['Recipientgender', 'Stemcellsource', 'Donorage35', 'IIIV',
       