<a href="https://colab.research.google.com/github/Practicum-Team-2/LungCancerClassifier/blob/main/random_forest_dashboard.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LogisticRegression
from statistics import mean
from sklearn.metrics import f1_score,roc_auc_score,confusion_matrix
import seaborn as sns

pd.options.display.max_seq_items = None

In [1]:
from google.colab import files
uploaded = files.upload()

Saving full_data.csv to full_data.csv


In [75]:
import io
df = pd.read_csv(io.BytesIO(uploaded['full_data.csv']))
df.drop(df.columns[0], axis=1, inplace=True)
df=df[df["Cancer Type Detailed"]!="Small Cell Lung Cancer"]
df["Cancer Type Detailed"].value_counts()


Lung Adenocarcinoma             3437
Lung Squamous Cell Carcinoma    1506
Name: Cancer Type Detailed, dtype: int64

In [89]:
df['Cancer Type Detailed'] = np.where(df['Cancer Type Detailed'] == "Lung Squamous Cell Carcinoma", 1, 0)

In [90]:
# Split the dataset into train and test sets
X = df.loc[:, df.columns != 'Cancer Type Detailed']
y = df['Cancer Type Detailed']
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size = 0.2, random_state = 11)
X_train.shape
y_train

4596    0
2313    0
2817    0
2381    1
2548    1
       ..
332     0
1293    1
4143    0
3895    1
1945    0
Name: Cancer Type Detailed, Length: 3954, dtype: int64

### start preprocessing the data, very inefficient approach just because I want to preserve the variable names.

In [96]:
X_train_cat=X_train.iloc[:,[3,4,5]]#the categorical columns
impute=SimpleImputer(strategy='most_frequent', fill_value='missing')
X_train_cat=pd.DataFrame(impute.fit_transform(X_train_cat), columns = X_train_cat.columns)
X_train_cat=pd.get_dummies(X_train_cat, prefix='dummy')
X_train_cat

Unnamed: 0,dummy_Female,dummy_Male,dummy_Matched,dummy_Unmatched,dummy_Current Smoker,dummy_Former Smoker,dummy_Non Smoker,dummy_Reformed Smoker
0,1,0,1,0,0,1,0,0
1,1,0,1,0,0,1,0,0
2,1,0,1,0,0,0,0,1
3,0,1,1,0,0,1,0,0
4,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...
3949,0,1,1,0,1,0,0,0
3950,0,1,1,0,0,0,0,1
3951,1,0,1,0,0,0,0,1
3952,0,1,1,0,1,0,0,0


In [97]:
X_train_num=X_train.iloc[:,[0,1,2,6,7]]#the numeric columns
impute=SimpleImputer(strategy='median')
X_train_num=impute.fit_transform(X_train_num)
X_train_num=pd.DataFrame(StandardScaler().fit_transform(X_train_num), columns = X_train.iloc[:,[0,1,2,6,7]].columns)
X_train_num

Unnamed: 0,Mutation Count,Fraction Genome Altered,Diagnosis Age,Person Cigarette Smoking History Pack Year Value,TMB Nonsynonymous
0,-0.758086,-1.307309,-0.093502,1.407065,0.731580
1,0.021908,-0.044583,-0.093502,-0.056285,-0.193803
2,0.327873,-0.863882,-0.763721,1.842947,0.110356
3,0.207211,-0.044583,1.491904,0.971183,-0.012153
4,0.991514,-0.044583,-0.288853,2.801888,0.765142
...,...,...,...,...,...
3949,-0.486597,-0.044583,-0.093502,-0.772346,-0.683837
3950,0.612290,0.318761,0.898319,0.535300,0.456759
3951,-0.215107,0.035317,0.423450,-0.772346,-0.426147
3952,-0.284057,0.208161,0.898319,0.840418,-0.493738


In [98]:
X_train_good =pd.concat([X_train_cat,X_train_num],axis=1)
X_train_good

Unnamed: 0,dummy_Female,dummy_Male,dummy_Matched,dummy_Unmatched,dummy_Current Smoker,dummy_Former Smoker,dummy_Non Smoker,dummy_Reformed Smoker,Mutation Count,Fraction Genome Altered,Diagnosis Age,Person Cigarette Smoking History Pack Year Value,TMB Nonsynonymous
0,1,0,1,0,0,1,0,0,-0.758086,-1.307309,-0.093502,1.407065,0.731580
1,1,0,1,0,0,1,0,0,0.021908,-0.044583,-0.093502,-0.056285,-0.193803
2,1,0,1,0,0,0,0,1,0.327873,-0.863882,-0.763721,1.842947,0.110356
3,0,1,1,0,0,1,0,0,0.207211,-0.044583,1.491904,0.971183,-0.012153
4,0,1,1,0,0,1,0,0,0.991514,-0.044583,-0.288853,2.801888,0.765142
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3949,0,1,1,0,1,0,0,0,-0.486597,-0.044583,-0.093502,-0.772346,-0.683837
3950,0,1,1,0,0,0,0,1,0.612290,0.318761,0.898319,0.535300,0.456759
3951,1,0,1,0,0,0,0,1,-0.215107,0.035317,0.423450,-0.772346,-0.426147
3952,0,1,1,0,1,0,0,0,-0.284057,0.208161,0.898319,0.840418,-0.493738


In [99]:
from imblearn.over_sampling import SMOTE#address imbalanced data
oversample = SMOTE()
X_train_good, y_train = oversample.fit_resample(X_train_good, y_train)
X_train_good

Unnamed: 0,dummy_Female,dummy_Male,dummy_Matched,dummy_Unmatched,dummy_Current Smoker,dummy_Former Smoker,dummy_Non Smoker,dummy_Reformed Smoker,Mutation Count,Fraction Genome Altered,Diagnosis Age,Person Cigarette Smoking History Pack Year Value,TMB Nonsynonymous
0,1,0,1,0,0,1,0,0,-0.758086,-1.307309,-0.093502,1.407065,0.731580
1,1,0,1,0,0,1,0,0,0.021908,-0.044583,-0.093502,-0.056285,-0.193803
2,1,0,1,0,0,0,0,1,0.327873,-0.863882,-0.763721,1.842947,0.110356
3,0,1,1,0,0,1,0,0,0.207211,-0.044583,1.491904,0.971183,-0.012153
4,0,1,1,0,0,1,0,0,0.991514,-0.044583,-0.288853,2.801888,0.765142
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5511,0,1,1,0,1,0,0,0,-0.088677,-0.249734,0.306513,2.057620,-0.296555
5512,0,1,1,0,0,0,0,1,-0.431564,0.226319,-0.170135,3.063417,-0.634113
5513,1,0,1,0,0,0,0,1,0.285320,-0.100146,1.149006,-1.532399,0.089650
5514,1,0,1,0,0,1,0,0,-0.668826,-0.044583,1.610621,-0.946699,-0.866699


In [101]:
X_test_cat=X_test.iloc[:,[3,4,5]]
impute=SimpleImputer(strategy='most_frequent', fill_value='missing')
X_test_cat=pd.DataFrame(impute.fit_transform(X_test_cat), columns = X_test_cat.columns)
X_test_cat=pd.get_dummies(X_test_cat, prefix='dummy')
X_test_num=X_test.iloc[:,[0,1,2,6,7]]
impute=SimpleImputer(strategy='median')
X_test_num=impute.fit_transform(X_test_num)
X_test_num=pd.DataFrame(StandardScaler().fit_transform(X_test_num), columns = X_test.iloc[:,[0,1,2,6,7]].columns)
X_test_good =pd.concat([X_test_cat,X_test_num],axis=1)
X_test_good, y_test = oversample.fit_resample(X_test_good, y_test)
X_test_good

Unnamed: 0,dummy_Female,dummy_Male,dummy_Matched,dummy_Unmatched,dummy_Current Smoker,dummy_Former Smoker,dummy_Non Smoker,dummy_Reformed Smoker,Mutation Count,Fraction Genome Altered,Diagnosis Age,Person Cigarette Smoking History Pack Year Value,TMB Nonsynonymous
0,0,1,1,0,0,0,0,1,0.601908,1.917064,-1.501187,1.639747,0.371656
1,1,0,1,0,0,1,0,0,0.340707,-0.884981,-0.869649,-0.120439,0.071909
2,0,1,1,0,0,0,1,0,-0.825693,-1.399515,-0.156578,-1.592018,-0.586851
3,0,1,1,0,0,1,0,0,-0.794168,-1.299423,-0.869649,-0.120439,0.325767
4,1,0,1,0,0,1,0,0,-0.659064,-0.433005,1.403887,-0.120439,-0.848146
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1353,1,0,1,0,0,0,0,1,0.634494,0.745670,0.108627,0.367318,0.400750
1354,0,1,1,0,0,0,0,1,0.402309,-0.704911,-0.002489,0.512225,0.144763
1355,0,1,1,0,0,0,0,1,-0.096389,2.277630,0.509032,0.262824,-0.293220
1356,0,1,1,0,0,0,0,1,-0.231689,-0.790237,1.073276,0.500932,-0.456253


In [102]:
y_train.value_counts()

0    2758
1    2758
Name: Cancer Type Detailed, dtype: int64

In [106]:
!pip install explainerdashboard
import explainerdashboard
from sklearn.ensemble import RandomForestClassifier
from explainerdashboard import ClassifierExplainer, ExplainerDashboard
from sklearn.ensemble import RandomForestClassifier
feature_descriptions = {
    "Sex": "Gender of Patient",
    "Mutation Count": "Mutation Count",
    "Fraction Genome Altered": "Fraction Genome Altered",
    "Diagnosis Age": "Age at which a condition or disease was first diagnosed.", 
    "Person Cigarette Smoking History Pack Year Value": "Numeric computed value to represent lifetime tobacco exposure defined as number of cigarettes smoked per day x number of years smoked divided by 20.",
    "TMB Nonsynonymous": "Tumor Mutational Burden Nonsynonymous",
    "Somatic Status": "Somatic Status",
    "Smoking History" : "Smoking History",
}
#X_train, y_train, X_test, y_test = titanic_survive()
#train_names, test_names = titanic_names()
model = RandomForestClassifier(n_estimators=100, max_depth=9,criterion="gini",max_features=2)
model.fit(X_train_good, y_train)
explainer = ClassifierExplainer(model, X_test_good, y_test, 
                                cats=[{"Somatic Status":['dummy_Matched','dummy_Unmatched']},
                                      {"Smoking History":['dummy_Current Smoker',"dummy_Former Smoker","dummy_Non Smoker","dummy_Reformed Smoker"]},
                                    {'Sex': ['dummy_Female', 'dummy_Male']}],
                                descriptions=feature_descriptions,
                                labels=['Lung Adenocarcinoma', 'Lung Squamous Cell Carcinoma'],
                                index_name = "Sample", 
                                target = "Cancer Type Detailed"
                                )
db = ExplainerDashboard(explainer, 
                        title="Lung Cancer Classifier", 
                        shap_interaction=False,
                        )
db.run(port=8050)

Detected RandomForestClassifier model: Changing class type to RandomForestClassifierExplainer...
Note: model_output=='probability', so assuming that raw shap output of RandomForestClassifier is in probability space...
Generating self.shap_explainer = shap.TreeExplainer(model)
Building ExplainerDashboard..
Detected google colab environment, setting mode='external'
Generating layout...
Calculating shap values...
Calculating prediction probabilities...
Calculating metrics...
Calculating confusion matrices...
Calculating classification_dfs...
Calculating roc auc curves...
Calculating pr auc curves...
Calculating liftcurve_dfs...
Calculating dependencies...
Calculating permutation importances (if slow, try setting n_jobs parameter)...
Calculating predictions...
Calculating pred_percentiles...
Calculating ShadowDecTree for each individual decision tree...
Reminder: you can store the explainer (including calculated dependencies) with explainer.dump('explainer.joblib') and reload with e.g. Cla

<IPython.core.display.Javascript object>