#### Configure script.

We set global variables that define behaviour of this script.

In [None]:
# --- Data location and prediction target ---------------------------------
DATA_DIR = "../data/"
DATA = "CAP"
CLASS_LABEL = 'pca_death_code'  # Target label to predict

# --- Reproducibility ------------------------------------------------------
# RANDOM_STATE seeds both the classifier and the train/test split.
RANDOM_STATE = 42
TEST_SIZE = 0.2

# --- Classifier selection -------------------------------------------------
# CLASSIFIER must be one of 'rf', 'lr' or 'svc' (see the branches below).
CLASSIFIER = 'rf'
# When True, a previously saved grid search is loaded instead of re-trained.
LOAD_CLASSIFIER = True
CLASSIFIER_FILENAME = 'cap_%s_gridsearch.joblib' % CLASSIFIER


# Specify the hyperparameters to optimise when training classifiers.
# Keys are '<pipeline step>__<parameter>'; 'vect' and 'clf' are the step
# names used when the Pipeline is built.
PARAMETERS = {
    'vect__ngram_range': ((1, 2),),
    'vect__max_df': (0.7,),
    'vect__min_df': (5,),
    'vect__max_features': (1500,),
}

if CLASSIFIER == "rf":
    # Only a narrow grid is searched here. This cell previously also listed
    # the wider ranges n_estimators=(100, 500, 1000, 2000) and
    # max_samples=(0.7, 0.8, 0.9), but immediately overrode them with the
    # single values below; restore the wider tuples to run the full search.
    PARAMETERS['clf__n_estimators'] = (100,)
    #PARAMETERS['clf__max_features'] = (0.1,0.2)
    PARAMETERS['clf__max_depth'] = (5, 10, 15)
    PARAMETERS['clf__max_samples'] = (0.8,)
    PARAMETERS['clf__min_samples_leaf'] = (2, 3, 4)

    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(random_state=RANDOM_STATE)

elif CLASSIFIER == 'lr':
    PARAMETERS['clf__C'] = (0.001, 0.01, 0.1, 1.0, 10.0, 100.0)
    # NOTE(review): with LogisticRegression's default solver not every penalty
    # listed here is supported, and the string 'none' is not accepted by
    # recent scikit-learn releases -- confirm against the installed version.
    PARAMETERS['clf__penalty'] = ('l1', 'l2', 'elasticnet', 'none')
    PARAMETERS['clf__fit_intercept'] = (True, False)

    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(random_state=RANDOM_STATE)

elif CLASSIFIER == 'svc':
    PARAMETERS['clf__C'] = (0.001, 0.01, 0.1, 1.0, 10.0, 100.0)
    PARAMETERS['clf__kernel'] = ('linear', 'poly', 'sigmoid')
    # Presumably required so the downstream ROC/calibration plots can obtain
    # class probabilities -- confirm in helper.
    PARAMETERS['clf__probability'] = (True, )

    from sklearn.svm import SVC
    model = SVC(random_state=RANDOM_STATE)

else:
    # Fail here rather than with a NameError on `model` in a later cell.
    raise ValueError(
        "Unknown CLASSIFIER %r; expected one of 'rf', 'lr', 'svc'." % CLASSIFIER
    )


CV = 5
SCORING = None # Specify scoring metric to use for GridsearchCV (or use default if None)

In [None]:
import pandas as pd
from joblib import dump, load
import pickle

import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.metrics import brier_score_loss, precision_score, recall_score, f1_score

from cap_helper import *

from helper import (pd_print, 
                    accuracy,
                    lemmatize_text,
                    summarise_gridsearch_classifier,
                    calibrate_random_forest, 
                    plot_calibration_curve,
                    plot_calibration_curve_easy_hard,
                    plot_roc_curve,
                    compute_all_metrics)

from explainability import (get_rf_feature_importances,
                            wordcloud,
                            run_tree_interpreter,
                            get_ti_feature_contributions_for_instance_i,
                            get_ti_feature_contributions_average)

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
print("Loading CAP prostate cancer data for preprocessing.")

# Preprocessing steps applied in order to the loaded data; each entry is
# (function, extra positional arguments passed after the data frame):
#   1. combine text from all feature columns into a single string column
#   2. link to dates of death
#   3. link to reviewer ids
#   4. convert all dates to months before/after death
#      (Note: this regex is not foolproof)
preprocessing_steps = (
    (concatenate_feature_columns, ()),
    (add_dates, (DATA_DIR,)),
    (add_reviewer_ids, (DATA_DIR,)),
    (convert_dates_relative, ()),
)

df = load_data(DATA_DIR)
for step, extra_args in preprocessing_steps:
    df = step(df, *extra_args)

print("Preprocessing complete.")

In [None]:
# Show how many reviews each reviewer contributed
# (the reviews are dominated by three authors).
reviewer_counts = get_reviewer_counts(df)
pd_print(reviewer_counts)

#### Having loaded and pre-processed the data we can start to train classifiers:

In [None]:
# NOTE: despite the variable name, this is a WordNet lemmatizer, not a stemmer.
stemmer = WordNetLemmatizer()

In [None]:
# Features: the `combined` text column; target: the column named by CLASS_LABEL.
X = df["combined"]
y = df[CLASS_LABEL]

In [None]:
# Run the text in X through helper.lemmatize_text using the WordNet lemmatizer.
# `documents` is what is later split into train/test sets alongside the labels,
# so it is presumably one processed document per row of X -- confirm in helper.
documents = lemmatize_text(X, stemmer)

In [None]:
# Class balance: print the number of samples for label code 1, then code 2.
for label_code in (1, 2):
    print(len(y[y == label_code]))

In [None]:
# Text classification pipeline: token counts (English stop words removed)
# -> TF-IDF weighting -> the classifier selected in the configuration cell.
# The step names ('vect', 'tfidf', 'clf') are the prefixes used in PARAMETERS.
english_stop_words = stopwords.words('english')
pipeline_steps = [
    ('vect', CountVectorizer(stop_words=english_stop_words)),
    ('tfidf', TfidfTransformer()),
    ('clf', model),
]
pipeline = Pipeline(pipeline_steps)

In [None]:
# Hold out TEST_SIZE of the lemmatised documents for evaluation; the split is
# seeded with RANDOM_STATE so it is reproducible between runs.
split_options = {'test_size': TEST_SIZE, 'random_state': RANDOM_STATE}
train_test_parts = train_test_split(documents, df[CLASS_LABEL], **split_options)
X_train, X_test, y_train, y_test = train_test_parts

In [None]:
import os

# Path of the persisted grid-search result for the selected classifier.
model_path = os.path.join('models', CLASSIFIER_FILENAME)

if LOAD_CLASSIFIER:
    # NOTE: joblib files are pickles -- only load model files you trust.
    clf = load(model_path)

else:
    # Create the output directory *before* the (potentially long) grid search,
    # so the fitted result is not lost to a missing-directory error at dump().
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    # Exhaustive search over PARAMETERS with CV-fold cross-validation on the
    # training split, using all available cores.
    clf = GridSearchCV(pipeline, PARAMETERS, n_jobs=-1, verbose=1, cv=CV, scoring=SCORING)
    clf.fit(X_train, y_train)
    dump(clf, model_path)

In [None]:
# Report on the loaded or freshly fitted grid search; what is reported is
# defined by helper.summarise_gridsearch_classifier.
summarise_gridsearch_classifier(clf)

In [None]:
# ROC curves of the same classifier on the training and the test split.
# Label code 2 is treated as the positive class.
names = ['train', 'test']
X_dict = dict(zip(names, (X_train, X_test)))
y_dict = dict(zip(names, (y_train, y_test)))
clf_dict = {split_name: clf for split_name in names}

plot_roc_curve(clf_dict, X_dict, y_dict, names, pos_label=2)

#### We now attempt to calibrate the random forest:

Note: the main results presented in the publication are for the uncalibrated classifiers trained above.

In [None]:
# Build the calibrated classifier from the training split.
# NOTE(review): the helper receives only the training data, not `clf` or
# CLASSIFIER, so it presumably trains its own random forest -- confirm in
# helper.calibrate_random_forest before using this with 'lr' or 'svc'.
calibrated_clf = calibrate_random_forest(X_train, y_train)

In [None]:
# Plot the grid-search classifier and the calibrated classifier against the
# held-out test split (plot details are defined in helper.plot_calibration_curve).
plot_calibration_curve(clf, calibrated_clf, X_test, y_test)

#### Stratify based on 'easy' and 'hard' cases:

Note: In CAP this is determined by the cause of death assignment route; essentially, the harder it is to determine the cause of death, the more levels of review and deliberation are required.

In [None]:
# Explicit import of the helper (cap_helper is also wildcard-imported earlier).
from cap_helper import get_easy_and_hard_cases
# Split the *test* subset into 'easy' and 'hard' cases; these four variables
# are reused by the ROC cell that follows.
easy_x, hard_x, easy_y, hard_y = get_easy_and_hard_cases(df, subset_x=X_test, subset_y=y_test)
# Calibration plot of the calibrated classifier on the easy and hard subsets.
plot_calibration_curve_easy_hard(calibrated_clf, easy_x, hard_x, easy_y, hard_y)

In [None]:
# ROC curves of the grid-search classifier on the 'easy' and 'hard' subsets.
names = ['easy cases (cod_route: 1,5)', 'hard cases (cod_route: 2,4)']
clf_dict = {names[0]: clf, names[1]: clf}
X_dict = {names[0]: easy_x, names[1]: hard_x}
y_dict = {names[0]: easy_y, names[1]: hard_y}
# Pass pos_label=2 explicitly, matching the train/test ROC call: the labels
# here are the same 1/2 codes, so both plots must use the same positive class.
plot_roc_curve(clf_dict, X_dict, y_dict, names, pos_label=2)
# NOTE(review): this saves the current matplotlib figure; if plot_roc_curve
# already shows/closes its figure the saved image may be blank -- confirm.
plt.savefig('roc_easy_hard.jpg', dpi=300)

In [None]:
# Count easy/hard cases over the *whole* data set (X, y) rather than the test
# split. Note that this rebinds easy_x/hard_x/easy_y/hard_y.
easy_x, hard_x, easy_y, hard_y = get_easy_and_hard_cases(df, subset_y=y, subset_x=X)
n_easy_cases = len(easy_x)
n_hard_cases = len(hard_x)
print("There are %d easy cases." % n_easy_cases)
print("There are %d hard cases." % n_hard_cases)