In [1]:
import os
import pandas as pd
# from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score
# sklearn.metrics.accuracy_score
import xgboost as xgb
import numpy as np
import itertools 
import tqdm
from time import sleep
import random
# from docx import Document
# import docx2txt
from datetime import datetime

In [None]:
import warnings 
warnings.filterwarnings("ignore")

In [2]:
# mannually generate the search grid
def makeGrid(pars_dict):  
    keys=pars_dict.keys()
    combinations=itertools.product(*pars_dict.values())
    ds=[dict(zip(keys,cc)) for cc in combinations]
    return ds

## Data Preparation

In [3]:
df_raw = pd.read_excel("../Dataset/MergedDataset_231207_ForElly_Excel.xlsx")

# Remove "." in the feature column
row_num, column_num = df_raw.shape
for rowID in range(row_num):
    for columnID in range(column_num):
        if "." == df_raw.iloc[rowID, columnID]:
            df_raw.iloc[rowID, columnID] = np.nan

# prepare the feature list
featureList = []
for item in df_raw.columns:
    if item.startswith("Item"):
        featureList.append(item)

patientList = []
roundList = []
for idx, row in df_raw.iterrows():
    patient = int(str(row["IAPTus_Num"]).split("_")[0])
    if "_" in str(row["IAPTus_Num"]):
        Round = int(str(row["IAPTus_Num"]).split("_")[1])
    else:
        Round = 0
    patientList.append(patient)
    roundList.append(Round)
df_raw["patient"] = patientList
df_raw["round"] = roundList

In [4]:
df_referral = pd.read_json("../Dataset/ReferralLetterSummary.json", lines=True)

In [5]:
df = pd.merge(df_raw, df_referral, on=["patient","round"])

# Label Preparation

## Here I use RecoveryDesc, not the EndDesc as the prediction target. EndDesc has 4 label categories and Referral Letter has only 44 data points, so it does not meet the conditions for tuning.

In [6]:
df = df[df["RecoveryDesc"] == df["RecoveryDesc"]]
df = df[(df["RecoveryDesc"].isin(["At recovery", "Not at recovery"]))]
df.reset_index().drop("index", axis=1)
print(df.shape)

labelList = []
for idx, row in df.iterrows():
    if (row["ReliableChangeDesc"] == "Reliable improvement") & (row["ReliableRecoveryDesc"] == "Reliable recovery") & (row["RecoveryDesc"] == "At recovery"):
        labelList.append(1)
    else:
        labelList.append(0)
df["labels"] = np.array(labelList, dtype=float)
labelList = ["labels"]

(43, 285)


In [7]:
df = df[["IAPTus_Num_x","patient","round","text", "summarization", "labels"]]
df = df.reset_index().drop("index", axis=1)

In [141]:
# Keep 10% data as the test data
train, test = train_test_split(df, test_size=0.1)
train.shape

(38, 6)

In [142]:
train

Unnamed: 0,IAPTus_Num_x,patient,round,text,summarization,labels
35,25866,25866,0,Additional advisers\n\nCurrent thoughts/plans/...,### Patient Background:\n- **Age:** Early 20s\...,1.0
12,24918,24918,0,Assessment appointment\n\nExploration of CORE ...,"### Summary of Patient Background, Symptoms, I...",0.0
33,25833,25833,0,Additional advisers\n\nCurrent thoughts/plans/...,"### Patient Background:\nNAME, a university st...",0.0
6,24837,24837,0,Assessment appointment\n\nExploration of MDS s...,"### Summary of Patient Background, Symptoms, I...",1.0
27,25114,25114,0,Exploration of CORE scores and risk\nPHQ 9= 10...,### Patient Background:\n- **Name**: NAME\n- *...,0.0
20,25027,25027,0,Assessment appointment\n\nExploration of CORE ...,**Patient Background:**\n- NAME is a universit...,1.0
0,24804,24804,0,ASSESSMENT APPOINTMENT:\n\nStudent confirmed t...,"### Patient Background:\nNAME, a university st...",1.0
32,25812,25812,0,"Hi Team, \n\nI am hoping to refer the followin...",**Patient Background:**\nNAME is a university ...,1.0
39,25941,25941,0,Additional advisers\n\nCurrent thoughts/plans/...,"### Patient Background:\nNAME, a university st...",1.0
7,24840,24840,0,[<50 min assessment included confidentiality l...,**Patient Background:**\nNAME has a history of...,1.0


## Train

In [143]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelBinarizer

In [144]:
# param = {
#     'max_features': 40, 
#     'ngram_range': (3, 3)
# }

param = {
    'max_features': 10, 
    'ngram_range': (1, 3)
}

corpus = df["summarization"]
# Initizalize the vectorizer with max nr words and ngrams (1: single words, 2: two words in a row)
vectorizer_tfidf = TfidfVectorizer(max_features=param["max_features"], ngram_range=param["ngram_range"])
# Fit the vectorizer to the training data
vectorizer_tfidf.fit(corpus)

In [145]:
# param_dict = {
#     'criterion': 'entropy', 
#     'max_depth': None, 
#     'max_features': 20, 
#     'max_leaf_nodes': None, 
#     'max_samples': 0.7, 
#     'min_samples_leaf': 1, 
#     'min_samples_split': 2, 
#     'min_weight_fraction_leaf': 0.0, 
#     'n_estimators': 50, 
#     'n_jobs': -1, 
#     'random_state': 24975
# }
# clf = tree.DecisionTreeClassifier(**param_dict)
# classifier_tfidf = RandomForestClassifier(**param_dict)
# classifier_tfidf = LogisticRegression()
classifier_tfidf = xgb.XGBRegressor(objective="binary:logistic", random_state=42)
model_tfidf = Pipeline([("vectorizer", vectorizer_tfidf), ("classifier", classifier_tfidf)])

start_time = datetime.now()
model_tfidf.fit(train["summarization"], train["labels"])
end_time = datetime.now()

training_time_tfidf = (end_time - start_time).total_seconds()
print('Time consuming: {}'.format(training_time_tfidf))

predicted_train_tfidf = model_tfidf.predict(train["summarization"])
accuracy_train_tfidf = accuracy_score(train["labels"], (predicted_train_tfidf>0.5))
print('Accuracy Training data: {:.1%}'.format(accuracy_train_tfidf))

predicted_test_tfidf = model_tfidf.predict(test["summarization"])
accuracy_test_tfidf = accuracy_score(test["labels"], (predicted_test_tfidf>0.5))
accuracy_tfidf = accuracy_test_tfidf
print('Accuracy Test data: {:.1%}'.format(accuracy_test_tfidf))

Time consuming: 0.047632
Accuracy Training data: 100.0%
Accuracy Test data: 80.0%


In [130]:
predicted_train_tfidf > 0.5

array([ True, False,  True, False, False,  True,  True, False,  True,
        True,  True, False,  True,  True, False,  True,  True, False,
       False, False, False,  True,  True, False,  True, False, False,
        True, False, False,  True,  True,  True, False,  True, False,
        True, False])

In [11]:
def singleTrain(X_train, Y_train, X_test, Y_test, Corpus, param):
    vectorizer_tfidf = TfidfVectorizer(max_features=param["max_features"], ngram_range=param["ngram_range"])
    vectorizer_tfidf.fit(Corpus)

    classifier_tfidf = LogisticRegression()
    model_tfidf = Pipeline([("vectorizer", vectorizer_tfidf), ("classifier", classifier_tfidf)])

    model_tfidf.fit(X_train, Y_train)

    predicted_train_tfidf = model_tfidf.predict(X_train)
    accuracy_train_tfidf = accuracy_score(Y_train, predicted_train_tfidf)
    # print('Accuracy Training data: {:.1%}'.format(accuracy_train_tfidf))

    predicted_test_tfidf = model_tfidf.predict(X_test)
    accuracy_test_tfidf = accuracy_score(Y_test, predicted_test_tfidf)
    accuracy_tfidf = accuracy_test_tfidf
    # print('Accuracy Test data: {:.1%}'.format(accuracy_test_tfidf))

    result = {}
    result["train_acc"] = accuracy_train_tfidf
    result["test_acc"] = accuracy_tfidf

    return result

In [12]:
def CVTrain(Train, Corpus, param):
    FinalResult = {}
    FinalResult["details"] = []
    skf = StratifiedKFold(n_splits=5)
    for i, (train_index, val_index) in enumerate(skf.split(train["summarization"], train["labels"])):
        (X_train, Y_train) = train.iloc[train_index]["summarization"].values, train.iloc[train_index]["labels"].values.squeeze()
        (X_val, Y_val) = train.iloc[val_index]["summarization"].values, train.iloc[val_index]["labels"].values.squeeze()
        # print(X_train, Y_train)

        result = singleTrain(X_train, Y_train, X_val, Y_val, Corpus, param)
        FinalResult["details"].append(result)
    FinalResult["acc_mean"] = np.mean([x["test_acc"] for x in FinalResult["details"]])
    FinalResult["acc_std"] = np.std([x["test_acc"] for x in FinalResult["details"]])
    return FinalResult

In [39]:
param_dict = {
    "max_features": 150,
    "ngram_range": (1,2)
}
CVTrain(train, df["summarization"], param_dict)

{'details': [{'train_acc': 0.8333333333333334, 'test_acc': 0.5},
  {'train_acc': 0.7666666666666667, 'test_acc': 0.375},
  {'train_acc': 0.7, 'test_acc': 0.5},
  {'train_acc': 0.8387096774193549, 'test_acc': 0.5714285714285714},
  {'train_acc': 0.8064516129032258, 'test_acc': 0.7142857142857143}],
 'acc_mean': np.float64(0.5321428571428571),
 'acc_std': np.float64(0.11088696211614303)}

## Grid Search for TFIDF Classification

In [13]:
param_dict = {
    "max_features": [x*10 for x in range(1, 100)],
    "ngram_range": [(1,1),(1,2),(2,2),(1,3),(2,3),(3,3),(1,4),(2,4),(3,4),(4,4)]    
}

In [14]:
def MyGridSearch(param_dict, train, Corpus):
    searchSpace = makeGrid(param_dict)
    print("Search Space Size:" + len(searchSpace).__str__())
    resultList = []
    for param in tqdm.tqdm(searchSpace):
        #sleep(1)
        result = CVTrain(train, df["summarization"], param)
        result["param"] = param
        resultList.append(result)
    sortedResult = sorted(resultList, key=lambda x: x["acc_mean"], reverse=True)
    print(sortedResult[:5])
    return sortedResult

In [15]:
CVResult = MyGridSearch(param_dict, train, df["summarization"])

Search Space Size:990


100%|██████████| 990/990 [02:10<00:00,  7.58it/s]

[{'details': [{'train_acc': 0.8, 'test_acc': 0.75}, {'train_acc': 0.8, 'test_acc': 0.625}, {'train_acc': 0.8, 'test_acc': 0.875}, {'train_acc': 0.9032258064516129, 'test_acc': 0.7142857142857143}, {'train_acc': 0.9032258064516129, 'test_acc': 0.7142857142857143}], 'acc_mean': np.float64(0.7357142857142858), 'acc_std': np.float64(0.08096988606253308), 'param': {'max_features': 40, 'ngram_range': (3, 3)}}, {'details': [{'train_acc': 0.9333333333333333, 'test_acc': 0.625}, {'train_acc': 0.9, 'test_acc': 0.625}, {'train_acc': 0.8333333333333334, 'test_acc': 0.75}, {'train_acc': 0.9032258064516129, 'test_acc': 0.7142857142857143}, {'train_acc': 0.9032258064516129, 'test_acc': 0.8571428571428571}], 'acc_mean': np.float64(0.7142857142857143), 'acc_std': np.float64(0.08674969858207941), 'param': {'max_features': 160, 'ngram_range': (3, 3)}}, {'details': [{'train_acc': 0.7333333333333333, 'test_acc': 0.75}, {'train_acc': 0.7333333333333333, 'test_acc': 0.625}, {'train_acc': 0.8, 'test_acc': 0.7




In [41]:
[x*10 for x in range(1, 100)]

[10,
 20,
 30,
 40,
 50,
 60,
 70,
 80,
 90,
 100,
 110,
 120,
 130,
 140,
 150,
 160,
 170,
 180,
 190,
 200,
 210,
 220,
 230,
 240,
 250,
 260,
 270,
 280,
 290,
 300,
 310,
 320,
 330,
 340,
 350,
 360,
 370,
 380,
 390,
 400,
 410,
 420,
 430,
 440,
 450,
 460,
 470,
 480,
 490,
 500,
 510,
 520,
 530,
 540,
 550,
 560,
 570,
 580,
 590,
 600,
 610,
 620,
 630,
 640,
 650,
 660,
 670,
 680,
 690,
 700,
 710,
 720,
 730,
 740,
 750,
 760,
 770,
 780,
 790,
 800,
 810,
 820,
 830,
 840,
 850,
 860,
 870,
 880,
 890,
 900,
 910,
 920,
 930,
 940,
 950,
 960,
 970,
 980,
 990]

In [13]:
training_time_tfidf

0.037645

In [15]:
predicted_train_tfidf = model_tfidf.predict(train["summarization"])
accuracy_train_tfidf = accuracy_score(train["labels"], predicted_train_tfidf)
print('Accuracy Training data: {:.1%}'.format(accuracy_train_tfidf))

Accuracy Training data: 100.0%


In [16]:
predicted_test_tfidf = model_tfidf.predict(test["summarization"])
accuracy_test_tfidf = accuracy_score(test["labels"], predicted_test_tfidf)
accuracy_tfidf = accuracy_test_tfidf
print('Accuracy Test data: {:.1%}'.format(accuracy_test_tfidf))

Accuracy Test data: 40.0%
