Required packages:\
pandas==1.4.0\
numpy==1.21.5\
scikit-learn==1.0.2\
tensorflow==2.7.0\
torch==1.10.2\
transformers==4.17.0.dev0\
datasets==1.18.3\
textstat==0.7.2 (if running the ML part)\
xgboost==1.5.2 (if running the ML part)

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("data/sample_full.csv")

In [3]:
data.fillna({'Remember': 0, 'Understand': 0, 'Apply': 0, 'Analyze': 0, 'Evaluate': 0, 'Create':0}, inplace=True)

In [4]:
LIWC_data = pd.read_csv("data/LIWC2015 Results (Learning_outcome.csv).csv")
data = data.join(LIWC_data).drop(['A'], axis=1)

In [5]:
data.head()

Unnamed: 0,Learning_outcome,Remember,Understand,Apply,Analyze,Evaluate,Create,WC,Analytic,Clout,...,Comma,Colon,SemiC,QMark,Exclam,Dash,Quote,Apostro,Parenth,OtherP
0,Analyze the health economic implications of e...,0.0,0.0,0.0,1.0,0.0,0.0,9,99.0,50.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Apply research skills to operate effectively ...,0.0,0.0,1.0,0.0,0.0,0.0,14,99.0,92.33,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Assess and synthesise diverse information abo...,0.0,0.0,0.0,0.0,1.0,1.0,26,43.96,77.92,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Describe the general characteristics of the m...,0.0,1.0,0.0,0.0,0.0,0.0,23,99.0,50.0,...,8.7,0.0,0.0,0.0,0.0,4.35,0.0,0.0,0.0,0.0
4,Evaluate the different models of perioperativ...,0.0,0.0,0.0,0.0,1.0,0.0,10,98.58,15.86,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
labels = data[data.columns[1:7]].values.tolist()

In [7]:
data.columns[1:7]

Index(['Remember', 'Understand', 'Apply', 'Analyze', 'Evaluate', 'Create'], dtype='object')

In [8]:
from sklearn.ensemble import RandomForestClassifier

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, cohen_kappa_score, f1_score

## ML Test

In [10]:
import textstat

In [11]:
def generateX(data_x, test_x, textual_column_index, start_index_LIWC, end_index_LIWC):
    column_names = []
    print("Getting Unigram...")
    uni_cv = CountVectorizer(stop_words='english', ngram_range=(1, 1), max_features=1000)
    unigram = uni_cv.fit_transform(data_x[:, textual_column_index])
    unigram = unigram.toarray()
    unigram_test = uni_cv.transform(test_x[:,textual_column_index]).toarray()
    temp = uni_cv.get_feature_names_out().tolist()
    column_names += ["uni_"+name for name in temp]
    print("Getting Bigram...")
    bi_cv = CountVectorizer(stop_words='english', ngram_range=(2, 2), max_features=1000)
    bigram = bi_cv.fit_transform(data_x[:, textual_column_index])
    bigram = bigram.toarray()
    bigram_test = bi_cv.transform(test_x[:, textual_column_index]).toarray()
    temp = bi_cv.get_feature_names_out().tolist()
    column_names += ["bi_"+name for name in temp]
    print("Getting Tfidf...")
    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 1), max_features=1000)
    t = tfidf.fit_transform(data_x[:, textual_column_index])
    t = t.toarray()
    t_test = tfidf.transform(test_x[:, textual_column_index]).toarray()
    temp = tfidf.get_feature_names_out().tolist()
    column_names += ["tfidf_"+name for name in temp]
    print("Getting ARI...")
    ari = [textstat.automated_readability_index(text) for text in data_x[:, textual_column_index]]
    ari_test = [textstat.automated_readability_index(text) for text in test_x[:, textual_column_index]]
    column_names.append("ari")
    combined_data_x = []
    combined_test_x = []
    print("Combining...")
    for i in range(len(data_x)):
        combined_data_x.append(unigram[i].tolist()
                              + bigram[i].tolist()
                              + t[i].tolist()
                              + [ari[i]]
                              + data_x[i, start_index_LIWC:end_index_LIWC].tolist())
    for i in range(len(test_x)):
        combined_test_x.append(unigram_test[i].tolist()
                              + bigram_test[i].tolist()
                              + t_test[i].tolist()
                              + [ari_test[i]]
                              + test_x[i, start_index_LIWC:end_index_LIWC].tolist())
    print("Generated feature shape is", np.array(combined_data_x).shape)
    print("Generated test feature is", np.array(combined_test_x).shape)
    return combined_data_x, column_names, combined_test_x

In [12]:
data.drop(columns=list(data.columns[1:7])).iloc[:, 0]

0         Analyze the health economic implications of e...
1         Apply research skills to operate effectively ...
2         Assess and synthesise diverse information abo...
3         Describe the general characteristics of the m...
4         Evaluate the different models of perioperativ...
                               ...                        
21375    Write/type simple sentences using hiragana, ka...
21376    Writing of assessment reports and giving feedb...
21377    You will develop the ability to work in a team...
21378    You will develop their oral presentation skill...
21379    You will gain an ability to use geoscientific ...
Name: Learning_outcome, Length: 21380, dtype: object

In [13]:
train_x, test_x, train_y, test_y = train_test_split(data.drop(columns=list(data.columns[1:8])), data[data.columns[1:7]], test_size=0.2, random_state=666)

In [14]:
np.unique(train_y['Remember'].tolist(), return_counts=True), np.unique(test_y['Remember'].tolist(), return_counts=True)

((array([0., 1.]), array([16156,   948])),
 (array([0., 1.]), array([4039,  237])))

In [15]:
np.unique(train_y['Understand'].tolist(), return_counts=True), np.unique(test_y['Understand'].tolist(), return_counts=True)

((array([0., 1.]), array([12479,  4625])),
 (array([0., 1.]), array([3076, 1200])))

In [16]:
np.unique(train_y['Apply'].tolist(), return_counts=True), np.unique(test_y['Apply'].tolist(), return_counts=True)

((array([0., 1.]), array([12239,  4865])),
 (array([0., 1.]), array([3060, 1216])))

In [17]:
np.unique(train_y['Analyze'].tolist(), return_counts=True), np.unique(test_y['Analyze'].tolist(), return_counts=True)

((array([0., 1.]), array([14346,  2758])),
 (array([0., 1.]), array([3575,  701])))

In [18]:
np.unique(train_y['Evaluate'].tolist(), return_counts=True), np.unique(test_y['Evaluate'].tolist(), return_counts=True)

((array([0., 1.]), array([14069,  3035])),
 (array([0., 1.]), array([3477,  799])))

In [19]:
np.unique(train_y['Create'].tolist(), return_counts=True), np.unique(test_y['Create'].tolist(), return_counts=True)

((array([0., 1.]), array([13956,  3148])),
 (array([0., 1.]), array([3537,  739])))

In [20]:
one_hot = []
for d in data[data.columns[1:7]].values:
    one_hot.append(np.array2string(d).count("1"))
np.unique(one_hot, return_counts=True)

(array([1, 2, 3, 4]), array([18773,  2325,   280,     2]))

In [21]:
ml_train_x, column_names, ml_test_x = generateX(train_x.to_numpy(), test_x.to_numpy(), 0, 1, 94)

Getting Unigram...
Getting Bigram...
Getting Tfidf...
Getting ARI...
Combining...
Generated feature shape is (17104, 3093)
Generated test feature is (4276, 3093)


In [22]:
column_names += data.columns[7:].tolist()

In [23]:
rf = RandomForestClassifier()
rf.fit(ml_train_x, train_y)

RandomForestClassifier()

In [24]:
pred_y = rf.predict(ml_test_x)

In [25]:
print(classification_report(test_y, pred_y, output_dict=False, target_names=list(data.columns[1:7]), digits=3))

              precision    recall  f1-score   support

    Remember      0.938     0.705     0.805       237
  Understand      0.929     0.783     0.850      1200
       Apply      0.945     0.771     0.849      1216
     Analyze      0.964     0.733     0.833       701
    Evaluate      0.962     0.760     0.849       799
      Create      0.923     0.645     0.760       739

   micro avg      0.943     0.744     0.832      4892
   macro avg      0.943     0.733     0.824      4892
weighted avg      0.943     0.744     0.831      4892
 samples avg      0.779     0.755     0.763      4892



  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
pred_score_y = rf.predict_proba(ml_test_x)

In [27]:
np.array(test_x).shape

(4276, 93)

In [28]:
np.array(pred_score_y).shape

(6, 4276, 2)

In [29]:
pred_score_y = np.transpose([score[:, 1] for score in rf.predict_proba(ml_test_x)])

In [30]:
roc_auc_score(test_y, pred_score_y, average=None)

array([0.9848633 , 0.97075301, 0.97065273, 0.97663957, 0.97929357,
       0.96531295])

In [31]:
f1_score(test_y, pred_y, average="micro")

0.8319817247287263

In [32]:
accuracy_score(test_y, pred_y)

0.7252104770813844

In [33]:
ml_result_df = pd.DataFrame(data=pred_y, columns=data.columns[1:7])

In [34]:
ml_result_df

Unnamed: 0,Remember,Understand,Apply,Analyze,Evaluate,Create
0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
4271,0.0,0.0,0.0,0.0,1.0,0.0
4272,1.0,0.0,0.0,0.0,0.0,0.0
4273,0.0,0.0,1.0,0.0,0.0,0.0
4274,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
ml_golden_df = pd.DataFrame(data=test_y, columns=data.columns[1:7])

In [36]:
print(accuracy_score(ml_golden_df['Remember'].tolist(), ml_result_df['Remember'].tolist()))
print(accuracy_score(ml_golden_df['Understand'].tolist(), ml_result_df['Understand'].tolist()))
print(accuracy_score(ml_golden_df['Apply'].tolist(), ml_result_df['Apply'].tolist()))
print(accuracy_score(ml_golden_df['Analyze'].tolist(), ml_result_df['Analyze'].tolist()))
print(accuracy_score(ml_golden_df['Evaluate'].tolist(), ml_result_df['Evaluate'].tolist()))
print(accuracy_score(ml_golden_df['Create'].tolist(), ml_result_df['Create'].tolist()))

0.9810570626753976
0.9223573433115061
0.921889616463985
0.9518241347053321
0.9494855004677268
0.9293732460243218


In [37]:
print(cohen_kappa_score(ml_golden_df['Remember'].tolist(), ml_result_df['Remember'].tolist()))
print(cohen_kappa_score(ml_golden_df['Understand'].tolist(), ml_result_df['Understand'].tolist()))
print(cohen_kappa_score(ml_golden_df['Apply'].tolist(), ml_result_df['Apply'].tolist()))
print(cohen_kappa_score(ml_golden_df['Analyze'].tolist(), ml_result_df['Analyze'].tolist()))
print(cohen_kappa_score(ml_golden_df['Evaluate'].tolist(), ml_result_df['Evaluate'].tolist()))
print(cohen_kappa_score(ml_golden_df['Create'].tolist(), ml_result_df['Create'].tolist()))

0.7950759924457214
0.7980526086986209
0.7968115473567279
0.8055212727390746
0.8191237484680814
0.719669252126872


## BERT

In [38]:
import torch
import tensorflow as tf
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer, AutoModelForSequenceClassification, EarlyStoppingCallback
from transformers import TFBertPreTrainedModel, TFBertMainLayer, InputFeatures
from datasets import load_metric, list_metrics

  from .autonotebook import tqdm as notebook_tqdm


In [39]:
class EncodeDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)




In [40]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', problem_type="multi_label_classification")
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', problem_type="multi_label_classification", num_labels=6)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [41]:
train_x, test_x, train_y, test_y = train_test_split(data['Learning_outcome'].tolist(), labels, test_size=0.2, random_state=666)
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.2, random_state=666)

In [42]:
train_encoded = tokenizer(train_x, truncation=True, padding=True, max_length=100)
val_encoded = tokenizer(val_x, truncation=True, padding=True, max_length=100)
test_encoded = tokenizer(test_x, truncation=True, padding=True, max_length=100)

In [43]:
train_set, val_set, test_set = EncodeDataset(train_encoded, train_y), EncodeDataset(val_encoded, val_y), EncodeDataset(test_encoded, test_y)

In [44]:
training_args = TrainingArguments(
        output_dir='multilabel',          # output directory
        overwrite_output_dir=True,
        num_train_epochs=3,              # total number of training epochs
        per_device_train_batch_size=64,  # batch size per device during training
        per_device_eval_batch_size=64,   # batch size for evaluation
        warmup_steps=5,                # number of warmup steps for learning rate scheduler
        weight_decay=0.05,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        logging_steps=10,
        evaluation_strategy="steps",
        save_strategy="steps",
        save_steps=10,
        load_best_model_at_end=True
    )

In [45]:
def getClassResult(predicted):
    results = []
    for probs in predicted.numpy():
        result = []
        for prob in probs:
            if prob < 0.5:
                result.append(0)
            else:
                result.append(1)
        results.append(result)
    return results

metric = load_metric("f1")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = tf.keras.activations.sigmoid(logits)
    predicted = getClassResult(predictions)
    return metric.compute(predictions=predicted, references=labels, average="micro")

In [46]:
trainer = Trainer(model=model, args=training_args, train_dataset=train_set, eval_dataset=val_set, callbacks=[EarlyStoppingCallback(early_stopping_patience=5)])

In [48]:
trainer.train()

***** Running training *****
  Num examples = 13683
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 642


Step,Training Loss,Validation Loss
10,0.6043,0.498867
20,0.4582,0.437737
30,0.428,0.398313
40,0.3812,0.345268
50,0.3403,0.298848
60,0.2799,0.263143
70,0.2496,0.229983
80,0.2232,0.21168
90,0.2073,0.190417
100,0.1852,0.181433


***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to multilabel/checkpoint-10
Configuration saved in multilabel/checkpoint-10/config.json
Model weights saved in multilabel/checkpoint-10/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to multilabel/checkpoint-20
Configuration saved in multilabel/checkpoint-20/config.json
Model weights saved in multilabel/checkpoint-20/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to multilabel/checkpoint-30
Configuration saved in multilabel/checkpoint-30/config.json
Model weights saved in multilabel/checkpoint-30/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to multilabel/checkpoint-40
Configuration saved in multilabel/checkpoint-40/config.json
Model weights saved in multilabel/checkpoint-40/pytorch_model.bin
****

TrainOutput(global_step=642, training_loss=0.13524437687617968, metrics={'train_runtime': 464.4953, 'train_samples_per_second': 88.373, 'train_steps_per_second': 1.382, 'total_flos': 2109537813056400.0, 'train_loss': 0.13524437687617968, 'epoch': 3.0})

In [49]:
logits = trainer.predict(test_set)

***** Running Prediction *****
  Num examples = 4276
  Batch size = 64


In [50]:
logits.predictions.shape

(4276, 6)

In [51]:
predicted = tf.keras.activations.sigmoid(logits.predictions)

2023-09-05 13:17:11.501703: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-09-05 13:17:11.546480: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-09-05 13:17:11.546781: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-09-05 13:17:11.548737: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [52]:
predicted.numpy()

array([[0.00969101, 0.98481834, 0.0107606 , 0.00975786, 0.00886829,
        0.01091874],
       [0.00915964, 0.98466325, 0.01052529, 0.00910978, 0.00913619,
        0.01224123],
       [0.05010709, 0.89347756, 0.00486106, 0.05864727, 0.06428542,
        0.00304807],
       ...,
       [0.00506825, 0.01770407, 0.97926235, 0.00668835, 0.00808989,
        0.01428074],
       [0.9081824 , 0.08788445, 0.03324952, 0.0135317 , 0.8482272 ,
        0.02497723],
       [0.01558924, 0.02727734, 0.79540026, 0.03905605, 0.9863284 ,
        0.02791703]], dtype=float32)

In [53]:
predicted_label = getClassResult(predicted)

In [54]:
count = 0
for pred in predicted_label:
    if pred.count(1) > 1:
        count += 1
count

519

In [55]:
print(classification_report(test_y, predicted_label, output_dict=False, target_names=list(data.columns[1:7]), digits=3))

              precision    recall  f1-score   support

    Remember      0.906     0.937     0.921       237
  Understand      0.957     0.935     0.946      1200
       Apply      0.925     0.931     0.928      1216
     Analyze      0.921     0.914     0.918       701
    Evaluate      0.936     0.927     0.931       799
      Create      0.931     0.857     0.892       739

   micro avg      0.934     0.918     0.926      4892
   macro avg      0.929     0.917     0.923      4892
weighted avg      0.934     0.918     0.926      4892
 samples avg      0.935     0.930     0.927      4892



  _warn_prf(average, modifier, msg_start, len(result))


In [56]:
roc_auc_score(test_y, predicted.numpy(), average=None)

array([0.99093125, 0.98838345, 0.98106201, 0.9831701 , 0.98881367,
       0.97583137])

In [57]:
accuracy_score(np.array(test_y), predicted_label)

0.8849391955098223

In [58]:
dl_result_df = pd.DataFrame(data=predicted_label, columns=data.columns[1:7])

In [59]:
print(accuracy_score(ml_golden_df['Remember'].tolist(), dl_result_df['Remember'].tolist()))
print(accuracy_score(ml_golden_df['Understand'].tolist(), dl_result_df['Understand'].tolist()))
print(accuracy_score(ml_golden_df['Apply'].tolist(), dl_result_df['Apply'].tolist()))
print(accuracy_score(ml_golden_df['Analyze'].tolist(), dl_result_df['Analyze'].tolist()))
print(accuracy_score(ml_golden_df['Evaluate'].tolist(), dl_result_df['Evaluate'].tolist()))
print(accuracy_score(ml_golden_df['Create'].tolist(), dl_result_df['Create'].tolist()))

0.9911131898971001
0.9700654817586529
0.9588400374181478
0.9731057062675398
0.974508886810103
0.9642188961646398


In [60]:
print(cohen_kappa_score(ml_golden_df['Remember'].tolist(), dl_result_df['Remember'].tolist()))
print(cohen_kappa_score(ml_golden_df['Understand'].tolist(), dl_result_df['Understand'].tolist()))
print(cohen_kappa_score(ml_golden_df['Apply'].tolist(), dl_result_df['Apply'].tolist()))
print(cohen_kappa_score(ml_golden_df['Analyze'].tolist(), dl_result_df['Analyze'].tolist()))
print(cohen_kappa_score(ml_golden_df['Evaluate'].tolist(), dl_result_df['Evaluate'].tolist()))
print(cohen_kappa_score(ml_golden_df['Create'].tolist(), dl_result_df['Create'].tolist()))

0.9164544023297831
0.9253291189805224
0.8990734740645693
0.9016083140914566
0.9158313318284425
0.8707724325268182
