In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.window import Window
import os, re
from time import ctime
import time,json
import pandas as pd
from pyspark.sql.types import ArrayType,IntegerType,FloatType,StringType,StructType,StructField
from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.metrics import classification_report
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
os.environ['WANDB_DISABLED'] = 'true'

## Trainning an initial model

In [None]:
data = pd.read_csv()

In [None]:
labels = {'All other language': 0,
 'Other person': 0,
 'Patient': 1,
 'Direct ancestor': 2,
 'Other(Deployment/Travel/etc.)': 0,
 'Other irrelevant': 0,
 'Cannot attribute Race/Ethnicity': 0,
 'Patient native speaker': 3,
 'Name': 0,
 'Patient lived or lives': 4,
 'Patient_negated': 5,
 'Patient was born/from': 6,
 'Patient speaks': 7,
 'Family member speaks': 0}

In [None]:
data['labels'] = data['Adjudication'].map(labels)

In [15]:
train, interm = train_test_split(data.sample(frac=1), train_size = 0.60,  random_state=1234 ,shuffle=True)
valid, test = train_test_split(interm.sample(frac=1), train_size = 0.50,  random_state=1234 ,shuffle=True)
train.shape,interm.shape,valid.shape,test.shape

In [10]:
id2label = {v:k for k,v in labels.items()}

In [None]:
model_args = {
    "num_train_epochs": 50,
    "learning_rate": 1e-5,
    "do_lower_case":False,
    "max_seq_length": 256,
    "use_early_stopping":True,
    "evaluate_during_training":True,
    "early_stopping_consider_epochs":True,
    "early_stopping_patience":3,
    "output_dir":'output/',
    "use_cuda":True,
    "overwrite_output_dir":True,
    "save_eval_checkpoints":False,
    "save_model_every_epoch":False,
    "save_steps":False,
    "special_tokens_list":['[TERM]','[/TERM]']
   
}

# Create a ClassificationModel
model = ClassificationModel(
    'bert',
    'models/va_cased/',
    num_labels=8,
    args=model_args,
    use_cuda=True,
    weight=[1,6,4,4,4,3,4,3]
) 
model.train_model(train,eval_df =valid)

### Evaluate on test

In [None]:
preds, _ = model.predict(test['text'].to_list())
print(classification_report(test['labels'],preds))

## Final training

In [189]:
model_args = {
    "num_train_epochs": 50,
    "learning_rate": 6e-6,
    "do_lower_case":False,
    "max_seq_length": 256,
    "use_early_stopping":True,
    "train_batch_size":32,
    "evaluate_during_training":True,
    "early_stopping_consider_epochs":True,
    "early_stopping_patience":5,
    "output_dir":'output',
    "use_cuda":True,
    "overwrite_output_dir":True,
    "save_eval_checkpoints":False,
    "save_model_every_epoch":False,
    "save_steps":False,
    "special_tokens_list":['[TERM]','[/TERM]']
   
}


# Create a ClassificationModel
model = ClassificationModel(
    'bert',
    'models/va_cased/',
    num_labels=8,
    args=model_args,
    use_cuda=True,
    weight=[1,6,6,6,4,2,2,2]
) 

model.train_model(train,eval_df =valid)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at models/va_cased/ and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
preds, _ = model.predict(df_test['text'].to_list())
print(classification_report(df_test['labels'],preds))

In [None]:
           0       0.98      0.94      0.96       760
           1       0.98      0.99      0.98       865
           2       0.83      0.67      0.74        15
           3       0.88      0.88      0.88         8
           4       0.35      0.70      0.47        30
           5       0.00      0.00      0.00         4
           6       0.73      0.71      0.72        34
           7       1.00      0.67      0.80         3

    accuracy                           0.95      1719
   macro avg       0.72      0.69      0.69      1719
weighted avg       0.96      0.95      0.95      1719