In [1]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
import logging


# Quieten the "transformers" logger to WARNING, while the root logger is
# configured at INFO so other loggers still report progress.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)

In [2]:
# Load the labelled lines and keep only the two columns used below,
# renamed to the "text" / "labels" names the rest of the notebook refers to.
df = (
    pd.read_csv("final_training_data.csv")
    .loc[:, ["line", "label"]]
    .rename(columns={"line": "text", "label": "labels"})
)
df.head()

Unnamed: 0,text,labels
0,the existing literature on the corporate digit...,Context
1,"Thus, to close the aforementioned gap, this pa...",Context
2,"Hence, the objective of\r\nthis research is to...",Context
3,The main idea behind the\r\ntheory of the diff...,Key insights
4,"In the development of this process,\r\nthere i...",Key insights


In [3]:
# Number of rows per label, to see how imbalanced the classes are.
df.labels.value_counts()

Unknown         2961
Key findings     965
Key insights     786
Context          461
Definitions      400
Name: labels, dtype: int64

In [4]:
from sklearn.model_selection import train_test_split

# Re-read the source file so this cell does not depend on the state of `df`
# left behind by earlier cells.
df = pd.read_csv("final_training_data.csv")[["line", "label"]].rename(
    columns={"line": "text", "label": "labels"}
)

# 80/20 split stratified on the label column with a fixed seed; each part
# gets a fresh 0..n-1 index.
train_df, eval_df = (
    part.reset_index(drop=True)
    for part in train_test_split(
        df,
        test_size=0.2,
        stratify=df["labels"],
        random_state=42,
    )
)

In [5]:
# Distinct label values present in the training split (order of first appearance).
train_df.labels.unique()

array(['Key findings', 'Definitions', 'Unknown', 'Context',
       'Key insights'], dtype=object)

In [6]:
# Per-class weights from sklearn's 'balanced' mode, computed on the training
# labels. The weights come back in the order of `classes`, i.e. the order
# returned by np.unique.
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_df["labels"]),
    y=train_df["labels"],
)

In [7]:
# Show the class names next to their weights (both in np.unique order).
(np.unique(train_df["labels"]), list(class_weights))

(array(['Context', 'Definitions', 'Key findings', 'Key insights',
        'Unknown'], dtype=object),
 [2.4162601626016262,
  2.78625,
  1.154922279792746,
  1.4174880763116058,
  0.3765202702702703])

In [None]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score


def f1_multiclass(labels, preds):
    """Return the weighted-average F1 score of `preds` against `labels`."""
    return f1_score(labels, preds, average='weighted')


def macro_recall(labels, preds):
    """Return the weighted-average recall of `preds` against `labels`.

    NOTE(review): despite the name, this uses average='weighted', not
    'macro'. The name is kept so any existing references keep working.
    """
    return recall_score(labels, preds, average='weighted')


def get_precision(labels, preds):
    """Return the weighted-average precision of `preds` against `labels`."""
    return precision_score(labels, preds, average='weighted')


# Extra metrics, keyed by the name they are reported under. Each callable
# takes (true_labels, predictions).
extra_metrics = {
    "f1": f1_multiclass,
    "acc": accuracy_score,
    "recall": macro_recall,
    "precision": get_precision,
}

# Optional model configuration
model_args = ClassificationArgs()
model_args.num_train_epochs = 1
# Must be a plain bool. A trailing comma here would store the tuple (False,),
# which is truthy and therefore would NOT disable multiprocessing.
model_args.use_multiprocessing = False
model_args.overwrite_output_dir = True
model_args.save_steps = -1
model_args.save_model_every_epoch = False
model_args.labels_list = ['Context', 'Definitions', 'Key findings', 'Key insights', 'Unknown']
# Optional toggles, left disabled:
# model_args.use_multiprocessing_for_evaluation = False
# model_args.use_multiprocessed_decoding = False
# model_args.wandb_project = "text_highlight5"

# `class_weights` is positional: entry i must belong to labels_list[i]. It was
# computed in sorted-label order, so fail loudly if the two ever drift apart.
if sorted(train_df["labels"].unique()) != model_args.labels_list:
    raise ValueError(
        "model_args.labels_list must match the sorted unique training labels "
        "so that class_weights line up with the label ids"
    )

# Create a ClassificationModel
model = ClassificationModel(
    "roberta",
    "roberta-base",
    args=model_args,
    use_cuda=True,
    num_labels=len(model_args.labels_list),  # keep in sync with labels_list
    weight=list(class_weights),
)

# Train the model. The extra metrics only take effect here if evaluation
# during training is enabled; they are passed for consistency.
model.train_model(train_df, **extra_metrics)

# Evaluate the model. Passing the metrics here is what adds f1 / acc /
# recall / precision to `result` alongside the default mcc and eval_loss.
result, model_outputs, wrong_predictions = model.eval_model(eval_df, **extra_metrics)

# Make predictions with the model
predictions, raw_outputs = model.predict(["Sam was a Wizard"])


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.de

HBox(children=(FloatProgress(value=0.0, max=4458.0), HTML(value='')))

In [None]:
# Evaluation result as logged by simpletransformers.classification.classification_model (kept for reference):
# {'mcc': 0.6049468970115803, 'eval_loss': 0.10204083122796594}

In [None]:
# Display the predictions returned by the most recent predict() call.
predictions

In [3]:
# Load a RoBERTa classifier from "use_this_model" (presumably a local
# directory holding a previously saved model - confirm) for CPU inference.
saved_model = ClassificationModel(
    model_type="roberta",
    model_name="use_this_model",
    num_labels=5,
    use_cuda=False,
)



In [4]:
# Smoke-test the reloaded model on one short sentence.
predictions, raw_outputs = saved_model.predict(
    ["Sam was a Wizard"]
)

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
100%|██████████| 1/1 [00:00<00:00,  5.46it/s]


In [5]:
# Display the predictions returned by the preceding predict() call.
predictions

['Unknown']

In [6]:
# Classify one sentence whose opening words match the first preview row of the training data.
predictions, raw_outputs = saved_model.predict(
    [
        "the existing literature on the corporate digital divide does not explicitly identify the accumulation of personal digital competencies that provide for the implementation and use of effective DTs ",
    ]
)

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
100%|██████████| 1/1 [00:00<00:00,  5.78it/s]


In [7]:
# Display the predictions returned by the preceding predict() call.
predictions

['Unknown']

In [8]:
# Classify two sentences in one call: the long research sentence and a short unrelated one.
predictions, raw_outputs = saved_model.predict(
    [
        "the existing literature on the corporate digital divide does not explicitly identify the accumulation of personal digital competencies that provide for the implementation and use of effective DTs ",
        "Apple is tech company",
    ]
)

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
100%|██████████| 1/1 [00:00<00:00,  2.91it/s]


In [9]:
# Display the predictions returned by the preceding predict() call (one per input sentence).
predictions

['Unknown', 'Unknown']

In [11]:
# Display the raw model outputs returned alongside the predictions
# (presumably per-class scores, one row per input sentence - confirm).
raw_outputs

array([[ 5.54518843, -0.5448392 , -0.3685222 , -2.11993289, -3.09125853],
       [ 5.7347312 , -0.57513279, -0.42800692, -2.13170743, -3.12520361]])