# Contrast BERTOPIC outliers with BERT & CER

## Load Packages

In [2]:
import pandas as pd
from transformers import AutoTokenizer
from transformers import pipeline
import numpy as np
from transformers import TrainingArguments, Trainer
import torch

  from .autonotebook import tqdm as notebook_tqdm


## Loading BERT Model on top 5 Themes

### Define Model

In [18]:
labels = ['‘Transparency’', '‘Clarity’', '‘Reconciliation’', '‘Engagement and Communication’', 'Environment']
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}

In [19]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

### Loading in Model (Pre saved pth file)

In [20]:
import torch

path = "/Users/Sebastian/Desktop/PreTrainedBert/BERT-multi-label-classifier-top-5-long.pth"
model.load_state_dict(torch.load(path))

<All keys matched successfully>

### Evaluation mode

In [21]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

## Apply BERT model on BERTOPIC Subset

### Load BERTOPIC Subset

In [5]:
# OPR csv with themes 
file_path = '/Users/Sebastian/OneDrive/2023 Summer/CER-Theme-Classification/OPR/data/processed/outliers_bertopic.csv'

BERTOPIC_Outliers = pd.read_csv(file_path)


In [7]:
BERTOPIC_Outliers = BERTOPIC_Outliers[["raw_text", "themes", "theme", "bertopics", "group", "tags"]]

In [13]:
BERTOPIC_Outliers.head(3)

Unnamed: 0,raw_text,themes,theme,bertopics,group,tags
0,The OPR discussion paper has identified severa...,"Clarity, Emergency Management, Regulatory Desi...",Environment,19,Environmental NGO,"Jurisdictional Alignment, Education/ Training,..."
1,"In B.C., the Commission maintains a commitment...","Socio-Economic, Environment, Engagement and Co...",Environment,5,Government (Fed/ Prov),"GBA+, Reclamation, Engagement Mechanisms"
2,Transparency could be improved by ensuring spi...,"Emergency Management, Transparency",Transparency,18,Government (Fed/ Prov),"Emergency Response Planning, Information Sharing"


### Encode text

In [14]:
text = BERTOPIC_Outliers['raw_text'].to_list()

# Remmeber to turn column to list first, then feed it into encode_text
# then will have to convert to add as five new columns (based on id2label order)

In [15]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def encode_text(text):
    encoding = tokenizer(text, truncation=True, padding="max_length", return_tensors="pt")
    #encoding = {k: v.to(rvc.device) for k,v in encoding.items()}
    return encoding

In [16]:
encoded_text = encode_text(text)

### Make a Prediction based on Encoded Text

In [22]:
with torch.no_grad():
    prediction = model(**encoded_text)

In [13]:
print(prediction)

SequenceClassifierOutput(loss=None, logits=tensor([[-1.0353, -0.2503, -1.1942, -1.2093, -1.5443]]), hidden_states=None, attentions=None)


In [33]:
keys_list = list(label2id.keys())
keys_list

['‘Transparency’',
 '‘Clarity’',
 '‘Reconciliation’',
 '‘Engagement and Communication’',
 'Environment']

In [24]:
sigmoid = torch.nn.Sigmoid()
sigmoid_prediction = sigmoid(torch.Tensor(prediction['logits']))
sigmoid_prediction

tensor([[0.5729, 0.4863, 0.2047, 0.9159, 0.0759],
        [0.1262, 0.3236, 0.7743, 0.6457, 0.2043],
        [0.6877, 0.3503, 0.1026, 0.4066, 0.2096],
        [0.2201, 0.7636, 0.1282, 0.0525, 0.1560],
        [0.0728, 0.1333, 0.9631, 0.1724, 0.1700],
        [0.3217, 0.0752, 0.8991, 0.9300, 0.0939],
        [0.3769, 0.2240, 0.9443, 0.3963, 0.8704],
        [0.3769, 0.2240, 0.9443, 0.3963, 0.8704],
        [0.3769, 0.2240, 0.9443, 0.3963, 0.8704],
        [0.0836, 0.4882, 0.4254, 0.0487, 0.3049],
        [0.0836, 0.4882, 0.4254, 0.0487, 0.3049],
        [0.1711, 0.5341, 0.9116, 0.5403, 0.0960],
        [0.2965, 0.2788, 0.4031, 0.2542, 0.0523],
        [0.0854, 0.2339, 0.9058, 0.0785, 0.1456],
        [0.1930, 0.1571, 0.9649, 0.8643, 0.0946],
        [0.1474, 0.5335, 0.9367, 0.1014, 0.3581],
        [0.0795, 0.1284, 0.9731, 0.1597, 0.2686],
        [0.0754, 0.0781, 0.9317, 0.2235, 0.1603],
        [0.1798, 0.4095, 0.6299, 0.2046, 0.0920],
        [0.1798, 0.4095, 0.6299, 0.2046, 0.0920],


### Convert output to dataframe and rename columns

In [91]:
# Convert to data frame
data_frame = pd.DataFrame(sigmoid_prediction.numpy())

# Rename Columns
data_frame.columns = ['Transparency', 'Clarity', 'Reconciliation','‘Engagement and Communication', 'Environment']

# Concatenate the original DataFrame and the new columns DataFrame along axis=1 (right)
concatenated_df = pd.concat([BERTOPIC_Outliers, data_frame], axis=1)
concatenated_df.head(5)


Unnamed: 0,raw_text,themes,theme,bertopics,group,tags,Transparency,Clarity,Reconciliation,‘Engagement and Communication,Environment
0,The OPR discussion paper has identified severa...,"Clarity, Emergency Management, Regulatory Desi...",Environment,19,Environmental NGO,"Jurisdictional Alignment, Education/ Training,...",0.572899,0.486326,0.204711,0.915882,0.075894
1,"In B.C., the Commission maintains a commitment...","Socio-Economic, Environment, Engagement and Co...",Environment,5,Government (Fed/ Prov),"GBA+, Reclamation, Engagement Mechanisms",0.126159,0.323551,0.774255,0.645659,0.204271
2,Transparency could be improved by ensuring spi...,"Emergency Management, Transparency",Transparency,18,Government (Fed/ Prov),"Emergency Response Planning, Information Sharing",0.687687,0.350315,0.102611,0.406552,0.209596
3,The regulatory regime in the NWT differs from ...,Clarity,Clarity,-1,Government (Fed/ Prov),"Need Guidance, Jurisdictional Alignment",0.220098,0.763581,0.1282,0.052468,0.155977
4,The CERs Discussion Paper (2022:2) states that...,"Reconciliation, Application Stage, Regulatory ...",Reconciliation,18,Indigenous,"Rights and Interests, Filing Manual, Improve C...",0.072783,0.133264,0.963143,0.172431,0.170011


### Define status of prediction

In [92]:
# Define a custom function to set the 'Status' based on name, age, and score
def set_status(row):
    if row['theme'] == 'Transparency' and row['Transparency'] >= .5:
        return 'Correct'
    elif row['theme'] == 'Clarity' and row['Clarity'] >= .5:
        return 'Correct'
    elif row['theme'] == 'Reconciliation' and row['Reconciliation'] >= .5:
        return 'Correct'
    elif row['theme'] == '‘Engagement and Communication' and row['‘Engagement and Communication'] >= .5:
        return 'Correct'
    elif row['theme'] == 'Environment' and row['Environment'] >= .5:
        return 'Correct'
    else:
        return 'Incorrect'

In [93]:
# Apply function and save values into new column
concatenated_df['Status'] = concatenated_df.apply(set_status, axis=1)

In [95]:
df_sorted = concatenated_df.sort_values(by='raw_text')
display(df_sorted)

Unnamed: 0,raw_text,themes,theme,bertopics,group,tags,Transparency,Clarity,Reconciliation,‘Engagement and Communication,Environment,Status
38,1. Prevent private property damage arising fro...,"Reconciliation, Clarity, Gender Based Analysis...",Environment,5,Indigenous,"Cultural Awareness, Education/ Training, MMIWG...",0.076335,0.522269,0.836697,0.041070,0.272046,Incorrect
138,1. Reward and promote good operators.\n2. Supp...,"Clarity, Competitiveness, Reconciliation, Audi...",Reconciliation,6,Indigenous,"Need Guidance, Timeliness/ Responsiveness, Col...",0.074831,0.200127,0.897857,0.203444,0.064793,Correct
8,"1. The previous failures, disasters, complianc...","Transparency, Application Stage, Reconciliatio...",Environment,24,Indigenous,"Information Sharing, Filing Manual - Lands, Ri...",0.376899,0.223985,0.944278,0.396299,0.870395,Correct
7,"1. The previous failures, disasters, complianc...","Transparency, Application Stage, Reconciliatio...",Reconciliation,24,Indigenous,"Information Sharing, Filing Manual - Lands, Ri...",0.376899,0.223985,0.944278,0.396299,0.870395,Correct
6,"1. The previous failures, disasters, complianc...","Transparency, Application Stage, Reconciliatio...",Transparency,24,Indigenous,"Information Sharing, Filing Manual - Lands, Ri...",0.376899,0.223985,0.944278,0.396299,0.870395,Incorrect
...,...,...,...,...,...,...,...,...,...,...,...,...
24,• Importance of the CER and its regulated comp...,"Reconciliation, Clarity",Clarity,15,Indigenous,"Cultural Awareness, Education/ Training, Ackno...",0.109508,0.640491,0.882701,0.185702,0.107151,Correct
154,• Indigenous input in the technical details an...,"Reconciliation, Engagement and Communication",Reconciliation,6,Indigenous,"Collaboration, Indigenous Participation, Engag...",0.099808,0.086093,0.968735,0.472690,0.066244,Correct
130,• Integrating Indigenous Knowledge into the en...,"Reconciliation, Competitiveness",Reconciliation,18,Indigenous,"Indigenous Knowledge/ TEK, Collaboration, Lead...",0.089482,0.155088,0.938376,0.094741,0.660150,Correct
37,• OPR requirements to promote Métis representa...,"Emergency Management, Reconciliation, Clarity,...",Clarity,15,Indigenous,"Emergency Response Planning, UNDRIP, Indigenou...",0.214366,0.277116,0.977153,0.760343,0.151254,Incorrect


### Contrast BERTOPIC Outliers (-1)

In [74]:
BERTOPIC_Minus_One = concatenated_df[concatenated_df['bertopics'] == -1]

BERTOPIC_Minus_One

Unnamed: 0,raw_text,themes,theme,bertopics,group,tags,Transparency,Clarity,Reconciliation,‘Engagement and Communication,Environment
3,The regulatory regime in the NWT differs from ...,Clarity,Clarity,-1,Government (Fed/ Prov),"Need Guidance, Jurisdictional Alignment",0.220098,0.763581,0.128200,0.052468,0.155977
9,Greater collaboration with regulatory agencies...,"Clarity, Reconciliation",Clarity,-1,Indigenous,"Jurisdictional Alignment, Collaboration",0.083632,0.488245,0.425441,0.048705,0.304933
10,Greater collaboration with regulatory agencies...,"Clarity, Reconciliation",Reconciliation,-1,Indigenous,"Jurisdictional Alignment, Collaboration",0.083632,0.488245,0.425441,0.048705,0.304933
13,The CER (the Regulator) “is to have a Commissi...,Reconciliation,Reconciliation,-1,Indigenous,"Indigenous Participation, Co-Management/ Share...",0.085352,0.233866,0.905833,0.078459,0.145642
18,Very little is written on the abandonment proc...,"Clarity, Reconciliation, Regulatory Design",Clarity,-1,Indigenous,"Need Guidance, Indigenous Participation, Stand...",0.179753,0.409490,0.629886,0.204579,0.091971
...,...,...,...,...,...,...,...,...,...,...,...
171,The Regulations should force companies to foll...,"Transparency, Clarity, Competitiveness",Transparency,-1,Municipality,"Information Sharing, Jurisdictional Alignment,...",0.476459,0.717226,0.086535,0.106391,0.179006
172,The Regulations should force companies to foll...,"Transparency, Clarity, Competitiveness",Clarity,-1,Municipality,"Information Sharing, Jurisdictional Alignment,...",0.476459,0.717226,0.086535,0.106391,0.179006
173,Establish an event reporting guideline and rec...,"Clarity, Implementation, Competitiveness, Tran...",Clarity,-1,Municipality,"Need Guidance, Incident Reporting, Reporting R...",0.609995,0.768437,0.047927,0.189048,0.133339
174,Establish an event reporting guideline and rec...,"Clarity, Implementation, Competitiveness, Tran...",Transparency,-1,Municipality,"Need Guidance, Incident Reporting, Reporting R...",0.609995,0.768437,0.047927,0.189048,0.133339


In [88]:
# Compute the odds of being correct
correct_count = BERTOPIC_Minus_One['Status'].eq('Correct').sum()
incorrect_count = BERTOPIC_Minus_One['Status'].eq('Incorrect').sum()

odds_correct = correct_count / incorrect_count
display(f"Odds of being correct among -1: {odds_correct:.2f}")

'Odds of being correct among -1: 1.66'

### Contrast other BERTOPICs        

In [81]:
BERTOPIC_Others = concatenated_df[concatenated_df['bertopics'] != -1]


Unnamed: 0,raw_text,themes,theme,bertopics,group,tags,Transparency,Clarity,Reconciliation,‘Engagement and Communication,Environment
0,The OPR discussion paper has identified severa...,"Clarity, Emergency Management, Regulatory Desi...",Environment,19,Environmental NGO,"Jurisdictional Alignment, Education/ Training,...",0.572899,0.486326,0.204711,0.915882,0.075894
1,"In B.C., the Commission maintains a commitment...","Socio-Economic, Environment, Engagement and Co...",Environment,5,Government (Fed/ Prov),"GBA+, Reclamation, Engagement Mechanisms",0.126159,0.323551,0.774255,0.645659,0.204271
2,Transparency could be improved by ensuring spi...,"Emergency Management, Transparency",Transparency,18,Government (Fed/ Prov),"Emergency Response Planning, Information Sharing",0.687687,0.350315,0.102611,0.406552,0.209596
4,The CERs Discussion Paper (2022:2) states that...,"Reconciliation, Application Stage, Regulatory ...",Reconciliation,18,Indigenous,"Rights and Interests, Filing Manual, Improve C...",0.072783,0.133264,0.963143,0.172431,0.170011
5,The responses below are provided after interna...,Reconciliation,Reconciliation,18,Indigenous,Resources/ Capacity to Participate,0.321672,0.075185,0.899123,0.929961,0.093897
...,...,...,...,...,...,...,...,...,...,...,...
166,Enbridge provides the following recommendation...,"Transparency, Clarity, Reconciliation, Engagem...",Reconciliation,6,Industry,"Information Sharing, Need Guidance, Collaborat...",0.699832,0.721549,0.165723,0.728770,0.068718
167,TC Energy agrees with the CER that compliance ...,"Clarity, Audit/ Compliance Verification, Manag...",Reconciliation,6,Industry,"Need Guidance, Interpretation, Education/ Trai...",0.582004,0.863701,0.097214,0.216810,0.129790
168,Engagement options include:\n- workshops/lunch...,"Clarity, Engagement and Communication",Clarity,10,Industry,"Education/ Training, Engagement, Engagement Me...",0.377307,0.397951,0.171908,0.848016,0.081000
175,Provide an annual report to local authorities ...,"Competitiveness, Transparency",Transparency,18,Municipality,Reporting Requirements,0.603083,0.478975,0.081792,0.227720,0.204050


In [86]:
# Compute the odds of being correct
correct_count = BERTOPIC_Others['Status'].eq('Correct').sum()
incorrect_count = BERTOPIC_Others['Status'].eq('Incorrect').sum()

odds_correct = correct_count / incorrect_count
display(f"Odds of being correct among other bertopics: {odds_correct:.2f}")

'Odds of being correct among other bertopics: 1.06'