In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
print('Tensorflow version: ', tf.__version__)

In [None]:
# Colab-only step: mount Google Drive at /content/drive so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Location of the token-level NER table on the mounted Drive.
DATASET_PATH = '/content/drive/MyDrive/Mini ProjectRisk Msc DS/Main Project/NER/Dataset/NER dataset.csv'

df = pd.read_csv(DATASET_PATH, encoding='utf-8')
# Forward-fill empty cells with the last value seen above them in the same column.
# `DataFrame.ffill()` is the supported spelling; `fillna(method='ffill')` is deprecated.
# NOTE(review): presumably the sentence id is only filled on the first row of each
# sentence - confirm against the CSV.
df = df.ffill()
# Negative n: show every row except the last five.
df.head(-5)

In [None]:
# Number of distinct sentences, words and tags, in that order.
tuple(df[column].nunique() for column in ('Sentence Id', 'Word', 'Tag'))

In [None]:
from sklearn import preprocessing
# Encode every tag string as an integer id, then shift the ids up by one so that
# id 0 is never assigned to a real tag.
le = preprocessing.LabelEncoder()
df['ner_tags'] = le.fit_transform(df['Tag']) + 1

In [None]:
# How many tokens carry each encoded tag id.
tokens_per_tag = df.groupby('ner_tags').size()
tokens_per_tag.reset_index(name='counts')

In [None]:
class SentenceGetter:
    """Group a token-level frame into per-sentence lists of ``(word, tag id)`` pairs.

    The frame must provide the columns ``"Sentence Id"``, ``"Word"`` and
    ``"ner_tags"``.
    """

    def __init__(self, data):
        """Build the grouped view of ``data``.

        Args:
            data: DataFrame with one row per token.
        """
        # 1-based counter used by get_next() to build the next lookup key.
        self.n_sent = 1
        self.data = data
        # Not read anywhere inside this class.
        self.empty = False

        def pair_words_with_tags(group):
            # One (word, tag id) tuple per row of the sentence, in row order.
            return list(zip(group["Word"].values.tolist(),
                            group["ner_tags"].values.tolist()))

        # Series indexed by sentence id; each value is that sentence's list of pairs.
        self.grouped = self.data.groupby("Sentence Id").apply(pair_words_with_tags)
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the sentence stored under the current counter and advance it.

        The counter is formatted as a string before the lookup.

        Returns:
            The list of ``(word, tag id)`` pairs, or ``None`` when no sentence is
            stored under that key (the counter is left unchanged in that case).
        """
        key = "{}".format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            # Only a missing key means "no more sentences"; anything else
            # (e.g. KeyboardInterrupt) must not be swallowed.
            return None
        self.n_sent += 1
        return s

In [None]:
# Group the token table into sentences once; the result is exposed as `getter.sentences`.
getter = SentenceGetter(df)

In [None]:
# Keep only the word from every (word, tag id) pair, sentence by sentence.
sentences = [[word for word, _ in sentence] for sentence in getter.sentences]
sentences[0]

In [None]:
# Keep only the tag id from every (word, tag id) pair, sentence by sentence.
labels = [[tag_id for _, tag_id in sentence] for sentence in getter.sentences]
print(labels[0])

In [None]:
def split_tokens_and_labels(tokens_list, labels_list, max_length=75):
    """Split over-long sentences into consecutive chunks of at most ``max_length`` items.

    Args:
        tokens_list: list of token lists, one per sentence.
        labels_list: list of label lists, paired with ``tokens_list`` position by
            position.
        max_length: maximum number of tokens kept in one chunk; must be >= 1.

    Returns:
        ``(new_tokens_list, new_labels_list)``. Sentences no longer than
        ``max_length`` (including empty ones) are passed through unchanged;
        longer ones are replaced by their chunks, in order.

    Raises:
        ValueError: if ``max_length`` is smaller than 1 (chunking could never
            make progress).
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1, got {}".format(max_length))

    new_tokens_list = []
    new_labels_list = []

    for tokens, labels in zip(tokens_list, labels_list):
        if len(tokens) <= max_length:
            new_tokens_list.append(tokens)
            new_labels_list.append(labels)
            continue

        # Walk the sentence in steps of max_length; the last chunk may be shorter.
        for start in range(0, len(tokens), max_length):
            end = start + max_length
            new_tokens_list.append(tokens[start:end])
            new_labels_list.append(labels[start:end])

    return new_tokens_list, new_labels_list



In [None]:
# Cap every sentence at 75 words; longer sentences become several consecutive pieces.
new_tokens_list, new_labels_list = split_tokens_and_labels(sentences, labels, max_length=75)

### Preprocess the sentences and labels for use with PyTorch and BERT

In [None]:
pip install transformers

In [None]:
pip install pytorch-crf

In [None]:
# @title
from transformers import BertPreTrainedModel, BertModel
from transformers.modeling_outputs import  TokenClassifierOutput
from torch import nn
from torch.nn import CrossEntropyLoss
import torch
from torchcrf import CRF

In [None]:
# @title
class BertCRF(BertPreTrainedModel):
    """BERT encoder followed by a linear emission layer and a CRF for token tagging.

    ``forward`` returns the Viterbi-decoded tag ids (not raw scores) next to the
    optional loss.
    """

    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config):
        """Build the encoder, dropout, emission layer and CRF from ``config``."""
        super().__init__(config)
        self.num_labels = config.num_labels

        # The encoder is built without its pooling layer; only per-token states are used.
        self.bert = BertModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Per-token emission scores, one per label, consumed by the CRF.
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.crf = CRF(num_tags=config.num_labels, batch_first=True)
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        Run the encoder, score every token and decode the best tag sequence.

        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
            1]``.

        Returns:
            With ``return_dict`` truthy: the tuple ``(loss, tags)``, where ``loss`` is ``None`` when no labels
            are given. Otherwise ``(loss, tags, *extra_encoder_outputs)`` when labels are given and
            ``(tags, *extra_encoder_outputs)`` when they are not. ``tags`` is a ``torch.long`` tensor of decoded
            tag ids on the same device as the emission scores; ``loss`` is the negated CRF log-likelihood.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = self.dropout(outputs[0])
        logits = self.classifier(sequence_output)

        # NOTE(review): `attention_mask` is given to the encoder only; the CRF is called
        # without a mask, so it scores and decodes every position of the sequence.
        loss = None
        if labels is not None:
            # The CRF returns a log-likelihood; negate it to obtain a loss to minimise.
            loss = -self.crf(logits, labels)

        # decode() returns plain Python lists of tag ids. Build an integer tensor on the
        # model's device instead of the legacy float/CPU `torch.Tensor(list)` constructor.
        tags = torch.tensor(self.crf.decode(logits), dtype=torch.long, device=logits.device)

        if not return_dict:
            output = (tags,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return loss, tags

In [None]:
!pip install datasets
!pip install transformers[torch]
!pip install accelerate>=0.20.1

In [None]:
from datasets import Dataset
# Column-oriented layout for Dataset.from_dict: one list entry per (possibly split) sentence.
data = dict(tokens=new_tokens_list, ner_tags=new_labels_list)
dataset = Dataset.from_dict(data)

In [None]:
from sklearn.metrics import classification_report, f1_score
from transformers import BertTokenizerFast, Trainer, TrainingArguments,BertTokenizer
from transformers.trainer_utils import IntervalStrategy
from sklearn.model_selection import train_test_split


# Hold out 20% of the examples for evaluation; the fixed seed keeps the split repeatable.
train_columns, test_columns = train_test_split(dataset, test_size=0.2, random_state=2018)

# Rebuild Dataset objects from the split columns and expose the targets as `label_ids`.
train_dataset = Dataset.from_dict(train_columns).rename_column('ner_tags', 'label_ids')
test_dataset = Dataset.from_dict(test_columns).rename_column('ner_tags', 'label_ids')

In [None]:
# One label per class known to the fitted LabelEncoder, plus id 0, which the shifted
# tag ids never use. Derived from the encoder instead of hard-coding 26.
NUM_LABELS = len(le.classes_) + 1

model = BertCRF.from_pretrained('bert-base-cased', num_labels=NUM_LABELS)
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

In [None]:
def tokenize(batch):
    """Turn word-level tokens and tag ids into padded model inputs.

    Uses the module-level ``tokenizer``.

    Args:
        batch: mapping with ``'tokens'`` (a list of word lists) and ``'label_ids'``
            (a list of tag-id lists, paired word by word with ``'tokens'``).

    Returns:
        The result of ``tokenizer.pad`` holding ``input_ids``, ``token_type_ids``,
        ``attention_mask`` and ``label_ids``. Every sub-word piece carries the tag id
        of the word it came from; the positions of the two special tokens and of the
        padding carry label 0.
    """
    result = {
        'label_ids': [],
        'input_ids': [],
        'token_type_ids': [],
    }
    # `model_max_length` is the tokenizer's own limit; it replaces the per-checkpoint
    # `max_model_input_sizes['bert-base-cased']` lookup.
    max_length = tokenizer.model_max_length
    # Pieces that fit once one special token is added at each end of the sequence.
    max_pieces = max_length - 2

    for tokens, label in zip(batch['tokens'], batch['label_ids']):
        # `tokens` is a list of words, so this yields one list of piece ids per word.
        tokenids = tokenizer(tokens, add_special_tokens=False)

        token_ids = []
        label_ids = []
        for ids, lab in zip(tokenids['input_ids'], label):
            token_ids.extend(ids)
            # Every piece of a word gets that word's tag id.
            label_ids.extend([lab] * len(ids))

        # Keep the sequence within the model limit; previously nothing enforced it.
        token_ids = token_ids[:max_pieces]
        label_ids = label_ids[:max_pieces]

        token_type_ids = tokenizer.create_token_type_ids_from_sequences(token_ids)
        token_ids = tokenizer.build_inputs_with_special_tokens(token_ids)
        # Label 0 at the two special-token positions added above.
        label_ids = [0] + label_ids + [0]

        result['input_ids'].append(token_ids)
        result['label_ids'].append(label_ids)
        result['token_type_ids'].append(token_type_ids)

    result = tokenizer.pad(result, padding='longest', max_length=max_length, return_attention_mask=True, )
    # The labels are padded by hand, with 0, up to the padded input length.
    for i in range(len(result['input_ids'])):
        diff = len(result['input_ids'][i]) - len(result['label_ids'][i])
        result['label_ids'][i] += [0] * diff
    return result


# Each split is tokenized as one single batch (batch_size == split size).
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

# Return only these columns, as torch tensors, when the datasets are indexed.
TENSOR_COLUMNS = ['input_ids', 'token_type_ids', 'attention_mask', 'label_ids']
for tokenized_split in (train_dataset, test_dataset):
    tokenized_split.set_format('torch', columns=TENSOR_COLUMNS)


def compute_metrics(pred):
    """Compute the macro-averaged F1 over all positions and print a per-class report.

    Args:
        pred: object exposing ``label_ids`` and ``predictions`` arrays; both are
            flattened before scoring.

    Returns:
        A dict with the single key ``'f1'``.
    """
    # NOTE(review): nothing is filtered out here, so positions labelled 0 are scored
    # like any other class - confirm that this is intended.
    y_true = pred.label_ids.flatten()
    y_pred = pred.predictions.flatten()
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    print(classification_report(y_true, y_pred))
    return dict(f1=macro_f1)


## Training

In [None]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Optimisation settings. They are passed through TrainingArguments so that the Trainer
# really applies them. The previous version built an optimizer and scheduler by hand
# and only stored them as attributes on the model, which the Trainer never reads, so
# training silently ran with the Trainer defaults. The hand-built scheduler was also
# sized with `len(train_dataset) // 64 * 3` steps, which does not match a run of one
# epoch at batch size 16.
# NOTE(review): these values now take effect for the first time - re-check the
# resulting scores, in particular the warm-up length relative to the total step count.
learning_rate = 3e-5
weight_decay = 0.01
num_warmup_steps = 200

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,

    # The Trainer builds AdamW and a linear warm-up/decay schedule from these values
    # and sizes the schedule from the real number of training steps.
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    warmup_steps=num_warmup_steps,
    lr_scheduler_type="linear",

    # Save and evaluate once per epoch; both strategies must match for
    # load_best_model_at_end.
    # NOTE(review): newer transformers releases rename `evaluation_strategy` to
    # `eval_strategy` - confirm against the installed version.
    save_strategy=IntervalStrategy.EPOCH,
    evaluation_strategy=IntervalStrategy.EPOCH,
    logging_dir='./logs',

    # Keep the checkpoint with the lowest evaluation loss.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="tensorboard",
    run_name="my_experiment",
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # With a single epoch this patience can never be exhausted; it only matters if
    # num_train_epochs is raised.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)


trainer.train()

results = trainer.evaluate()
print(results)


# Test Data

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Number of test examples sent through the model per forward pass. Feeding the whole
# split at once needs memory proportional to the split size.
EVAL_BATCH_SIZE = 32

model.to(device)
# Inference must not depend on a previous call having left the model in eval mode.
model.eval()

tag_batches = []
label_batches = []
with torch.no_grad():
    for start in range(0, len(test_dataset), EVAL_BATCH_SIZE):
        batch = test_dataset[start:start + EVAL_BATCH_SIZE]
        outputs = model(
            batch['input_ids'].to(device),
            token_type_ids=None,
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['label_ids'].to(device),
        )
        # outputs is (loss, tags); index 1 holds the decoded tag ids.
        tag_batches.append(outputs[1].detach().cpu())
        label_batches.append(batch['label_ids'].cpu())

# `logits` keeps its historical name; it holds decoded tag ids, not raw scores.
logits = torch.cat(tag_batches).numpy()
label_ids = torch.cat(label_batches).numpy()

predictions = list(logits.flatten())
true_label = list(label_ids.flatten())

In [None]:
# Keep a position only when both the true id and the predicted id are non-zero.
# NOTE(review): a real token predicted as 0 is dropped here rather than counted as an
# error - confirm that this is the intended evaluation protocol.
kept_pairs = [
    (predicted, actual)
    for predicted, actual in zip(predictions, true_label)
    if actual != 0 and predicted != 0
]
pred_tags = [predicted for predicted, _ in kept_pairs]
true_test_tags = [actual for _, actual in kept_pairs]

In [None]:
# Subtract the offset of one that was added to the encoded tag ids.
pred_ = [int(tag_id - 1) for tag_id in pred_tags]
true_ = [int(tag_id - 1) for tag_id in true_test_tags]

In [None]:
# Map the integer ids back to their tag strings with the fitted LabelEncoder.
pred_, true_ = (le.inverse_transform(tag_ids) for tag_ids in (pred_, true_))

In [None]:
# Drop the first two characters of every tag except 'O'.
# NOTE(review): presumably this removes a BIO-style prefix such as "B-" / "I-" -
# confirm against the tag set.
p = [tag if tag == 'O' else tag[2:] for tag in pred_]
t = [tag if tag == 'O' else tag[2:] for tag in true_]

In [None]:
from sklearn.metrics import classification_report
# Per-class report on the full tag strings held in `true_` / `pred_`.
print(classification_report(true_, pred_))

In [None]:
from sklearn.metrics import classification_report
# Per-class report on the shortened tag strings held in `t` / `p`.
print(classification_report(t, p))

In [None]:
report = classification_report(t, p, digits=4, output_dict=True)

# Support-weighted averages across classes.
weighted_avg = report['weighted avg']
f1_weighted = weighted_avg['f1-score']
recall_weighted = weighted_avg['recall']
precision_weighted = weighted_avg['precision']

# Print the results
for caption, value in (
    ('Weighted F1 Score: ', f1_weighted),
    ('Weighted Recall: ', recall_weighted),
    ('Weighted Precision: ', precision_weighted),
):
    print(caption, value)

In [None]:
report = classification_report(t, p, digits=4, output_dict=True)

# Unweighted (macro) averages across classes. They get their own names: storing them
# in `f1_weighted` / `recall_weighted` / `precision_weighted` overwrote the
# support-weighted values under a misleading name.
macro_avg = report['macro avg']
f1_macro = macro_avg['f1-score']
recall_macro = macro_avg['recall']
precision_macro = macro_avg['precision']

# Print the results

print('Macro F1 Score: ', f1_macro)
print('Macro Recall: ', recall_macro)
print('Macro Precision: ', precision_macro)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
import pandas as pd

# Fix the class order explicitly so that matrix rows/columns and axis labels always
# agree. Labels taken from the predictions alone miss any class that appears only in
# the ground truth, which mislabels the axes or breaks the DataFrame construction.
class_names = sorted(set(t) | set(p))
cm_array = confusion_matrix(t, p, labels=class_names)

# Normalise each row (one row per actual class) to fractions. Rows without any true
# sample stay at 0 instead of becoming NaN through a division by zero.
row_totals = cm_array.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm_array,
    row_totals,
    out=np.zeros(cm_array.shape, dtype=float),
    where=row_totals != 0,
)
fig, ax = plt.subplots(figsize=(10, 10))

cm_array_df = pd.DataFrame(cm_normalized, index=class_names, columns=class_names)
annot_font_size = 14
annot_kws = {'size': annot_font_size}

sns.heatmap(cm_array_df, annot=True, cbar=False, fmt='.2f', cmap='Blues', ax=ax, annot_kws=annot_kws)
ax.set_xlabel('Predicted', fontsize=15)
ax.set_ylabel('Actual', fontsize=15)
ax.set_xticklabels(ax.get_xticklabels(), fontsize=15)
ax.set_yticklabels(ax.get_yticklabels(), fontsize=15)
plt.show()


# Case Study

In [None]:
# One id per class known to the fitted LabelEncoder (derived, not hard-coded to 25).
lis = list(range(len(le.classes_)))
id2label = list(le.inverse_transform(lis))
# Model ids are shifted by one relative to the encoder ids; id 0 is shown as '[PAD]'.
id2label.insert(0, '[PAD]')

In [None]:
# Checkpoint directory written during training; the step number in the name is fixed here.
CHECKPOINT_DIR = './results/checkpoint-343'

model1 = BertCRF.from_pretrained(CHECKPOINT_DIR, num_labels=26)
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

def decode(label_ids, input_ids, offsets_mapping, id2label):
    """Merge word pieces back into words and attach one label to each word.

    Uses the module-level ``tokenizer`` to turn ids into token strings.

    Args:
        label_ids: per-sequence lists of predicted label ids.
        input_ids: per-sequence lists of token ids, aligned with ``label_ids``.
        offsets_mapping: per-sequence lists of ``(start, end)`` character offsets,
            aligned with ``input_ids``.
        id2label: sequence mapping a label id to its label string.

    Returns:
        One dict per sequence with the keys ``'words'`` and ``'labels'``. A word is
        labelled with the label of its first piece.
    """
    result = []
    for k, sequence_labels in enumerate(label_ids):
        words = []
        labels = []
        for i, label_id in enumerate(sequence_labels):
            # Read the offsets from the argument; the previous code ignored the
            # parameter and silently depended on a global named `offset_mapping`.
            start_ind, end_ind = offsets_mapping[k][i]
            word = tokenizer.convert_ids_to_tokens([int(input_ids[k][i])])[0]
            # A token whose text length differs from its character span is not a
            # standalone word.
            is_subword = end_ind - start_ind != len(word)
            if is_subword:
                # '##' continuation pieces are glued onto the previous word; any other
                # mismatching token is skipped.
                if word.startswith('##'):
                    words[-1] += word[2:]
            else:
                words.append(word)
                labels.append(id2label[int(label_id)])
        result.append(
            {'words': words,
             'labels': labels}
        )
    return result


# Free-text example to tag with the fine-tuned model.
corpus = ['In October 2022, we identified an active infection of government, agriculture\
 and transportation organizations located in the Donetsk, Lugansk, and Crimea regions. Although the initial vector of compromise is unclear, the details of the next stage imply the use of spear phishing or similar methods.\
 The victims navigated to a URL pointing to a ZIP archive hosted on a malicious web server.'
]

inputs = tokenizer(corpus, max_length=512, padding=True, truncation=True, return_tensors='pt',
                   return_offsets_mapping=True)
# The offsets are needed for decoding only; they are removed before the model call.
offset_mapping = inputs.pop("offset_mapping").cpu().numpy().tolist()

# Pure inference: make evaluation mode explicit and skip gradient tracking.
model1.eval()
with torch.no_grad():
    outputs = model1(**inputs)

# outputs is (loss, tags); index 1 holds the decoded tag ids.
result = decode(outputs[1].numpy().tolist(), inputs['input_ids'].numpy().tolist(), offset_mapping, id2label)
for token, label in zip(result[0]['words'], result[0]['labels']):
    print("{:20}\t{}".format(token, label))