In [None]:
import os
import pandas as pd
import import_ipynb
from prep_data import getOntologyData, create_processedDF
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from transformers import pipeline
import huggingface_hub
from dotenv import load_dotenv

In [None]:
def loginHuggingFace():
    """Authenticate this session with the Hugging Face Hub.

    Loads variables from a ``.env`` file via ``load_dotenv()`` and reads the
    access token from the ``HUGGINGFACE_APIKEY`` environment variable.

    The login goes through the ``huggingface_hub`` Python API instead of a
    ``!huggingface-cli`` shell line, so the token is never interpolated into a
    command line and the function is plain Python (no IPython magic needed).

    If the variable is missing or empty, a warning is issued and no login is
    attempted; any previously stored credentials are left untouched.
    """
    import warnings

    load_dotenv()
    HUGGINGFACE_APIKEY = os.getenv('HUGGINGFACE_APIKEY')
    if not HUGGINGFACE_APIKEY:
        # Best effort: the original shell call would have passed the literal
        # string "None" as the token. Skip the login and say why instead.
        warnings.warn(
            "HUGGINGFACE_APIKEY is not set; skipping Hugging Face login.",
            stacklevel=2,
        )
        return
    huggingface_hub.login(token=HUGGINGFACE_APIKEY)

In [None]:
%%capture
# The cell magic above must stay on the first line of the cell; it captures
# everything printed during login so none of it appears in the saved notebook.
loginHuggingFace()

In [None]:
# Load the ontology data from ./Ontology/d3fend.ttl using the helper imported
# from prep_data. The result is passed on to create_processedDF.
data_df = getOntologyData("./Ontology/d3fend.ttl")

In [None]:
# Build the modelling table from the loaded ontology data. Later cells read
# its 'classA', 'classB' and 'label' columns.
processed_df = create_processedDF(data_df)
# Bare last expression: shows the available column names as the cell output.
processed_df.columns

In [None]:
# Combine each pair into one model input string: classA, the literal
# " [SEP] " separator, then classB. Stored as a new 'text' column.
processed_df['text'] = processed_df['classA'] + " [SEP] " + processed_df['classB']

In [None]:
# Hold out 20% of the pairs for evaluation. A fixed random_state makes the
# split -- and therefore the pushed dataset and the reported metrics --
# reproducible across kernel restarts.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    processed_df['text'].tolist(),
    processed_df['label'].tolist(),
    test_size=0.2,
    random_state=42,
)

In [None]:
# Tokenizer for the same "bert-base-uncased" checkpoint that the model cell
# loads. It is used by tokenize_function and passed to Trainer and pipeline.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
def tokenize_function(texts):
    """Encode ``texts`` with the notebook-level ``tokenizer``.

    Sequences are padded to ``max_length`` and truncated, with
    ``max_length`` fixed at 128.

    Args:
        texts: the input passed straight through to ``tokenizer``.

    Returns:
        Whatever ``tokenizer(...)`` returns for these options.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(texts, **encode_options)

In [None]:
# Tokenize the training split first, then the test split, with identical settings.
train_encodings, test_encodings = map(tokenize_function, (train_texts, test_texts))

In [None]:
# Each split keeps the raw text and label alongside the tokenizer output.
train_dataset = Dataset.from_dict(
    {"text": train_texts, "label": train_labels, **train_encodings}
)
test_dataset = Dataset.from_dict(
    {"text": test_texts, "label": test_labels, **test_encodings}
)

# Publish both splits to the same Hub dataset repository, train first.
hub_dataset_repo = "OhWayTee/hierarchy-pairs"
for split_name, split_dataset in (("train", train_dataset), ("test", test_dataset)):
    split_dataset.push_to_hub(hub_dataset_repo, split=split_name)

In [None]:
# Load "bert-base-uncased" with a sequence-classification head sized for
# three labels (num_labels=3).
# NOTE(review): presumably matches the distinct values in processed_df['label'] -- confirm.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

In [None]:
# Hyperparameters and output locations for the fine-tuning run.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",  # where the Trainer writes its outputs
    evaluation_strategy="epoch",  # evaluate once per epoch
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,  # log every 10 steps
    save_total_limit=2,  # cap on the number of checkpoints kept
)


In [None]:
# Wire the model, the training arguments and both dataset splits into a Trainer.
# The held-out test split doubles as the evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)

In [None]:
# Run fine-tuning with the settings in training_args (long-running cell).
trainer.train()

In [None]:
# Path the text-classification pipeline loads the model from.
local_model_path = "./bert-taxonomy-model"
# Value handed to model.push_to_hub(); currently the same string as the local path.
upload_model_path = "./bert-taxonomy-model"

In [None]:
# Save the fine-tuned model to local_model_path explicitly. The pipeline cell
# loads from that directory, and no visible cell otherwise creates it.
model.save_pretrained(local_model_path)

# The Hub expects a repository name, not a filesystem path, so publish under
# the directory's base name ("./bert-taxonomy-model" -> "bert-taxonomy-model").
model.push_to_hub(os.path.basename(os.path.normpath(upload_model_path)))

In [None]:
# Smoke test: load the model from local_model_path and classify one
# hand-written pair, formatted like the training text column.
classifier = pipeline(
    task="text-classification",
    model=local_model_path,
    tokenizer=tokenizer,
)
sample_pair = "Network Security [SEP] Firewall"
prediction = classifier(sample_pair)
print(prediction)

In [None]:
# Final evaluation on the eval_dataset given to the Trainer (the test split),
# then print the returned metrics.
eval_results = trainer.evaluate()
print(eval_results)