In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import (
    DistilBertTokenizer,
    TFBertForSequenceClassification,
    TFDistilBertForSequenceClassification,
)

# Warn up front when TensorFlow cannot see a GPU; the notebook still runs on CPU.
gpu_devices = tf.config.list_physical_devices('GPU')
if not gpu_devices:
    print("No GPU detected")

In [None]:
# Pretrained checkpoint used for both the tokenizer and the classifier.
modelName = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(modelName)
# The checkpoint is a DistilBERT one, so it must be loaded with the DistilBERT
# model class. Loading it through the BERT class does not map the pretrained
# weights onto the network. num_labels=3 sizes the classification head for the
# three ticket subtypes.
model = TFDistilBertForSequenceClassification.from_pretrained(modelName, num_labels=3)

In [None]:
# Load the labelled examples and lower-case the text column.
df = pd.read_csv('trainingData2.csv').assign(Text=lambda frame: frame['Text'].str.lower())
text, labels = df['Text'].to_list(), df['Label'].to_list()

# 80% train, then the remaining 20% is halved into validation and test.
# The fixed random_state keeps the split reproducible across runs.
text_train, text_temp, labels_train, labels_temp = train_test_split(
    text, labels, test_size=0.2, random_state=7
)
text_val, text_test, labels_val, labels_test = train_test_split(
    text_temp, labels_temp, test_size=0.5, random_state=7
)

In [None]:
batchSize = 32
# Number of *full* batches in the train and validation splits
# (floor division: a trailing partial batch is not counted).
steps, val_steps = (len(split) // batchSize for split in (text_train, text_val))

In [None]:
# Tokenize the whole corpus once, untruncated, purely to measure token counts.
all_encodings = tokenizer(text, truncation=False)
seq_len = [len(input_ids) for input_ids in all_encodings["input_ids"]]
# The tokenizer adds the special tokens by default, so the measured lengths
# already include them and no extra "+ 2" is needed. Cap at the tokenizer's
# model_max_length so a single very long text cannot produce a padding length
# the model is unable to accept.
max_length = min(max(seq_len), tokenizer.model_max_length)

In [None]:
def build_dataset(texts, split_labels, shuffle=False):
    """Tokenize one data split and wrap it in a batched tf.data.Dataset.

    Args:
        texts: list of input strings for the split.
        split_labels: list of labels aligned with ``texts``.
        shuffle: when True, shuffle with a buffer covering the whole split
            before batching. Only useful for the training split.

    Returns:
        Tuple ``(encodings, dataset)``: the tokenizer output and a dataset
        yielding ``(features_dict, labels)`` batches of ``batchSize`` items.
    """
    encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=max_length)
    dataset = tf.data.Dataset.from_tensor_slices((dict(encodings), split_labels))
    if shuffle:
        dataset = dataset.shuffle(len(texts))
    return encodings, dataset.batch(batchSize)


# Only the training split is shuffled; evaluation results do not depend on
# example order, so validation and test keep a stable order.
train_encodings, train_dataset = build_dataset(text_train, labels_train, shuffle=True)
val_encodings, val_dataset = build_dataset(text_val, labels_val)
# Each split now gets its own encodings variable (the previous chained
# assignment overwrote val_encodings with the test encodings).
test_encodings, test_dataset = build_dataset(text_test, labels_test)

In [None]:
# Per-class weights, inversely proportional to class frequency:
#   weight = total_samples / (number_of_classes * samples_in_class)
total_samples = len(labels_train)
present_labels, label_counts = np.unique(labels_train, return_counts=True)
class_weights = {
    label: total_samples / (len(present_labels) * count)
    for label, count in zip(present_labels, label_counts)
}

In [None]:
# Fine-tuning setup: Adam with a small learning rate and a sparse categorical
# cross-entropy loss computed on raw logits.
optimizer = tf.keras.optimizers.Adam(learning_rate=4e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,  # Number of epochs with no improvement after which training will be stopped
    restore_best_weights=True,
)
model.compile(optimizer=optimizer, loss=loss)

# steps_per_epoch / validation_steps are deliberately not passed. The datasets
# are finite and not repeated, so Keras iterates each one to exhaustion every
# epoch. Forcing len // batchSize steps skips the last partial batch and, when
# the count does not match the dataset's real number of batches, can leave the
# iterator exhausted after the first epoch ("input ran out of data").
history = model.fit(
    train_dataset,
    epochs=20,
    validation_data=val_dataset,
    class_weight=class_weights,
    verbose=1,
    callbacks=[early_stopping],
)
model.evaluate(test_dataset)

In [None]:
# Persist the fine-tuned model under this name, then write the per-epoch
# training history next to it as a CSV.
out_file = "OSITv6"
model.save_pretrained(out_file)

history_df = pd.DataFrame(history.history)
history_csv = f'training_history_{out_file}.csv'
history_df.to_csv(history_csv, index=False)

In [None]:
# TODO: don't just hardcode the subtypes
# Class id -> human-readable subtype name.
subtypeMapping = {
    0: "Install or New",
    1: "Service Request",
    2: "Incident",
}


def getSTId(subtype):
    """Return the id whose mapped name equals ``subtype``, or None if absent."""
    return next(
        (st_id for st_id, name in subtypeMapping.items() if name == subtype),
        None,
    )


def getSubtype(STId):
    """Return the subtype name mapped to ``STId``, or None if the id is unknown."""
    return next(
        (name for st_id, name in subtypeMapping.items() if st_id == STId),
        None,
    )

In [None]:
# Classify ad-hoc text with the fine-tuned model.
new_text = ["I think the loss is too high here"]
# Truncate so an over-long input cannot exceed the model's maximum length.
new_encodings = tokenizer(new_text, truncation=True, padding=True)
new_dataset = tf.data.Dataset.from_tensor_slices(dict(new_encodings)).batch(2)
prediction = model.predict(new_dataset)
# prediction[0] is presumably the logits, one row per input text (confirm
# against the installed transformers version). Taking the argmax along the last
# axis yields one class id per text instead of a single index into the
# flattened array, which was only correct for a single input.
predicted_labels = np.argmax(prediction[0], axis=-1)
for label_id in predicted_labels:
    print(getSubtype(label_id))

In [None]:
# Evaluate on the held-out test split again (the same call also runs at the end
# of the training cell); the returned value is displayed as this cell's output.
model.evaluate(test_dataset)