In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import pipeline
from datasets import Dataset
from sklearn.calibration import LabelEncoder
from transformers import TrainingArguments, Trainer

In [None]:
from sklearn.calibration import LabelEncoder

df = pd.read_excel('filtered_labeled_sandbox_errors.xlsx')

# Filter the data to keep only rows where 'error_category' is not null
df_labeled = df.dropna(subset=['error_category'])

# Combine 'module type' and 'Error' columns into a single 'text' column
df_labeled['text'] = df_labeled['module type'] + '-' + df_labeled['Error']

# Convert the pandas DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(df_labeled[['text', 'error_category']])

print(train_dataset.shape)

# Encode the labels to integers if they are strings
label_encoder = LabelEncoder()
df_labeled['encoded_labels'] = label_encoder.fit_transform(df_labeled['error_category'])

# Update the dataset with encoded labels
train_dataset = Dataset.from_pandas(df_labeled[['text', 'encoded_labels']])

(21, 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_labeled['text'] = df_labeled['module type'] + '-' + df_labeled['Error']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_labeled['encoded_labels'] = label_encoder.fit_transform(df_labeled['error_category'])


In [40]:
from transformers import DistilBertTokenizer

# Load the tokenizer for DistilBERT
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)

# Remove the original 'text' column since it's no longer needed after tokenization
train_dataset = train_dataset.remove_columns(["text"])
train_dataset = train_dataset.rename_column("encoded_labels", "labels")

# Set the format for PyTorch, so we can use it for training
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/21 [00:00<?, ? examples/s]

In [41]:
from transformers import DistilBertForSequenceClassification

# Load the pre-trained DistilBERT model with a classification head
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=len(label_encoder.classes_)
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    no_cuda=True  # Ensure CUDA is not causing issues
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                         # The model to train
    args=training_args,                  # Training arguments
    train_dataset=train_dataset,         # The training dataset (entire labeled dataset)
    tokenizer=tokenizer,                 # The tokenizer
)

  trainer = Trainer(


#### Train the model 

In [43]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=9, training_loss=1.4071091545952692, metrics={'train_runtime': 26.9505, 'train_samples_per_second': 2.338, 'train_steps_per_second': 0.334, 'total_flos': 8345743773696.0, 'train_loss': 1.4071091545952692, 'epoch': 3.0})

In [44]:
trainer.save_model("./fineTuned-distilbert-error-classifier")

In [45]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import pandas as pd
from sklearn.preprocessing import LabelEncoder

model = DistilBertForSequenceClassification.from_pretrained("./fineTuned-distilbert-error-classifier")
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# label_encoder = LabelEncoder()
label_encoder.fit([
	"UserModule - Undefined method id",
	"WorkspacePermission - Could not find agent without an ID",
	"BusinessObject - Standard error",
	"Missing configuration",
	"Mysql gone error",
	"Validation error",
	"Unknown error"
])  # Add your actual categories here

# Help me to predict the error category
def predict_error_category(text):
    # Tokenize the text
    inputs = tokenizer(text, return_tensors="pt")

    # Perform a forward pass of the model
    outputs = model(**inputs)

    # Get the predicted class for each example
    predicted_class = torch.argmax(outputs.logits, dim=1)

    # Convert the predicted class to the actual label
    predicted_label = label_encoder.inverse_transform(predicted_class)

    return predicted_label[0]

# # Put the actual error_category in the 'error_category' column
df['text'] = df['module type'] + '-' + df['Error']
df['error_category'] = df['error_category'].fillna(df['text'].apply(predict_error_category))


In [46]:
df.head()

Unnamed: 0,account id,module type,Error,error_category,text
0,708664,WorkspacePermissionConfiguration,"""{\""import_exceptions\"":\""Import config set it...",WorkspacePermission - Could not find agent wit...,"WorkspacePermissionConfiguration-""{\""import_ex..."
1,708687,WorkspacePermissionConfiguration,"""{\""import_exceptions\"":\""Import config set it...",WorkspacePermission - Could not find agent wit...,"WorkspacePermissionConfiguration-""{\""import_ex..."
2,708774,WorkspacePermissionConfiguration,"""{\""import_exceptions\"":\""Import config set it...",WorkspacePermission - Could not find agent wit...,"WorkspacePermissionConfiguration-""{\""import_ex..."
3,708664,UserConfiguration,"""{\""import_exceptions\"":\""Import config set it...",UserModule - Undefined method id,"UserConfiguration-""{\""import_exceptions\"":\""Im..."
4,708774,WorkspacePermissionConfiguration,"""{\""import_exceptions\"":\""Import config set it...",WorkspacePermission - Could not find agent wit...,"WorkspacePermissionConfiguration-""{\""import_ex..."


In [47]:
df.to_excel('predicted_error_category.xlsx', index=False)