In [2]:
!pip install datasets transformers pandas evaluate

# Step 2: Import Libraries
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import pandas as pd
from google.colab import files

# Step 3: Upload Training Data
print("Please upload your training dataset (CSV file).")
uploaded_train = files.upload()

# Load the uploaded training data
train_file_name = list(uploaded_train.keys())[0]
print(f"Training data file '{train_file_name}' uploaded.")
train_data = pd.read_csv(train_file_name)

# Step 4: Preprocess Training Data
# Filter columns and handle missing values
train_data = train_data[['def_text', 'VesselGroup', 'age', 'predicted_severity']].dropna()

# Map severity levels to numerical values
severity_mapping = {"Low": 0, "Medium": 1, "High": 2, "Not a deficiency": 3}
train_data['labels'] = train_data['predicted_severity'].map(severity_mapping)

# Convert to Hugging Face dataset
dataset = Dataset.from_pandas(train_data)

# Tokenizer Setup
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# DistilBERT Model is used due to its efficiency and lower resource requirement while still being able complete tasks at a high performance
def tokenize_function(example):
    return tokenizer(
        example["def_text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Step 5: Split Dataset for Training and Evaluation
split_dataset = tokenized_dataset.train_test_split(test_size=0.2)
train_data = split_dataset["train"]
eval_data = split_dataset["test"]

# Step 6: Load Pretrained Model for Sequence Classification
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(severity_mapping)
)

# Step 7: Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",           # Directory for saving model checkpoints
    evaluation_strategy="steps",     # Save and evaluate periodically
    save_strategy="steps",
    save_steps=500,                  # Save every 500 steps
    learning_rate=2e-5,              # Learning rate
    per_device_train_batch_size=16,  # Batch size
    num_train_epochs=5,              # Number of epochs
    logging_dir="./logs",            # Logging directory
    logging_steps=50,                # Log every 50 steps
    load_best_model_at_end=True      # Automatically load the best model
)

# Step 8: Define Metrics
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Step 9: Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Step 10: Train the Model
trainer.train()

# Step 11: Save the Model
trainer.save_model("./trained_model")
tokenizer.save_pretrained("./trained_model")

print("Model training complete. Model and tokenizer saved to './trained_model'.")

# Step 12: Upload Test Data
print("Please upload your test dataset (CSV file).")
uploaded_test = files.upload()

# Load the uploaded test data
test_file_name = list(uploaded_test.keys())[0]
print(f"Test data file '{test_file_name}' uploaded.")
test_data = pd.read_csv(test_file_name)

# Process Test Data: Tokenize and Predict
test_data['def_text'] = test_data['def_text'].fillna("")  # Ensure no missing values in 'def_text'

# Tokenize the test data
test_dataset = Dataset.from_pandas(test_data)

tokenized_test = test_dataset.map(
    lambda x: tokenizer(x['def_text'], padding="max_length", truncation=True, max_length=128),
    batched=True
)

# Generate Predictions
predictions = trainer.predict(tokenized_test)

# Convert predictions to severity levels
predicted_classes = predictions.predictions.argmax(axis=-1)
predicted_severity = [list(severity_mapping.keys())[cls] for cls in predicted_classes]


# Add predicted_severity to the test data
test_data['predicted_severity'] = predicted_severity

# Keep only requested columns
output_test_data = test_data[['PscInspectionId', 'deficiency_code', 'predicted_severity']]

# Save the modified test data
output_file_name = "modified_test_data.csv"
output_test_data.to_csv(output_file_name, index=False)
print(f"Modified test data saved to '{output_file_name}'.")

# Step 13: Download the Modified Test Data
files.download(output_file_name)

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m9

Saving resolved_predicted_severity.csv to resolved_predicted_severity.csv
Training data file 'resolved_predicted_severity.csv' uploaded.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/17970 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss,Validation Loss,Accuracy
50,1.1725,1.099629,0.424597
100,1.0829,1.055045,0.432666
150,1.0794,1.020338,0.483306
200,1.0614,1.017518,0.473845
250,0.9894,0.978595,0.520033
300,0.9922,0.94082,0.543406
350,0.9656,0.928896,0.55537
400,0.9216,0.878898,0.606288
450,0.9106,0.858101,0.621035
500,0.9125,0.825141,0.640234


Could not locate the best model at ./results/checkpoint-4350/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


Model training complete. Model and tokenizer saved to './trained_model'.
Please upload your test dataset (CSV file).


Saving psc_severity_test.csv to psc_severity_test.csv
Test data file 'psc_severity_test.csv' uploaded.


Map:   0%|          | 0/1101 [00:00<?, ? examples/s]

Modified test data saved to 'modified_test_data.csv'.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>