In [5]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [6]:
from datasets import load_dataset

dataset = load_dataset('Daye34/student_feedback_pattern_recognition_medium_summary')

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'student_name', 'feedback', 'type_of_feedback', 'feedback_source', 'summary'],
        num_rows: 7000
    })
    validation: Dataset({
        features: ['id', 'student_name', 'feedback', 'type_of_feedback', 'feedback_source', 'summary'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['id', 'student_name', 'feedback', 'type_of_feedback', 'feedback_source', 'summary'],
        num_rows: 1500
    })
})

In [8]:
NUM_LABELS = len(set(dataset["train"]["type_of_feedback"]))
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=NUM_LABELS)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
def tokenize_function(examples):
    return tokenizer(examples["feedback"], padding="max_length", truncation=True, max_length=256)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [10]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'student_name', 'feedback', 'type_of_feedback', 'feedback_source', 'summary', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7000
    })
    validation: Dataset({
        features: ['id', 'student_name', 'feedback', 'type_of_feedback', 'feedback_source', 'summary', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['id', 'student_name', 'feedback', 'type_of_feedback', 'feedback_source', 'summary', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1500
    })
})

In [11]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(dataset["train"]["type_of_feedback"])

def encode_labels(examples):
    examples["labels"] = label_encoder.transform(examples["type_of_feedback"])
    return examples

tokenized_dataset = tokenized_dataset.map(encode_labels, batched=True)

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [12]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

In [13]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)


In [14]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0003,0.000217
2,0.0001,8.7e-05
3,0.0001,6.7e-05


TrainOutput(global_step=2625, training_loss=0.05144738943334891, metrics={'train_runtime': 4116.264, 'train_samples_per_second': 5.102, 'train_steps_per_second': 0.638, 'total_flos': 2762715691008000.0, 'train_loss': 0.05144738943334891, 'epoch': 3.0})

In [15]:
eval_results = trainer.evaluate(tokenized_dataset["test"])
print(eval_results)

{'eval_loss': 6.883285823278129e-05, 'eval_runtime': 55.9181, 'eval_samples_per_second': 26.825, 'eval_steps_per_second': 3.362, 'epoch': 3.0}


In [16]:
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.txt',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')

In [17]:
import pickle
from sklearn.preprocessing import LabelEncoder

In [22]:
list(set(dataset["train"]["type_of_feedback"]))

['balanced', 'critical', 'good', 'Good']

In [23]:
label_encoder = LabelEncoder()
label_encoder.fit(list(set(dataset["train"]["type_of_feedback"])))

In [24]:
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)