In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [2]:
from datasets import load_dataset

ds = load_dataset('Daye34/student_feedback_pattern_recognition_medium_summary')

In [3]:
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'student_name', 'feedback', 'type_of_feedback', 'feedback_source', 'summary'],
        num_rows: 7000
    })
    validation: Dataset({
        features: ['id', 'student_name', 'feedback', 'type_of_feedback', 'feedback_source', 'summary'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['id', 'student_name', 'feedback', 'type_of_feedback', 'feedback_source', 'summary'],
        num_rows: 1500
    })
})

In [4]:
def lower_feedback(examples, label='type_of_feedback'):
    examples[label] = examples[label].lower()
    return examples
def label_remap(examples, label='type_of_feedback'):
    if examples[label] == 'critical':
        examples[label] = 'negative'
    elif examples[label] == 'good':
        examples[label] = 'positive'
    else:
        examples[label] = 'neutral'
    return examples

In [5]:
dataset = ds.map(lower_feedback)
dataset = dataset.map(label_remap)

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [6]:
df = dataset['train'].to_pandas()
df.head()

Unnamed: 0,id,student_name,feedback,type_of_feedback,feedback_source,summary
0,2912,Denis,"[""I thoroughly enjoyed the pattern recognition...",positive,gpt-3.5,The pattern recognition course was enjoyable a...
1,7663,Zara,"[""The pattern recognition course was truly out...",positive,gpt-3.5,The pattern recognition course exceeded expect...
2,9375,Aino,['My experience with the recent course in patt...,neutral,gemini,The recent pattern recognition course provided...
3,5812,Denis,"[""I thoroughly enjoyed the pattern recognition...",positive,gpt-3.5,The pattern recognition course was enjoyable a...
4,8322,Faith,"[""Throughout the pattern recognition course, I...",negative,gpt-3.5,The pattern recognition course was challenging...


In [7]:
labels = set(dataset["train"]["type_of_feedback"])
labels

{'negative', 'neutral', 'positive'}

In [4]:
NUM_LABELS = len(labels)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=NUM_LABELS)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def tokenize_function(examples):
    return tokenizer(examples["feedback"], padding="max_length", truncation=True, max_length=256)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [6]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'student_name', 'feedback', 'type_of_feedback', 'feedback_source', 'summary', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7000
    })
    validation: Dataset({
        features: ['id', 'student_name', 'feedback', 'type_of_feedback', 'feedback_source', 'summary', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['id', 'student_name', 'feedback', 'type_of_feedback', 'feedback_source', 'summary', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1500
    })
})

In [7]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(dataset["train"]["type_of_feedback"])

def encode_labels(examples):
    examples["labels"] = label_encoder.transform(examples["type_of_feedback"])
    return examples

tokenized_dataset = tokenized_dataset.map(encode_labels, batched=True)

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [8]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

In [9]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)


In [10]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0001,0.000101
2,0.0001,3.9e-05
3,0.0,3e-05


TrainOutput(global_step=2625, training_loss=0.03740795269156695, metrics={'train_runtime': 152.022, 'train_samples_per_second': 138.138, 'train_steps_per_second': 17.267, 'total_flos': 2762690886144000.0, 'train_loss': 0.03740795269156695, 'epoch': 3.0})

In [11]:
eval_results = trainer.evaluate(tokenized_dataset["test"])
print(eval_results)

{'eval_loss': 3.0239818443078548e-05, 'eval_runtime': 2.6941, 'eval_samples_per_second': 556.779, 'eval_steps_per_second': 69.783, 'epoch': 3.0}


In [12]:
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

('./fine_tuned_model\\tokenizer_config.json',
 './fine_tuned_model\\special_tokens_map.json',
 './fine_tuned_model\\vocab.txt',
 './fine_tuned_model\\added_tokens.json',
 './fine_tuned_model\\tokenizer.json')

In [13]:
import pickle
from sklearn.preprocessing import LabelEncoder

In [18]:
list(set(dataset["train"]["type_of_feedback"]))

['negative', 'neutral', 'positive']

In [19]:
label_encoder = LabelEncoder()
label_encoder.fit(list(set(dataset["train"]["type_of_feedback"])))

In [20]:
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)