In [2]:
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import torch
from transformers import TrainingArguments, Trainer,BertForSequenceClassification
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
import numpy as np

## Problem 1

In [4]:
### PROBLEM 1
# Load the pretrained tokenizer for the MiniLM checkpoint. It is kept in the
# notebook-level `tokenizer` variable, which tokenize_word (defined right below) reads.
tokenizer=AutoTokenizer.from_pretrained("microsoft/MiniLM-L12-H384-uncased")
def tokenize_word(word):
    """Tokenize ``word`` with the notebook-level ``tokenizer``.

    Every produced token is printed on its own line as a side effect.

    Args:
        word: text passed to ``tokenizer.tokenize``.

    Returns:
        A ``(tokens, token_ids)`` pair: the tokens returned by
        ``tokenizer.tokenize`` and the result of
        ``tokenizer.convert_tokens_to_ids`` for those tokens.
    """
    pieces = tokenizer.tokenize(word)
    # Echo the tokens before converting them, one per line.
    for piece in pieces:
        print(piece)
    return pieces, tokenizer.convert_tokens_to_ids(pieces)

In [14]:
# Try tokenize_word on a single word; the returned (tokens, token_ids) pair
# is the value displayed by this cell.
tokenize_word('rude')

rude


(['rude'], [12726])

In [19]:
# Read the training and test splits from CSV, then preview the first test rows.
df_train = pd.read_csv('data/hw4_train.csv')
df_test = pd.read_csv('data/hw4_test.csv')
df_test.head()

Unnamed: 0,id,comment_text,toxic
0,0001ea8717f6de06,Thank you for understanding. I think very high...,0
1,000247e83dcc1211,:Dear god this site is horrible.,0
2,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0
3,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0
4,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0


In [20]:
class Example:
    """One comment record: its identifier, its raw text and an optional label."""

    def __init__(self, id, comment_text, label=None):
        # `id` shadows the builtin only inside this constructor; the parameter
        # name is kept because callers may pass it by keyword.
        self.id, self.comment_text, self.label = id, comment_text, label

In [21]:
def to_input(df):
    """Convert a comment DataFrame into a list of ``Example`` objects.

    Args:
        df: DataFrame providing the ``id``, ``comment_text`` and ``toxic``
            columns.

    Returns:
        A list with one ``Example`` per row, in row order; the ``toxic``
        value of each row becomes the example's label.

    Raises:
        KeyError: if one of the three columns is missing from ``df``.
    """
    # Walk the three needed columns in parallel instead of df.iterrows():
    # iterrows builds a full Series for every row, which is slow on large
    # frames and does not preserve per-column dtypes.
    columns = (
        df['id'].tolist(),
        df['comment_text'].tolist(),
        df['toxic'].tolist(),
    )
    return [
        Example(comment_id, comment_text, label)
        for comment_id, comment_text, label in zip(*columns)
    ]

def to_sequence(input_list,tokenzier,max_length=128):
    """Tokenize the text of every example and collect the matching labels.

    Args:
        input_list: iterable of objects exposing ``comment_text`` and
            ``label`` attributes.
        tokenzier: tokenizer callable, invoked as
            ``tokenzier(texts, padding=True, truncation=True, max_length=max_length)``.
            The misspelled parameter name is kept so that existing keyword
            callers keep working.
        max_length: forwarded to the tokenizer as ``max_length``.

    Returns:
        ``(encodings, labels)``: whatever the tokenizer returned for the list
        of texts, and the list of labels in the same order.
    """
    texts = []
    labels = []
    # Single pass so that one-shot iterables (generators) are supported too.
    for example in input_list:
        texts.append(example.comment_text)
        labels.append(example.label)

    # Use the tokenizer that was passed in. Previously the argument was ignored
    # and the notebook-level `tokenizer` global was used instead, so passing a
    # different tokenizer had no effect.
    encodings = tokenzier(texts, padding=True, truncation=True, max_length=max_length)

    return encodings, labels

In [22]:
class Dataset(torch.utils.data.Dataset):
    """Torch dataset over tokenizer encodings with optional labels.

    Args:
        encodings: mapping from field name to a per-example sequence; it must
            support ``.items()`` and contain an ``"input_ids"`` entry, which
            defines the dataset length.
        labels: optional per-example labels (list, numpy array, tensor, ...).
            When ``None`` or empty, items carry no ``"labels"`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def _has_labels(self):
        # Explicit None/length check instead of `if self.labels:`. Truthiness
        # of a numpy array or multi-element tensor raises, while None and
        # empty containers must still mean "no labels".
        return self.labels is not None and len(self.labels) > 0

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx`` (plus ``"labels"`` if available)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self._has_labels():
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` field."""
        return len(self.encodings["input_ids"])

In [23]:
# Training split: rows -> Example objects -> tokenizer encodings + labels -> torch Dataset.
train_input_list = to_input(df_train)
X_train, Y_train = to_sequence(train_input_list, tokenizer, max_length=128)
train_dataset = Dataset(X_train, Y_train)

# The test split goes through the same three steps with the same max_length.
test_input_list = to_input(df_test)
X_test, Y_test = to_sequence(test_input_list, tokenizer, max_length=128)
test_dataset = Dataset(X_test, Y_test)

In [24]:
# torch.cuda.is_available has to be *called*: the bare function object is always
# truthy, so the previous expression picked "cuda" even on machines without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# MiniLM checkpoint loaded with a sequence-classification head (num_labels=2),
# then moved to the selected device.
model = BertForSequenceClassification.from_pretrained("microsoft/MiniLM-L12-H384-uncased", num_labels=2)
model = model.to(device)


loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingface/transformers/ceb753d3f27a8c0d09184f35884666cda91b8ae610cd2a54d89793ac7663f1f9.13815020fd994b27db9974c0ce0ec4c47dfac6c8f11bf1a35a0a06d5b165665a
Model config BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/trans

In [29]:
def compute_metrics(p):
    """Compute classification metrics from a ``(predictions, labels)`` pair.

    Args:
        p: two-element sequence; the first element holds per-class scores
            (argmax is taken along axis 1), the second the true labels.

    Returns:
        Dict with the keys ``"accuracy"``, ``"recall"``, ``"precision"`` and
        ``"f1"``.
    """
    scores, y_true = p
    y_pred = np.argmax(scores, axis=1)

    # NOTE(review): F1 is macro-averaged while recall and precision use the
    # scorers' default averaging, so "f1" is not the harmonic mean of the
    # reported "precision" and "recall" -- confirm this mix is intended.
    return {
        "accuracy": accuracy_score(y_true=y_true, y_pred=y_pred),
        "recall": recall_score(y_true=y_true, y_pred=y_pred),
        "precision": precision_score(y_true=y_true, y_pred=y_pred),
        "f1": f1_score(y_true=y_true, y_pred=y_pred, average="macro"),
    }

In [30]:
# Training configuration: two epochs, batch size 8 for both training and
# evaluation, fixed seed, evaluation once per epoch.
# NOTE(review): no checkpoint-saving options are set; the logged run wrote a
# checkpoint under output/ every 500 steps -- confirm that is wanted.
args = TrainingArguments(
    output_dir="output",
    seed=0,
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
)

# NOTE(review): the test split is used as the evaluation set during training.
trainer = Trainer(
    model=model,
    args=args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune the pretrained model; the returned TrainOutput is the cell's displayed value.
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 159571
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 39894


Epoch,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
1,0.2259,0.23916,0.923458,0.637931,0.590695,0.785465
2,0.2347,0.253182,0.932352,0.42775,0.75551,0.754844


Saving model checkpoint to output/checkpoint-500
Configuration saved in output/checkpoint-500/config.json
Model weights saved in output/checkpoint-500/pytorch_model.bin
Saving model checkpoint to output/checkpoint-1000
Configuration saved in output/checkpoint-1000/config.json
Model weights saved in output/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to output/checkpoint-1500
Configuration saved in output/checkpoint-1500/config.json
Model weights saved in output/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to output/checkpoint-2000
Configuration saved in output/checkpoint-2000/config.json
Model weights saved in output/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to output/checkpoint-2500
Configuration saved in output/checkpoint-2500/config.json
Model weights saved in output/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to output/checkpoint-3000
Configuration saved in output/checkpoint-3000/config.json
Model weights saved in output/check

TrainOutput(global_step=39894, training_loss=0.21628322458843327, metrics={'train_runtime': 2201.3356, 'train_samples_per_second': 144.977, 'train_steps_per_second': 18.123, 'total_flos': 5255688708350976.0, 'train_loss': 0.21628322458843327, 'epoch': 2.0})