In [1]:
import pandas as pd
import numpy as np

from transformers import BertForSequenceClassification, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [2]:
t = tokenizer.encode("don't be so judgmental", return_tensors='pt')  # tokenizer will return pytorch tensors

print(t)
print(tokenizer.decode(t[0]))  # print decoded string with special tokens included
print(tokenizer.decode(t[0], skip_special_tokens=True))

tensor([[ 101, 1274,  112,  189, 1129, 1177, 9228, 1348,  102]])
[CLS] don't be so judgmental [SEP]
don't be so judgmental


In [3]:
df = pd.read_csv('../data/svicar iz milana.csv')
print(len(df))
print(df['Discussion Type'].unique())
print(df[df['Discussion Type'].isnull()])
print(df[df['Discussion Type'] == 'Others'])

df = df.dropna(subset=['Discussion Type'])
df = df[df['Discussion Type'] != 'Others']
print("\n\nFIXED")
print(len(df))
print(df['Discussion Type'].unique())
print(df[df['Discussion Type'].isnull()])
print(df[df['Discussion Type'] == 'Others'])


712
['Social' 'Seminar' 'Procedure' nan 'Deliberation' 'UX'
 'Imaginative Entry' 'Others']
    Message Discussion Type
13      mor             NaN
59        s             NaN
72   uwgyeu             NaN
77      ha"             NaN
85        w             NaN
175   darla             NaN
                                               Message Discussion Type
201  So, where have we landed: Lady or tiger? Trial...          Others
225  I agree that the princess could just banish th...          Others
236                                    Yay you got it!          Others
244  Oh I see, there are two questions here.  I fee...          Others
310  Looks good! I guess we can complete the post s...          Others


FIXED
701
['Social' 'Seminar' 'Procedure' 'Deliberation' 'UX' 'Imaginative Entry']
Empty DataFrame
Columns: [Message, Discussion Type]
Index: []
Empty DataFrame
Columns: [Message, Discussion Type]
Index: []


In [4]:
df["Discussion Type"]

0            Social
1           Seminar
2           Seminar
3           Seminar
4           Seminar
           ...     
707       Procedure
708       Procedure
709       Procedure
710       Procedure
711    Deliberation
Name: Discussion Type, Length: 701, dtype: object

In [5]:
from datasets import DatasetDict, Dataset

In [6]:
dataset = Dataset.from_pandas(df)
print(dataset)

Dataset({
    features: ['Message', 'Discussion Type', '__index_level_0__'],
    num_rows: 701
})


In [7]:
train_dataset = dataset.shuffle(seed=1).select(range(int(len(dataset) * 0.65)))
valid_dataset = dataset.shuffle(seed=1).select(range(int(len(dataset) * 0.65), int(len(dataset) * 0.85)))
test_dataset = dataset.shuffle(seed=1).select(range(int(len(dataset) * 0.85), len(dataset)))

In [8]:
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validate': valid_dataset,
    'test': test_dataset
})

In [9]:
train_dataset = dataset_dict['train']
valid_dataset = dataset_dict['validate']
test_dataset = dataset_dict['test']

In [10]:
print(dataset_dict['train'][0])

{'Message': 'Each time you submit, the lady talks back to you and her icon changes.', 'Discussion Type': 'UX', '__index_level_0__': 536}


In [11]:
def preprocess_function(examples):
    return tokenizer(examples["Message"], truncation=True)

In [12]:
tokenized_datasets = dataset_dict.map(preprocess_function, batched=True, batch_size=16)

Map:   0%|          | 0/455 [00:00<?, ? examples/s]

Map:   0%|          | 0/140 [00:00<?, ? examples/s]

Map:   0%|          | 0/106 [00:00<?, ? examples/s]

In [13]:
print(tokenized_datasets['train'][3])

{'Message': 'I’ll check in soon and add to it ', 'Discussion Type': 'Procedure', '__index_level_0__': 49, 'input_ids': [101, 146, 787, 1325, 4031, 1107, 1770, 1105, 5194, 1106, 1122, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [14]:
print(tokenized_datasets['train'][0].keys())

dict_keys(['Message', 'Discussion Type', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'])


In [15]:
id2label = {0: 'Social', 1: 'Seminar', 2: 'Procedure', 3: 'Deliberation', 4: 'UX', 5: 'Imaginative Entry'}
label2id = {'Social': 0, 'Seminar': 1,'Procedure': 2,'Deliberation': 3, 'UX': 4, 'Imaginative Entry': 5}

model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=6, id2label=id2label, label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [17]:
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [18]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./runs",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [19]:
trainer.train()

  0%|          | 0/58 [00:00<?, ?it/s]

ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,token_type_ids,attention_mask.

In [None]:
trainer.evaluate()