In [42]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import torch
import pyarrow as pa
from datasets import Dataset
import numpy as np


# https://huggingface.co/docs/transformers/v4.17.0/en/tasks/sequence_classification
# https://huggingface.co/docs/transformers/en/training
# https://www.youtube.com/watch?v=TmT-sKxovb0

# Read the dataset

In [28]:
# Load the annotated comments; the CSV uses ';' as its field separator.
file_path = '04_comments_annotated-no_reactions.csv'
data = pd.read_csv(file_path, sep=';')
# Preview the first rows to confirm the columns were parsed as expected.
print(data.head())

                  c_id                                             c_text  \
0  1391717608802631681  Wer so ein Profilbild wie sie hochlädt kann nu...   
1  1389188826799673345  Willst du jetzt etwa behaupten das Querdenker ...   
2  1385241285645291521  Aber Bild hat doch gerade deswegen und diesbez...   
3  1385240437988986887  Das sehe ich auch genau so. Dieser Brinkhaus i...   
4  1389640445790199809  Mit den Milliarden Unterstützungsgeldern die s...   

               date  conv_id  Generalisation  Ambiguous  Objective  Subjective  
0  10.05.2021 11:31      NaN             3.0        0.0        0.0         1.0  
1  03.05.2021 12:03      NaN             3.0        1.0        0.0         1.0  
2  22.04.2021 14:37      NaN             1.0        0.0        1.0         1.0  
3  22.04.2021 14:33      NaN             0.0        1.0        0.0         1.0  
4  04.05.2021 17:57      NaN             2.0        0.0        0.0         1.0  


# Process the data

In [29]:
# Tokenizer for the same checkpoint name that the classification model is loaded from.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-multilingual-cased"
)

In [30]:
def preprocess_function(row):
    """Tokenize one annotated comment and attach its raw text and label vector.

    Args:
        row: A mapping-like record (e.g. one DataFrame row) providing the keys
            "c_text", "Generalisation", "Ambiguous", "Objective" and
            "Subjective".

    Returns:
        The object returned by the module-level ``tokenizer`` (padded and
        truncated to 280 tokens), extended with two entries: "text" (the
        original comment) and "labels" (the four annotation values, in the
        order listed above).
    """
    # NOTE(review): in the data preview "Generalisation" takes values 0-3
    # while the other three columns look binary - confirm the intended target
    # encoding before training on this vector.
    label_columns = ("Generalisation", "Ambiguous", "Objective", "Subjective")
    comment = row["c_text"]

    encodings = tokenizer(
        comment,
        truncation=True,
        max_length=280,
        padding="max_length",
    )
    encodings["text"] = comment
    encodings["labels"] = [row[column] for column in label_columns]
    return encodings

In [31]:
# Sanity check: show the encoded form of the first comment in the frame.
first_row = data.iloc[0]
print(preprocess_function(first_row))

{'input_ids': [101, 70061, 10380, 10290, 68750, 27696, 10953, 10632, 39969, 26875, 14892, 12382, 11354, 10290, 84970, 11479, 10128, 10298, 45497, 71421, 17155, 11605, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [32]:
# Encode every comment in the frame, one record per row, in row order.
processed_data = [preprocess_function(row) for _, row in data.iterrows()]

# Generate the dataset

In [33]:
# Collect the per-comment encodings into a frame and split it 80/20.
# NOTE(review): only the first SUBSET_SIZE rows are used - presumably a limit
# for quick experiments; confirm before a full training run.
SUBSET_SIZE = 100
# Fixed seed so that the train/validation split is identical on every run
# (Restart & Run All reproduces the same result).
SPLIT_SEED = 42

processed_data = pd.DataFrame(processed_data)
processed_data = processed_data[:SUBSET_SIZE]
train, validation = train_test_split(
    processed_data,
    test_size=0.2,
    random_state=SPLIT_SEED,
)

In [34]:
# Wrap both splits as Hugging Face datasets through the public constructor.
# preserve_index=False drops the shuffled pandas index left by the split,
# which would otherwise be stored as an extra "__index_level_0__" column.
train_set = Dataset.from_pandas(train, preserve_index=False)
validation_set = Dataset.from_pandas(validation, preserve_index=False)

# Create model

In [35]:
# Multilingual DistilBERT with a 4-output sequence-classification head
# (one output per annotation column used as a label).
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-multilingual-cased",
    num_labels=4,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
# Evaluate at the end of every epoch; every other hyper-parameter is left at
# the TrainingArguments default. Checkpoints are written below ./result.
training_args = TrainingArguments(
    evaluation_strategy='epoch',
    output_dir='./result',
)

# No compute_metrics is supplied, so evaluation reports the loss only.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=validation_set,
    train_dataset=train_set,
)

# Train and evaluate

In [37]:
# Fine-tune the model; the returned TrainOutput is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss


TrainOutput(global_step=30, training_loss=0.29848461151123046, metrics={'train_runtime': 317.6543, 'train_samples_per_second': 0.756, 'train_steps_per_second': 0.094, 'total_flos': 17386966195200.0, 'train_loss': 0.29848461151123046, 'epoch': 3.0})

In [38]:
# Evaluate on the dataset passed as eval_dataset; the returned metrics dict
# is shown as the cell result.
trainer.evaluate()

{'eval_loss': 0.2996436357498169,
 'eval_runtime': 9.7337,
 'eval_samples_per_second': 2.055,
 'eval_steps_per_second': 0.308,
 'epoch': 3.0}

# Save the model

In [39]:
# Store the tokenizer next to the model weights so that "./model/test/" is a
# self-contained checkpoint. The tokenizer is saved first so that the cell's
# last expression is still the model save, which produces no cell output.
tokenizer.save_pretrained("./model/test/")
model.save_pretrained("./model/test/")

# Read the model

In [46]:
# Reload the classifier from the directory written by the save step.
model_pred = AutoModelForSequenceClassification.from_pretrained(
    "./model/test/"
)

In [47]:
# Tokenizer used at prediction time; same checkpoint name as used for training.
tokenizer_pred = AutoTokenizer.from_pretrained(
    "distilbert-base-multilingual-cased"
)

# Get Predictions

In [52]:
def get_prediction(text):
    """Run the reloaded classifier on a comment, print and return its logits.

    Args:
        text: The comment text to classify (passed straight to
            ``tokenizer_pred``).

    Returns:
        The raw logits produced by ``model_pred`` as a CPU tensor without
        gradient tracking. No activation (sigmoid/softmax) is applied.
    """
    # Same padding/truncation settings as used when encoding the training data.
    encoding = tokenizer_pred(
        text,
        return_tensors="pt",
        max_length=280,
        padding='max_length',
        truncation=True,
    )

    # Move the inputs to the device of the model that actually runs the
    # forward pass. Using the trainer's device instead would fail in a fresh
    # kernel (no `trainer`) and mismatch whenever the two models live on
    # different devices.
    device = model_pred.device
    encoding = {key: tensor.to(device) for key, tensor in encoding.items()}

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model_pred(**encoding)
    logits = outputs.logits.cpu()

    print(logits)
    return logits

In [54]:
# Two example comments to run through the reloaded classifier.
sample_comments = [
    "Seit 1960 steigt jedes Jahr die Zahl der Eisbären, Ozonloch über Arktis weg. Tote von Naturkatastrophen so wenig wie nie. Nie soviel Wald wie jetzt. Der Welt geht's saugut. Nur einzelne schieben Panik und die Medien steigen ein, verkauft sich eben gut.",
    "Der Impf-Apartheider? Hattest du Whisky zum Frühstück?",
]
for comment in sample_comments:
    get_prediction(comment)

tensor([[ 3.1133,  0.0410, -1.0717,  1.4621]], grad_fn=<AddmmBackward0>)
tensor([[ 2.9709,  0.0121, -1.0608,  1.4462]], grad_fn=<AddmmBackward0>)
