# Text Leveling as Feature for Complexity Prediction

- **Idea:** The older the target group of a text, the more difficult to read is the text. 
- **Data**: [Lexica-Corpus](https://github.com/fhewett/lexica-corpus) with parallelen Wikipedia texts for children, youth and adults
- **Label:** children (0), youth (1), adults (2)
- **Method**: Sequence-Labeling.
    - split the lexica-corpus texts into sentences and label the sentences with their target group (label)
    - Fine-tune a Transformer model with these data (sentence - label)
    - predict the label of the complexity dataset (GermEval) with the fine-tuned model
    
    
- ToDo:
    - test different transformer models
    - [x] distilbert-base-german-cased
    - [ ] bert-base-german-cased
    - [ ] dbmdz/german-gpt2

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification  # , GPT2Tokenizer
from datasets import load_dataset, Dataset
import pandas as pd
from transformers import TrainingArguments
from transformers import TrainingArguments, Trainer

from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
import numpy as np
from datasets import load_metric

  from .autonotebook import tqdm as notebook_tqdm
2022-06-21 21:43:50.127629: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-21 21:43:50.127656: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cpu


## Fine-tuning Model

In [3]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=250)


In [4]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


In [5]:
def read_and_reduce_data(filename):
	# # read data, recode labels, remove unused columns
	dataframe = pd.read_csv(filename, header=0)
	# dataframe["level"] = dataframe['level'].map({'complex': 0, 'simple': 1})
	dataframe["level-detail"] = dataframe['level-detail'].map({'children': 0, 'youth': 1, 'adults': 2})
	dataframe_simple = dataframe  # [dataframe["level"]==1]
	dataframe_simple_short = dataframe_simple[["text", "level-detail"]]
	dataframe_simple_short.rename(columns={'level-detail': 'label'}, inplace=True)
	return dataframe_simple_short

In [6]:
def pre_process_data(filename):
	# split data into train and dev
	
	dataframe_simple_short = read_and_reduce_data(filename)
	
	train_data, dev_data = train_test_split(dataframe_simple_short, test_size=0.2)
	# test_data, dev_data = train_test_split(dev_data, test_size=0.1)
	
	train_dataset = Dataset.from_pandas(train_data)
	dev_dataset = Dataset.from_pandas(dev_data)
	# test_data = Dataset.from_pandas(test_data)
	
	# tokenize data
	tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
	tokenized_dev_dataset = dev_dataset.map(tokenize_function, batched=True)
	# tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
	
	return tokenized_train_dataset, tokenized_dev_dataset, None  # , tokenized_test_dataset

In [7]:
model_name = "distilbert-base-german-cased"  # "bert-base-german-cased",  # dbmdz/german-gpt2
tokenizer = AutoTokenizer.from_pretrained(model_name) 
# tokenizer = GPT2Tokenizer.from_pretrained('dbmdz/german-gpt2')
# tokenizer.pad_tokenizer = tokenizer.eos_token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
model.to(device)

Some weights of the model checkpoint at distilbert-base-german-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pr

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(31102, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [8]:
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

In [9]:
tokenized_train_dataset, tokenized_dev_dataset, tokenized_test_data = pre_process_data("data/Text_Leveling_Wikipedia/data_text_level.csv")
print(len(tokenized_train_dataset), len(tokenized_dev_dataset)) # , len(test_data))
metric = load_metric("accuracy")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_dev_dataset,
    compute_metrics=compute_metrics,
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe_simple_short.rename(columns={'level-detail': 'label'}, inplace=True)
100%|███████████████████████████████████████████| 31/31 [00:02<00:00, 10.81ba/s]
100%|█████████████████████████████████████████████| 8/8 [00:00<00:00,  9.82ba/s]


30334 7584


In [None]:
trainer.train()

In [None]:
trainer.save_model()

### Results Fine-tuning:
- German DistilBERT:
  - full dataset size: 38,000 (only lexica-corpus for children (0), youth (1) and adults (2))
   [11376/11376 39:28, Epoch 3/3]
   
Epoch |	Training Loss |	Validation Loss |	Accuracy
------|---------------|-----------------|---------
1 |	0.501700 |	0.477283 |	0.786392
2 |	0.377800 |	0.550949 |	0.788502
3 |	0.283300 |	0.718968 |	0.776899


## Load pretrained model and predict label

In [10]:
pretrained_model = AutoModelForSequenceClassification.from_pretrained("data/textleveling_model/")

loading configuration file data/textleveling_model/config.json
Model config DistilBertConfig {
  "_name_or_path": "data/textleveling_model/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": true,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.19.4",
  "vocab_size": 31102
}

loading weights file data/textleveling_model/pytorch_model.bin
All model checkpoint weights were used when initial

In [17]:
def get_levels(data_path):
    data = pd.read_csv(data_path)
    data.rename(columns={'sentence_id': 'ID'}, inplace=True)
    
    for i, row in data.iterrows():
        encoded_input = tokenizer(row["Sentence"], return_tensors='pt', padding="max_length", truncation=True, max_length=250)
        output = pretrained_model(**encoded_input)
        with torch.no_grad():
            logits = pretrained_model(**encoded_input).logits

        predicted_class_id = logits.argmax().item()
        data.loc[i, "F_text_level"] = int(predicted_class_id)

    if "MOS" in data.columns:
        print(data[["MOS", "F_text_level"]].corr())
    return data

In [18]:
train_scores = get_levels('data/public_data_text_complexity22/training_set.csv')

                   MOS  F_text_level
MOS           1.000000      0.587874
F_text_level  0.587874      1.000000


In [19]:
dev_scores = get_levels('data/public_data_text_complexity22/validation_set.csv')

In [22]:
test_scores = get_levels('data/public_data_text_complexity22/part2_public.csv')

In [23]:
train_scores.to_csv("data/feats/train_2.csv", index=False)
dev_scores.to_csv("data/feats/validation_2.csv", index=False)
test_scores.to_csv("data/feats/test_2.csv", index=False)