In [None]:
#We import our libraries.
import scipy
import accelerate
import bitsandbytes
from transformers import T5ForConditionalGeneration, AutoTokenizer
import torch
import pandas as pd
from tqdm import tqdm; tqdm.pandas()
from collections import Counter
from deep_translator import GoogleTranslator
import re
import csv
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, precision_recall_fscore_support
import numpy as np
from sklearn.linear_model import LogisticRegression

In [None]:
#We define a function to create a table of metrics.
def get_metrics_table(y_test, y_pred, feature, model):
    #Firstly we simply make a dataframe of the metrics classification report using the the test and predict values.
    rep = pd.DataFrame(metrics.classification_report(y_test, y_pred, output_dict=True)).transpose()
    #We round every number to 2 decimals.
    print(round(rep, 2))
    #We add to it columns in which we can describe which feature was classified and which model was used.
    rep["feature"] = feature; rep["model"] = model
    rep[["precision", "f1-score", "recall"]] = rep[["precision", "f1-score", "recall"]].apply(lambda x: round(x, 2))
    #We unround the support column, given that it is a discrete value.
    rep["support"] = rep["support"].apply(lambda x: int(x))
    #We add the accuracy as a column.
    #Since this value is equal for each class, we only want to display it once in the top row.
    #For the others we add a blank value.
    rep["accuracy"] = [rep[rep.index=="accuracy"].values[0][0]] + 4 * [" "]
    #We only want the first two rows, having the main metrics.
    rep = rep[:2]
    #We create a simple class name column.
    rep["class"] = ["no", "yes"]
    #We output all we need.
    rep = rep[["feature", "model", "class", "precision", "recall", "f1-score", "support", "accuracy"]][:2]
    return rep

## llm

In [None]:
#We firstly check whether are device is correctly connected to cuda.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu"); print(device)

In [271]:
#We load in our model and tokenizer.
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl", load_in_8bit=True)                                                                 
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl")

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.12it/s]


In [11]:
#We load in the intro sentences dataset.
df = pd.read_excel("11_intro_sent.xlsx", index_col=0); df = df.sample(frac=1, random_state=1)

In [12]:
#We check its length.
len(df)

128689

In [13]:
#And its columns.
df.columns

Index(['index', 'news_source', 'sent', 'sent_index', 'source_presence'], dtype='object')

    f"You are a researcher into the use of sources in news content. Please answer me this question: are sources used in the following sentence: {translated}? You can choose from the following categories: 0) no: no sources are used or 1) yes: sources are used. Please answer with either 0 (for no) or 1 (for yes)."

In [14]:
#We define a function to get an LLM response.
def get_prompt_response(x):
    try:
        #Firstlu we meed tp translate our input, as the model of interest mostly understands english.
        translated = GoogleTranslator(source="nl", target="en").translate(x)
        #Secondly we need a prompt, which needs the translated input.
        input_string = f"You are a researcher into the use of sources in news content. Please answer me this question: are sources used in the following sentence: {translated}? You can choose from the following categories: 0) no: no sources are used or 1) yes: sources are used. Please answer with either 0 (for no) or 1 (for yes)."
        #We tokenize this input into tensor input data for the LLM to understand.
        inputs = tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
        #We let the model generate its output.
        outputs = model.generate(inputs, max_length=9999)
        #We decode the output to get a readable result.
        result = tokenizer.decode(outputs[0])
        #We need a regular expression to clean up the answer.
        return re.search(">(.*)<", result).group(1).strip()
    except:
        return "999"

In [None]:
#We define an empty list.
outcome = []
#We open up an emptu csv file.
with open("llm_intro_data.csv", "w", newline="", encoding="utf-8") as file:      
    writer = csv.writer(file)
    #We write down our column names of interest.
    writer.writerow(["news_source", "index", "sent_index", "sent", "source_presence_flan"])
    #We loop over the rows.
    for index, row in df.iterrows():
        #We use our function and the sentence as the input.
        source_presence_flan = get_prompt_response(row["sent"])
        #We append the outcome to a list.
        outcome.append(source_presence_flan)
        #We continuously print the counts of each outcome.
        print(Counter(outcome), end="\r")
        #We write our values of interest away.
        writer.writerow([row["news_source"], row["index"], row["sent_index"], row["sent"], source_presence_flan])
        file.flush()
        #We stop iterating if each possible value (yes or no sourcing) occurred a 1000 times.
        if all(outcome.count(str(i)) == 1000 for i in range(1)):
            break
                     
file.close()

In [15]:
#We read in a version of our results.
df_result = pd.read_csv("11_intro_sourcing_llm.csv")

In [21]:
#We exclude the rows with false output.
df_result = df_result[df_result.source_presence_flan!=999]

In [22]:
#We count the occurance of each value.
Counter(df_result.source_presence_flan)

Counter({0: 11343, 1: 1000})

In [23]:
#We create a custom sampling function just to make sure that we sample random from the no category (0).
def custom_sampling(group, min_n=1000):
    #We sample a thousand of eahc category.
    return group.sample(min(min_n, len(group)), random_state=1)

#For each value we sample a thousand, and shuffle them randomly.
df_result_sample = df_result.groupby("source_presence_flan", group_keys=False).apply(custom_sampling).sample(frac=1, random_state=1)
#We then create an empty column for which we can input our manual annotations
df_result_sample["source_presence_manual"] = " "

In [25]:
#We write it away to an excel file.
df_result_sample.to_excel("sourcing_sample.xlsx")

In [24]:
#We count whether the sampling went correctly.
Counter(df_result_sample.source_presence_flan)

Counter({0: 1000, 1: 1000})

### manual

__Does the text provide any form of sourcing?__

- 1 = True
- 0 = False

Examples of sources are …

- Anonymous sources: sources
are not identifiable by withholding full names and disclosing little to no descriptive
featur.es
    - Sources, insiders, et cetera.
- Opaque sources: sources are only partly identifiable by withholding 
full names and solely providing abstract or overarching feature.s
    - The media, press agencies, experts, messages, et cetera.
- Explicit sources: sources are directly identifiable by 
providing full name.s
    - Specific news and/or public organizations, governmental bo and es, coprorate entities, identifiable statistics/content/reports/individuls.


In [26]:
#We read in the data with our manual annotations.
df_coded = pd.read_excel("11_intro_sourcing_man.xlsx", index_col=0)

In [27]:
#We drop any missing values.
df_coded = df_coded.dropna()

### llm metrics

In [116]:
#We comare our annotations with the llm output, using the earlier defined function for the metrics table.
overview_llm_presence = get_metrics_table(df_coded.source_presence_manual.astype(int), df_coded.source_presence_flan, feature="sourcing presence", model="LLM")

              precision  recall  f1-score  support
0                  0.85    0.81      0.83  1051.00
1                  0.80    0.84      0.82   949.00
accuracy           0.83    0.83      0.83     0.83
macro avg          0.83    0.83      0.83  2000.00
weighted avg       0.83    0.83      0.83  2000.00


### naive bayes

In [28]:
#We count the presence in our manual coding.
Counter(df_coded.source_presence_manual)

Counter({0: 1051, 1: 949})

In [29]:
#We define our X and y, namely the sentences as the X, and the y as the codings.
X = list(df_coded["sent"]); y = list(df_coded["source_presence_manual"].astype(int))

In [None]:
#We split our dataset into a train and test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 1, stratify=y)

In [120]:
#We define our vectorizer,
vectorizer = CountVectorizer()
#We use it to encode our test and train data.
X_train_enc = vectorizer.fit_transform(X_train)
X_test_enc = vectorizer.transform(X_test)
#We define our model. In this simply a Naive Bayes.
nb = MultinomialNB()
#We fit the model on our training data.
nb.fit(X_train_enc, y_train)
#We use it to predict the classes for our test data.
y_pred = nb.predict(X_test_enc)
#We now compare the NB output with our manual codings.
overview_nb_presence = get_metrics_table(y_test, y_pred, feature="sourcing presence", model="Naive Bayes")

              precision  recall  f1-score  support
0                  0.83    0.73      0.78   210.00
1                  0.74    0.84      0.78   190.00
accuracy           0.78    0.78      0.78     0.78
macro avg          0.78    0.78      0.78   400.00
weighted avg       0.79    0.78      0.78   400.00


### roberta

In [121]:
#We define our tokenizer from the Roberta dutch model.
tokenizer = RobertaTokenizer.from_pretrained("pdelobelle/robbert-v2-dutch-base")

In [122]:
#We now use this tokenizer to tokenize our training and test data.
train_encodings = tokenizer(X_train, truncation=True, padding=True)
test_encodings = tokenizer(X_test, truncation=True, padding=True)

In [123]:
#We define our model.
model = RobertaForSequenceClassification.from_pretrained("pdelobelle/robbert-v2-dutch-base", num_labels = 2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pdelobelle/robbert-v2-dutch-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#We define a custom dataset class.
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        #We store the input encodings.
        self.encodings = encodings
        #We store the corresponding labels.
        self.labels = labels
    def __getitem__(self, idx):
        #We create a dictionary to store the current item.
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        #If labels are provided, we add them to the item dictionary.
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        #We return the item.
        return item
    def __len__(self):
        #We return the length of the input ids list.
        return len(self.encodings["input_ids"])

In [125]:
#We define our train and test dataset.
train_dataset = Dataset(train_encodings, y_train)
test_dataset = Dataset(test_encodings, y_test)

In [126]:
#We print the occurance of the labels in both datasets.
print(Counter(train_dataset.labels)); print(Counter(test_dataset.labels))

Counter({0: 841, 1: 759})
Counter({0: 210, 1: 190})


In [127]:
#We define a function to track the metrics of the model.
def compute_metrics(p):
    pred, labels = p
    #We extract the label of the highest probability.
    pred = np.argmax(pred, axis=1)
    #We compare our labels with the predictions.
    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    #We calculate additional metrics scores.
    precision, recall, f1, _ = precision_recall_fscore_support(y_true=labels, y_pred=pred, average="macro")
    #We return the metrics.
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [128]:
#We define our training arguments.
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=7,
    per_device_train_batch_size=8,
    logging_steps=100)

#We define our trainer, with our model, the training arguments, the train and test datasets and the metric computation function.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [129]:
#We train the model.
trainer.train()



Step,Training Loss
100,0.2988
200,0.1326
300,0.0667
400,0.0181
500,0.0139
600,0.0105
700,0.0047




TrainOutput(global_step=700, training_loss=0.07789502054452896, metrics={'train_runtime': 120.4477, 'train_samples_per_second': 92.986, 'train_steps_per_second': 5.812, 'total_flos': 1059021997824000.0, 'train_loss': 0.07789502054452896, 'epoch': 7.0})

In [130]:
#We evaluate what we trained.
trainer.evaluate()

{'eval_loss': 0.30231335759162903,
 'eval_accuracy': 0.955,
 'eval_precision': 0.9546591932739465,
 'eval_recall': 0.9553884711779448,
 'eval_f1': 0.9549278846153846,
 'eval_runtime': 1.0077,
 'eval_samples_per_second': 396.958,
 'eval_steps_per_second': 24.81,
 'epoch': 7.0}

In [131]:
#We save the trained model.
trainer.save_model("source_presence_classifier")

In [132]:
#We load the pretrained model.
source_presence_classifier = RobertaForSequenceClassification.from_pretrained("11_source_presence_classifier").to("cuda")

In [133]:
source_presence_classifier.to("cuda")

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(40000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [134]:
#We define a function to get the model predictions.
def get_model_predictions(text, model=source_presence_classifier, output_format="labels"):
    #We tokenize the input.
    inputs = tokenizer(text,padding = True, truncation = True, return_tensors="pt").to("cuda")
    #We extract the outputs.
    outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predictions = np.round(predictions.cpu().detach().numpy(), 3)
    
    #If the prediction value of the first vlass is higher then that of the second, we return the first class.
    if predictions[0][0] > predictions[0][1]:
        result = 0
    #Else it will be the second class.
    else:
        result = 1
        
    if output_format=="raw":
        return predictions
    elif output_format=="labels":
        return result

In [135]:
y_pred_roberta = []

#We loop over all our test data.
for e in tqdm(X_test):
    #We make predictions using the function above.
    y = get_model_predictions(e)
    #We append the outcomes to a list.
    y_pred_roberta.append(y)

    
#We print the occurance of each class.
print(Counter(y_pred_roberta))

100%|██████████| 400/400 [00:03<00:00, 124.44it/s]

Counter({0: 206, 1: 194})





In [136]:
#We get some metrics, comparing it with our manual coding.
overview_roberta_presence = get_metrics_table(y_test, y_pred_roberta, feature="sourcing presence", model="Roberta")

              precision  recall  f1-score  support
0                  0.97    0.95      0.96   210.00
1                  0.94    0.96      0.95   190.00
accuracy           0.96    0.96      0.96     0.96
macro avg          0.95    0.96      0.95   400.00
weighted avg       0.96    0.96      0.96   400.00


## source presence prediction

In [140]:
#We make the source presence predictions for every sentence.
df["p_source_presence"] = df.sent.progress_apply(lambda x: get_model_predictions(x))

100%|██████████| 128689/128689 [17:40<00:00, 121.33it/s]


In [141]:
#We count the occurance of each class.
Counter(df.p_source_presence)

Counter({0: 102125, 1: 26564})

In [149]:
#We combine the performance of all models.
overview_presence = pd.concat([overview_nb_presence, overview_llm_presence, overview_roberta_presence])

In [150]:
overview_presence

Unnamed: 0,feature,model,class,precision,recall,f1-score,support,accuracy
0,sourcing presence,Naive Bayes,no,0.83,0.73,0.78,210,0.78
1,sourcing presence,Naive Bayes,yes,0.74,0.84,0.78,190,
0,sourcing presence,LLM,no,0.85,0.81,0.83,1051,0.83
1,sourcing presence,LLM,yes,0.8,0.84,0.82,949,
0,sourcing presence,Roberta,no,0.97,0.95,0.96,210,0.96
1,sourcing presence,Roberta,yes,0.94,0.96,0.95,190,


In [165]:
#We write the table away as a csv.
overview_presence.to_csv("overview_presence.csv")

## anonymous sourcing

In [170]:
#We define a function censor the presence of sources in text.
def get_source_censored(x):
    sent = x["sent"]
    news_source = x["news_source"]
    source_pattern = f"\\b({news_source}(\.nl)?)\\b"
    
    #If the regular expresssion above detects the sourcename in a sentence, then we replace the sourcename with SOURCE.
    #We also quickly assess whether a sourcename is present at all, returing 1 if so, and 0 if not.
    if re.search(source_pattern, sent):
        sent = re.sub(source_pattern, "SOURCE", sent)
        return 1, sent
    else:
        return 0, sent 

In [174]:
#We create columns for the two outputs of the above defined function.
df[["news_source_presence", "sent_c"]] = df.progress_apply(lambda x: pd.Series(get_source_censored(x)), axis=1)

100%|██████████| 128689/128689 [00:16<00:00, 7918.81it/s] 


In [175]:
#We print the presence of sources in sentences.
Counter(df.news_source_presence)

Counter({0: 127240, 1: 1449})

In [180]:
#We define a simple function to detect anonymous sourcing.
def get_anonymous_sourcing(x):
    #If the word anonymous, sources, or insiders is present we return 1. Or alterations of these words.
    if re.search(r"\b(anoniem(e)?|bron(nen)?(?! van\b)|ingewijde(n)?)\b", x, flags=re.IGNORECASE):
        result = 1
    else:
        #Else we return 0.
        result = 0
    return result 

In [181]:
#We define this as a new column.
df["anonymous_sourcing"] = df.sent.progress_apply(lambda x: get_anonymous_sourcing(x))

100%|██████████| 128689/128689 [00:00<00:00, 186713.14it/s]


In [182]:
#We count the presence.
Counter(df.anonymous_sourcing)

Counter({0: 128391, 1: 298})

In [191]:
#We only keep the rows with a 1 score on anonymous sourcing.
df_anonymous = df[df.anonymous_sourcing==1]

In [192]:
#We also make a subset of data for when sourcing was presence according to our model prediction.
df_present = df[(df.anonymous_sourcing==0) & (df.p_source_presence==1)]

In [193]:
#We randomly sample 2000 sentences from this.
df_present_sample = df_present.sample(2000, random_state=1)

In [194]:
#We add to it the anonymous sentences.
df_present_w_anonymous_sample = pd.concat([df_anonymous, df_present_sample])

In [195]:
#We write it away to excel.
df_present_w_anonymous_sample.to_excel("df_present_w_anonymous_sample.xlsx")

## soucing category manual

__Which type of source does the text provide?__

- 0 = absent
- 1 = anonymous
- 2 = opaque
- 3 = explicit


Definitions & examples:

- Absent: no sources are used.
- Anonymous sources: sources
are not identifiable by withholding full names and disclosing little to no descriptive
features.
    - Sources, insiders, et cetera.
- Opaque sources: sources are only partly identifiable by withholding full names and solely providing abstract or overarching features.
    - The media, press agencies, experts, messages, et cetera.
- Explicit sources: sources are directly identifiable by providing full names.
    - Specific news and/or public organizations, governmental bodies and coprorate entities, identifiable statistics/content/reports/individuals.

In [31]:
#We read in our manual sourcing category codings.
df_sourcing = pd.read_excel("11_intro_source_cat_man.xlsx", index_col=0)

In [364]:
#We remove all faulty read in columns.
df_sourcing = df_sourcing.loc[:, ~df_sourcing.columns.str.contains("Unnamed")]

In [365]:
#We drop the rows where we have missing values for our manual codes.
df_sourcing = df_sourcing.dropna(subset=["sourcing_cat_man"])

In [366]:
#We count the presence of each category.
Counter(df_sourcing.sourcing_cat_man)

Counter({3.0: 1238, 1.0: 139, 2.0: 620, 0.0: 300})

In [367]:
#We creat an extended metrics table function, given that we now have more then 2 categories to predict.
def get_metrics_table_cat(y_test, y_pred, feature, model, cat=["absent", "anonymous", "opaque", "explicit"], n_cat=4):
    #We once again transform the metrics classification report to a dataframe.
    rep = pd.DataFrame(metrics.classification_report(y_test, y_pred, output_dict=True)).transpose()
    #We round every column to 2.
    print(round(rep, 2))
    #We add the feature and model name as columns.
    rep["feature"] = feature; rep["model"] = model
    #We include the following metrics scores as ccolumns:
    rep[["precision", "f1-score", "recall"]] = rep[["precision", "f1-score", "recall"]].apply(lambda x: round(x, 2))
    #We add the support value and store it as discrete values.
    rep["support"] = rep["support"].apply(lambda x: int(x))
    #We include the accuracy once at the top row.
    rep["accuracy"] = [rep[rep.index=="accuracy"].values[0][0]] + 6 * [" "] 
    #We want the first 4 rows since we have 4 categories.
    rep = rep[:n_cat]
    #We add the class names for each row.
    rep["class"] = cat
    #We extract what we need.
    rep = rep[["feature", "model", "class", "precision", "recall", "f1-score", "support", "accuracy"]][:n_cat]
    return rep

## logistic regression

In [368]:
#We create a new X list with our sentences and now the y is the manual sourcing category codes.
X = list(df_sourcing["sent_c"]); y = list(df_sourcing["sourcing_cat_man"].astype(int))

In [369]:
#We define our test and training dataset.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 1, stratify=y)

In [370]:
#We count the presence in the train and test dataset.
Counter(y_train); Counter(y_test)

Counter({2: 124, 3: 248, 0: 60, 1: 28})

In [371]:
#We define our vectorizer.
vectorizer = CountVectorizer()
#We fit and transform our training data accordingly.
X_train_enc = vectorizer.fit_transform(X_train)
#We transform the test data.
X_test_enc = vectorizer.transform(X_test)
#We define a model, here a Logistic Regression.
model = LogisticRegression(max_iter=1000).fit(X_train_enc, y_train)
#We predict the labels for our test data.
y_pred = model.predict(X_test_enc)

In [372]:
#We print the performance compared to our manual coding.
overview_lg_sourcing = get_metrics_table_cat(y_test, y_pred, "sourcing category", "Logistic Regression")

              precision  recall  f1-score  support
0                  0.62    0.22      0.32    60.00
1                  0.73    0.39      0.51    28.00
2                  0.61    0.50      0.55   124.00
3                  0.67    0.87      0.75   248.00
accuracy           0.65    0.65      0.65     0.65
macro avg          0.66    0.49      0.53   460.00
weighted avg       0.65    0.65      0.63   460.00


In [373]:
overview_lg_sourcing

Unnamed: 0,feature,model,class,precision,recall,f1-score,support,accuracy
0,sourcing category,Logistic Regression,absent,0.62,0.22,0.32,60,0.65
1,sourcing category,Logistic Regression,anonymous,0.73,0.39,0.51,28,
2,sourcing category,Logistic Regression,opaque,0.61,0.5,0.55,124,
3,sourcing category,Logistic Regression,explicit,0.67,0.87,0.75,248,


## Roberta

NOTE: The following part is largely based on this script:

    https://github.com/annekroon/gesis-machine-learning/blob/main/fall-2023/day5/transformers_bert_classification.ipynb

In [374]:
#We load in some additional libraries.
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.utils import compute_sample_weight

In [375]:
#We load in the tokenizer of the Roberta model.
tokenizer = AutoTokenizer.from_pretrained("pdelobelle/robbert-v2-dutch-base")

In [376]:
#We extract the unique labels in our training data.
unique_labels = set(label for label in y_train)
#We create a function which can convert the labels to ids and the other way around.
label2id = {label: id for id, label in enumerate(unique_labels)}
id2label = {id: label for label, id in label2id.items()}

In [377]:
label2id.keys()

dict_keys([0, 1, 2, 3])

In [378]:
id2label.keys()

dict_keys([0, 1, 2, 3])

In [379]:
#We encode our texts of the train and testdata.
train_encodings = tokenizer(X_train, truncation=True, padding=True)
test_encodings  = tokenizer(X_test, truncation=True, padding=True)

#We encode the labels for the train and test data.
train_labels_encoded = [label2id[y] for y in y_train]
test_labels_encoded  = [label2id[y] for y in y_test]

In [None]:
#We define a custom dataset class.
class MyDataset(torch.utils.data.Dataset):
    #We initialize the dataset with encodings and labels.
    def __init__(self, encodings, labels):
        #We store the input encodings.
        self.encodings = encodings
        #We store the corresponding labels.
        self.labels = labels
    def __getitem__(self, idx):
        #We create a dictionary to store the current item.
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        #We add labels to the item dictionary.
        item['labels'] = torch.tensor(self.labels[idx])
        #We return the item.
        return item
    def __len__(self):
        #We return the length of the labels list.
        return len(self.labels)

In [381]:
#We define our train and test datasets.
train_dataset = MyDataset(train_encodings, train_labels_encoded)
test_dataset = MyDataset(test_encodings, test_labels_encoded)

In [382]:
#We define our model.
model = AutoModelForSequenceClassification.from_pretrained("pdelobelle/robbert-v2-dutch-base", num_labels=len(id2label)).to("cuda")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pdelobelle/robbert-v2-dutch-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [383]:
#We create our metrics function to track performance.
def compute_metrics(eval_pred):
    labels = eval_pred.label_ids
    preds = eval_pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    macro_f1 = f1_score(labels, preds, average='macro', sample_weight=compute_sample_weight('balanced', labels))
    return {'accuracy': acc, 'macro_f1': macro_f1}

In [384]:
#We specifically focus on the macro f1.
metric_name = 'macro_f1'

In [385]:
#We define our training arguments.
training_args = TrainingArguments(
    num_train_epochs=9,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    warmup_steps=0,
    weight_decay=0.01,
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=20,
    evaluation_strategy='steps',
)

In [386]:
#We define our trainer.
trainer = Trainer(
    model=model,                        
    args=training_args,                 
    train_dataset=train_dataset,       
    eval_dataset=test_dataset,           
    compute_metrics=compute_metrics     
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [387]:
#We train the model.
trainer.train()



Step,Training Loss,Validation Loss,Accuracy,Macro F1
20,1.1591,1.039227,0.53913,0.1
40,0.9661,0.836675,0.691304,0.299618
60,0.8124,0.698177,0.730435,0.36988
80,0.7651,0.644384,0.743478,0.469183
100,0.6887,0.60523,0.771739,0.44227
120,0.6193,0.588319,0.763043,0.577657
140,0.438,0.557678,0.795652,0.634237
160,0.4274,0.582825,0.804348,0.684302
180,0.4319,0.535029,0.817391,0.732862
200,0.3607,0.584172,0.813043,0.712577




TrainOutput(global_step=1035, training_loss=0.173978473417065, metrics={'train_runtime': 268.0863, 'train_samples_per_second': 61.67, 'train_steps_per_second': 3.861, 'total_flos': 1045041919250472.0, 'train_loss': 0.173978473417065, 'epoch': 9.0})

In [388]:
#We evaluate the training.
trainer.evaluate()

{'eval_loss': 0.8966751098632812,
 'eval_accuracy': 0.8391304347826087,
 'eval_macro_f1': 0.7857727677552304,
 'eval_runtime': 2.3345,
 'eval_samples_per_second': 197.043,
 'eval_steps_per_second': 12.422,
 'epoch': 9.0}

In [389]:
#We predict the outcomes.
predicted_results = trainer.predict(test_dataset)



In [390]:
#It has a shape of 4 given that we have four possible classes.
predicted_results.predictions.shape

(460, 4)

In [391]:
predicted_labels = predicted_results.predictions.argmax(-1) # Get the highest probability prediction
#We turn it to a list.
predicted_labels = predicted_labels.flatten().tolist() 
predicted_labels = [id2label[y] for y in predicted_labels]

In [392]:
#We count the occurance of the predictions.
Counter(predicted_labels)

Counter({2: 125, 3: 258, 1: 33, 0: 44})

In [393]:
#We save the model.
trainer.save_model("sourcing_cat_classifier")

In [394]:
#We create an overview of its performance.
overview_roberta_sourcing = get_metrics_table_cat(y_test, predicted_labels, "sourcing category", "Roberta")

              precision  recall  f1-score  support
0                  0.75    0.55      0.63    60.00
1                  0.76    0.89      0.82    28.00
2                  0.79    0.80      0.80   124.00
3                  0.89    0.92      0.91   248.00
accuracy           0.84    0.84      0.84     0.84
macro avg          0.80    0.79      0.79   460.00
weighted avg       0.84    0.84      0.84   460.00


In [395]:
overview_roberta_sourcing

Unnamed: 0,feature,model,class,precision,recall,f1-score,support,accuracy
0,sourcing category,Roberta,absent,0.75,0.55,0.63,60,0.84
1,sourcing category,Roberta,anonymous,0.76,0.89,0.82,28,
2,sourcing category,Roberta,opaque,0.79,0.8,0.8,124,
3,sourcing category,Roberta,explicit,0.89,0.92,0.91,248,


In [396]:
#We combine it with the performance of the LG.
overview_sourcing = pd.concat([overview_lg_sourcing, overview_roberta_sourcing])

In [397]:
overview_sourcing

Unnamed: 0,feature,model,class,precision,recall,f1-score,support,accuracy
0,sourcing category,Logistic Regression,absent,0.62,0.22,0.32,60,0.65
1,sourcing category,Logistic Regression,anonymous,0.73,0.39,0.51,28,
2,sourcing category,Logistic Regression,opaque,0.61,0.5,0.55,124,
3,sourcing category,Logistic Regression,explicit,0.67,0.87,0.75,248,
0,sourcing category,Roberta,absent,0.75,0.55,0.63,60,0.84
1,sourcing category,Roberta,anonymous,0.76,0.89,0.82,28,
2,sourcing category,Roberta,opaque,0.79,0.8,0.8,124,
3,sourcing category,Roberta,explicit,0.89,0.92,0.91,248,


In [398]:
#We write away these performance tables.
overview_sourcing.to_csv("overview_sourcing.csv")

## predicting sourcing cat

In [406]:
#We subset the data which either has sources or anonymous sources.
df_sourcing = df[(df.p_source_presence==1)|(df.anonymous_sourcing==1)]

In [407]:
#We load in our sourcing category classification model.
sourcing_cat_classifier = RobertaForSequenceClassification.from_pretrained("sourcing_cat_classifier").to("cuda")

In [408]:
#We define a function to get predictions.
def get_model_predictions(text, model=sourcing_cat_classifier, output_format="labels"):
    #We tokenize the text.
    inputs = tokenizer(text,padding = True, truncation = True, return_tensors="pt").to("cuda")
    #We extract the model outputs.
    outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predictions = np.round(predictions.cpu().detach().numpy(), 3)
    #We extract the class with the highest probability.
    pred_labels = predictions.argmax(-1)[0]
    pred_labels = id2label.get(pred_labels)

    if output_format=="raw":
        return predictions
    elif output_format=="labels":
        return pred_labels

In [412]:
#We use it on every censored sentence within the sourcing data subset.
df_sourcing["p_sourcing_cat"] = df_sourcing.sent_c.progress_apply(lambda x: get_model_predictions(x))

100%|██████████| 26633/26633 [03:35<00:00, 123.79it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sourcing["p_sourcing_cat"] = df_sourcing.sent_c.progress_apply(lambda x: get_model_predictions(x))


In [413]:
#We counter the presence of each category as predicted.
Counter(df_sourcing.p_sourcing_cat)

Counter({3: 15010, 0: 2344, 2: 8895, 1: 384})

In [414]:
#We write away to a csv file.
df_sourcing.to_csv("sourcing_complete.csv")