In [113]:
#We import our necessary libraries.
import pandas as pd
from collections import Counter
from tqdm import tqdm; tqdm.pandas()
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, precision_recall_fscore_support
import torch
import numpy as np
from nltk.tokenize import sent_tokenize
from ast import literal_eval

In [114]:
#We redefine our earlier used metrics label function as used in the script 11 for source presence.
def get_metrics_table(y_test, y_pred, feature, model):
    """Build a two-row metrics table (class "no" and "yes") for a binary classifier.

    The complete classification report is printed, rounded to two decimals, as a
    side effect. Only the first two rows of that report are returned.

    Parameters
    ----------
    y_test : the true labels.
    y_pred : the predicted labels.
    feature : value stored in the "feature" column of both returned rows.
    model : value stored in the "model" column of both returned rows.

    Returns
    -------
    pandas.DataFrame with the columns feature, model, class, precision, recall,
    f1-score, support and accuracy. The accuracy is only filled in on the first
    row, the second row holds a single space.

    Raises
    ------
    ValueError when the report does not consist of exactly five rows
    (two classes, accuracy, macro avg and weighted avg).
    """
    report = pd.DataFrame(metrics.classification_report(y_test, y_pred, output_dict=True)).transpose()
    print(round(report, 2))

    #The table below is only meaningful for two classes, so we refuse any other report shape.
    if len(report) != 5:
        raise ValueError(f"Expected a binary classification report with 5 rows, got {len(report)} rows.")

    #We look the accuracy up by label instead of by position, so that it does not depend on the column order.
    accuracy = round(report.loc["accuracy", "precision"], 2)

    #We take an explicit copy of the two class rows, so that the assignments below do not write to a slice of the report.
    table = report.iloc[:2].copy()
    score_columns = ["precision", "recall", "f1-score"]
    table[score_columns] = table[score_columns].round(2)
    table["support"] = table["support"].astype(int)
    table["feature"] = feature
    table["model"] = model
    #The first two rows of the report are taken to be the negative and the positive class, in that order.
    table["class"] = ["no", "yes"]
    table["accuracy"] = [accuracy, " "]
    return table[["feature", "model", "class", "precision", "recall", "f1-score", "support", "accuracy"]]

__Does the text give any insight into when, why, how or against which standards the article was created?__

- 1 = True
- 0 = False

Examples include explanations of:

- The origin of the news and/or news selection processes:
    - Dit artikel is een nieuwe versie van een stuk dat we eind vorige maand publiceerden. Omdat de rechter vandaag uitspraak doet, brengen we het opnieuw onder de aandacht.
    - Een versie van dit artikel verscheen ook in de krant van 11 oktober 2023.
- Internal news standards and/or motives:
    - Doordat het negatieve nieuws NU.nl vaak domineert, sneeuwt het positieve nieuws soms onder. Daarom zetten we wekelijks vrolijk stemmende berichten op een rij.
- Internal news decisions:
    - De NOS heeft ervoor gekozen om geen bedrijven te benaderen die vallen onder elektriciteit, cement en waterstof, omdat daar op dit moment weinig van wordt geïmporteerd van buiten de EU naar Nederland.
- News sourcing and production processes:
    - De Volkskrant heeft per mail contact gezocht met de King Saud Universiteit, maar geen reactie ontvangen.
    - Deze samenvatting is gemaakt met behulp van AI en gecheckt door NU.nl.

__Note: It is important that the role of the media company and/or its workers is proactively disclosed. For instance, simply stating which sources are used does not constitute process information, whereas indicating how sources were contacted by the media company and/or its workers, and/or why sources were used, does.__

In [3]:
#We read our annotated data (a semicolon separated, UTF-8 encoded csv) and use its first column as the index.
df_sample = pd.read_csv("21_process_info_man.csv", sep=";", encoding="utf-8", index_col=0)

In [4]:
#We drop faulty read columns, i.e. we only keep the columns whose name does not contain "Unnamed".
df_sample = df_sample.loc[:, ~df_sample.columns.str.contains("Unnamed")]

In [5]:
#We drop every row that has a missing value in any of its columns.
df_sample = df_sample.dropna()

In [6]:
#We create a function that censors sentences on the presence or absence of the outlet name within the sentence.
def get_source_censored(x):
    """Replace mentions of the outlet name in a sentence by the token SOURCE.

    Parameters
    ----------
    x : mapping (for instance a DataFrame row) with the keys "self_referral_text"
        (the sentence) and "news_source" (the name of the outlet).

    Returns
    -------
    tuple (presence, sentence). presence is 1 when the outlet name, optionally
    followed by ".nl", occurs in the sentence between word boundaries and 0
    otherwise. sentence is the censored sentence, or the untouched sentence when
    presence is 0.
    """
    sent = x["self_referral_text"]
    #We escape the outlet name, so that characters such as the "." in "NU.nl" are matched literally.
    news_source = re.escape(str(x["news_source"]))
    source_pattern = rf"\b({news_source}(\.nl)?)\b"

    #We replace and count in one pass instead of first searching and then substituting.
    censored, n_replaced = re.subn(source_pattern, "SOURCE", sent)
    if n_replaced:
        return 1, censored
    #In any other case we leave the sentence untouched and return it with 0.
    return 0, sent

In [120]:
#We apply the function to every row of our data.
#The returned pair (presence, censored sentence) is spread over the two new columns.
df_sample[["news_source_presence", "sent_c"]] = df_sample.progress_apply(lambda x: pd.Series(get_source_censored(x)), axis=1)

100%|██████████| 1980/1980 [00:00<00:00, 8751.25it/s] 


In [124]:
#We count in how many sentences an outlet name was found (1) and in how many it was not (0).
Counter(df_sample.news_source_presence)

Counter({0: 1563, 1: 417})

In [125]:
#We make sure that our manual process codes (0 or 1) are stored as integers.
df_sample["process_code"] = df_sample.process_code.astype(int)

In [126]:
#We count the prevalence of the two classes of the manual process code.
Counter(df_sample.process_code)

Counter({1: 438, 0: 1542})

In [128]:
#We convert our censored sentences and our codes to respectively an X and y dataset, both as plain lists.
X = list(df_sample["sent_c"]); y = list(df_sample["process_code"].astype(int))

In [129]:
#We split it up in train (80%) and test (20%) data.
#The split is stratified on the labels and uses a fixed random state.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 1, stratify=y)

### naive bayes

In [130]:
#We define our tf-idf vectorizer with a minimum (5) and a maximum (.5) document frequency for the terms.
vectorizer = TfidfVectorizer(min_df=5, max_df=.5)
#We fit the vectorizer on the training data only and transform our training and test data with it.
X_train_enc = vectorizer.fit_transform(X_train)
X_test_enc = vectorizer.transform(X_test)
#We define our model.
nb = MultinomialNB()
#We fit the model on our encoded training data.
nb.fit(X_train_enc, y_train)
#We predict the labels for our test data by the model.
y_pred = nb.predict(X_test_enc)
#We print the metrics overview and keep the two class rows for the later comparison.
overview_nb = get_metrics_table(y_test, y_pred, feature="process information", model="Naive Bayes")

              precision  recall  f1-score  support
0                  0.88    0.98      0.93   308.00
1                  0.88    0.52      0.66    88.00
accuracy           0.88    0.88      0.88     0.88
macro avg          0.88    0.75      0.79   396.00
weighted avg       0.88    0.88      0.87   396.00


### roberta

In [140]:
#We load the tokenizer of the pretrained checkpoint pdelobelle/robbert-v2-dutch-base.
tokenizer = RobertaTokenizer.from_pretrained("pdelobelle/robbert-v2-dutch-base")

In [141]:
#We tokenize our training and test data, with truncation and padding switched on.
train_encodings = tokenizer(X_train, truncation=True, padding=True)
test_encodings = tokenizer(X_test, truncation=True, padding=True)

In [142]:
#We load the pretrained model with a sequence classification head for our two labels.
model = RobertaForSequenceClassification.from_pretrained("pdelobelle/robbert-v2-dutch-base", num_labels = 2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pdelobelle/robbert-v2-dutch-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [143]:
#We define a custom dataset class.
class Dataset(torch.utils.data.Dataset):
    """Torch dataset that serves tokenizer encodings, optionally together with labels.

    Parameters
    ----------
    encodings : mapping from a feature name (at least "input_ids") to a sequence
        that holds one entry per example.
    labels : optional sequence that holds one label per example. When left at
        None the items do not get a "labels" entry.
    """
    def __init__(self, encodings, labels=None):
        #We store the input encodings.
        self.encodings = encodings
        #We store the corresponding labels.
        self.labels = labels
    def __getitem__(self, idx):
        """Return the example at position idx as a dictionary of tensors."""
        #We create a dictionary to store the current item.
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        #If labels are provided, we add them to the item dictionary.
        #We compare with None instead of testing truthiness, as the truth value of an array or tensor of labels is ambiguous.
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        #We return the item.
        return item
    def __len__(self):
        """Return the number of examples."""
        #We return the length of the input ids list.
        return len(self.encodings["input_ids"])

In [144]:
#We wrap the encodings and their labels in a train and a test dataset.
train_dataset = Dataset(train_encodings, y_train)
test_dataset = Dataset(test_encodings, y_test)

In [145]:
#We count the occurrence of the labels in the train and in the test dataset.
print(Counter(train_dataset.labels)); print(Counter(test_dataset.labels))

Counter({0: 1234, 1: 350})
Counter({0: 308, 1: 88})


In [146]:
#We define a function to track the metrics of the model.
def compute_metrics(p):
    """Compute the accuracy and the macro averaged precision, recall and f1 score.

    p unpacks into the pair (scores, labels). The predicted class of every row
    is the position of its highest score.
    """
    scores, gold = p
    #We take the class with the highest score as the prediction.
    predicted = np.argmax(scores, axis=1)
    #We calculate the macro averaged scores, the support is not needed.
    macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
        y_true=gold, y_pred=predicted, average="macro")
    #We return the metrics, together with the accuracy.
    return {
        "accuracy": accuracy_score(y_true=gold, y_pred=predicted),
        "precision": macro_precision,
        "recall": macro_recall,
        "f1": macro_f1,
    }

In [147]:
#We define our training arguments: 9 epochs, a batch size of 8 per device and a loss log every 50 steps.
#Checkpoints and logs are written to the directory "output".
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=9,
    per_device_train_batch_size=8,
    logging_steps=50)

#We define our trainer. Note that the test dataset is also used as the evaluation dataset.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [148]:
#We train (fine-tune) the model on our train dataset.
trainer.train()

Step,Training Loss
50,0.3884
100,0.2885
150,0.136
200,0.1702
250,0.1238
300,0.0977
350,0.0596
400,0.0551
450,0.041
500,0.0434




TrainOutput(global_step=891, training_loss=0.090747172081912, metrics={'train_runtime': 138.9723, 'train_samples_per_second': 102.582, 'train_steps_per_second': 6.411, 'total_flos': 1113551764047360.0, 'train_loss': 0.090747172081912, 'epoch': 9.0})

In [149]:
#We evaluate our trained model on the evaluation dataset of the trainer (our test data).
trainer.evaluate()

{'eval_loss': 0.5673494935035706,
 'eval_accuracy': 0.9040404040404041,
 'eval_precision': 0.8579090389016018,
 'eval_recall': 0.8693181818181818,
 'eval_f1': 0.8633986928104574,
 'eval_runtime': 1.2776,
 'eval_samples_per_second': 309.963,
 'eval_steps_per_second': 19.568,
 'epoch': 9.0}

In [150]:
#We save our fine-tuned model under "process_information_classifier".
trainer.save_model("process_information_classifier")

In [151]:
#We read our saved process info model back in from "process_information_classifier".
model_process_information = RobertaForSequenceClassification.from_pretrained("process_information_classifier")

In [152]:
#We move the model to the GPU when one is available and fall back to the CPU otherwise.
model_process_information.to("cuda" if torch.cuda.is_available() else "cpu")

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(40000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [153]:
#We define a function to get the model predictions.
def get_model_predictions(text, model=None, output_format="labels"):
    """Predict with a sequence classifier whether a text contains process information.

    Parameters
    ----------
    text : the text to classify. The label is based on the first row of the model output.
    model : the classifier to use. When left at None, the fine-tuned
        model_process_information of this notebook is used.
    output_format : "labels" to get the predicted class (0 or 1), or "raw" to get
        the softmax probabilities, rounded to three decimals, as a numpy array.

    Returns
    -------
    int (0 or 1) for output_format "labels", numpy array for output_format "raw".

    Raises
    ------
    ValueError when output_format is neither "labels" nor "raw".
    """
    if output_format not in ("labels", "raw"):
        raise ValueError(f"output_format must be 'labels' or 'raw', got {output_format!r}")
    #We resolve the default model at call time, so that the function can be defined before the model is loaded.
    if model is None:
        model = model_process_information

    #We tokenize the input and put it on the same device as the model.
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(model.device)
    #We extract the outputs. No gradients are needed for a prediction.
    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()

    if output_format == "raw":
        return np.round(probabilities, 3)

    #If the probability of the first class is higher than that of the second, we return the first class.
    #We compare the unrounded probabilities, so that rounding cannot turn a narrow win of the first class into a tie.
    if probabilities[0][0] > probabilities[0][1]:
        return 0
    #Else it will be the second class.
    return 1

In [154]:
#We predict every sentence in our test data with our process info model.
#The predictions are collected directly, so that the label list y of the train/test split is not overwritten.
y_pred_roberta = [get_model_predictions(sent, model=model_process_information) for sent in tqdm(X_test)]

#We count the presence of each class.
print(Counter(y_pred_roberta))

100%|██████████| 396/396 [00:03<00:00, 117.93it/s]

Counter({0: 304, 1: 92})





In [155]:
#We compare the performance of the Roberta predictions with our manual coding of the test data.
overview_roberta = get_metrics_table(y_test, y_pred_roberta, feature="process information", model="Roberta")

              precision  recall  f1-score  support
0                  0.94    0.93      0.94    308.0
1                  0.77    0.81      0.79     88.0
accuracy           0.90    0.90      0.90      0.9
macro avg          0.86    0.87      0.86    396.0
weighted avg       0.91    0.90      0.90    396.0


In [156]:
#We stack the Roberta performance overview under our earlier Naive Bayes performance overview.
overview_process = pd.concat([overview_nb, overview_roberta])

In [157]:
#We display the combined overview.
overview_process

Unnamed: 0,feature,model,class,precision,recall,f1-score,support,accuracy
0,process information,Naive Bayes,no,0.88,0.98,0.93,308,0.88
1,process information,Naive Bayes,yes,0.88,0.52,0.66,88,
0,process information,Roberta,no,0.94,0.93,0.94,308,0.9
1,process information,Roberta,yes,0.77,0.81,0.79,88,


In [158]:
#We write these combined metrics away as a csv.
overview_process.to_csv("overview_process.csv")

### predicting process information

In [8]:
#We read in our data with all self referential sentences and use its first column as the index.
df = pd.read_csv("21_self_ref_sent.csv", index_col=0)

In [None]:
#We parse the self referral texts, which are read in as strings, back into Python objects.
#Presumably these are lists of sentences, as they are looped over further below.
df["self_referral_text"] = df["self_referral_text"].apply(literal_eval)

In [165]:
#We define a function to produce the process info factor.
def get_process_info_factor(x, length=None):
    """Calculate the share of sentences of an article that are predicted to hold process information.

    Parameters
    ----------
    x : a row with the fields "self_referral_presence", "self_referral_text"
        (an iterable of sentences), "txt_text_one" (the article text) and
        "news_source" (the name of the outlet). Its attribute name is used for
        the progress percentage.
    length : the highest row label, only used for the progress percentage.
        When left at None, the highest index value of df is taken at call time.

    Returns
    -------
    0 when there are no self referral sentences or when the text has two
    sentences or fewer. Otherwise the number of self referral sentences that are
    predicted as process information, divided by the number of sentences of the text.
    """
    #We resolve the default at call time, so that it always reflects the currently loaded data.
    if length is None:
        length = df.index.max()
    #We print the progress, unless the percentage can not be calculated.
    if length:
        print(f"{np.round(x.name/length*100,2)}", end="\r")

    #If no self referral sentences are present at all it is directly a score of 0.
    #We compare with == / != instead of "is", so that numpy booleans are handled as well.
    if x["self_referral_presence"] != True:
        return 0

    n_sentences = len(sent_tokenize(x["txt_text_one"]))
    #We add a minimum to it, namely if the text only has two sentences, we simply give it a score of 0.
    if n_sentences <= 2:
        return 0

    #We escape the outlet name, so that characters such as the "." in "NU.nl" are matched literally.
    news_source = re.escape(str(x["news_source"]))
    source_pattern = rf"\b({news_source}(\.nl)?)\b"

    n_process_sentences = 0
    #We loop over the self referential sentences.
    for sent in x["self_referral_text"]:
        #If an outlet name is present we replace it by SOURCE to reduce outlet bias.
        sent = re.sub(source_pattern, "SOURCE", sent)
        #We then predict the presence or absence of process info (1 or 0) with our process info model.
        n_process_sentences += get_model_predictions(sent, model=model_process_information)
    #We divide the number of process info sentences by the total number of sentences.
    return n_process_sentences / n_sentences

In [166]:
#We apply the function to every row of our data and store the result as the process info factor.
df["process_info_factor"] = df.apply(lambda x: get_process_info_factor(x), axis=1)

100.0

In [167]:
#We print some descriptives of the process info factor.
df.process_info_factor.describe()

count    28901.000000
mean         0.003703
std          0.023204
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          0.545455
Name: process_info_factor, dtype: float64

In [168]:
#We write the data, including the process info factor, away as a csv.
df.to_csv("df_w_process_info_factor.csv")