# Segment retrieval

In [86]:
import pandas as pd
import numpy as np
import re

## 1. Loading the data

In [43]:
excel1 = "corpus_three_annot.xlsx"
excel2 = "corpus_two_annot.xlsx"
corpus1 = pd.read_excel(excel1)
corpus2 = pd.read_excel(excel2)

### 1. 1. Annotated corpus

In [44]:
corpus1 = corpus1[["id_segment", 'text_segment', "relevance_label", "relevance_type_norm"]]
print("Lenght corpus 1 : {}".format(len(corpus1.index)))

Lenght corpus 1 : 161


In [45]:
corpus2 = corpus2[["id_segment", 'text_segment', "relevance_label", "relevance_type_norm"]]
print("Lenght corpus 2 : {}".format(len(corpus2.index)))

Lenght corpus 2 : 169


In [46]:
annotated_corpus = pd.concat([corpus1, corpus2])
len(annotated_corpus.index)

330

In [47]:
pd.DataFrame(annotated_corpus["relevance_label"].value_counts())

Unnamed: 0,relevance_label
0,218
1,71
2,41


In [48]:
annotated_corpus.head()

Unnamed: 0,id_segment,text_segment,relevance_label,relevance_type_norm
0,1-s2.0-S0301479717300713-main_226b,"#text': '(Lambin et al., 2003)'}], '#text': 'W...",0,
1,1-s2.0-S0303243414001718-main_19b,#text': 'After the droughts in the 1970s and 1...,2,"LULCC, DRIVERS"
2,1-s2.0-S030438781000043X-mainext_313b,#text': 'Few investment opportunities are avai...,0,
3,1-s2.0-S095937809800003X-main_75b,#text': 'Pastoral production has often existed...,1,PRACTICES
4,1-s2.0-S0006320709005400-main_16b,#text': 'The forests of West and Central Afric...,2,LULC


### 1. 2. All corpus

In [22]:
all_corpus = pd.read_excel("all_corpus_processed.xlsx")

In [23]:
len(all_corpus.index)

20345

### 1.3. Cleaning text

In [49]:
def clean_text(text):
    text = str(text)
    # Use a regular expression to find and remove anything between curly braces
    cleaned_text = re.sub(r'\{.*?\}', ' ', text)

    # Use a regular expression to remove all occurrences of '#text':
    cleaned_text = re.sub(r"'#text':", '', cleaned_text)

    cleaned_text = re.sub(r"'#text':", '', cleaned_text)
    # Use a regular expression to remove all occurrences of '@xmlns':
    cleaned_text = re.sub(r"'@xmlns':", '', cleaned_text)

    # Use a regular expression to remove all occurrences of '<variable>: {'
    cleaned_text = re.sub(r"'\w+':\s*", ' ', cleaned_text)

    # Remove any sequence of more than two special characters or spaces
    cleaned_text = re.sub(r'[^a-zA-Z0-9 .%]+', ' ', cleaned_text)

    # Clean up excessive spaces
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    
    return cleaned_text

In [50]:
annotated_corpus['cleaned_text'] = annotated_corpus['text_segment'].apply(clean_text)
all_corpus['cleaned_text'] = all_corpus['text_segment'].apply(clean_text)

## 2. Supervised classification

In [15]:
import os
from tqdm import tqdm
tqdm.monitor_interval = 0
tqdm.pandas()
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_predict, KFold, train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support, classification_report, confusion_matrix, accuracy_score, cohen_kappa_score, f1_score, recall_score, precision_score
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.multiclass import unique_labels
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from sklearn.pipeline import Pipeline

In [16]:
import datasets
from datasets import Dataset, DatasetDict
from datasets import ClassLabel

In [51]:
annotated_corpus["label_encoded"] = pd.Categorical(annotated_corpus["relevance_label"], ordered=True).codes

#### LinearSVM

In [52]:
# Texts to vectorize
X = annotated_corpus['cleaned_text'].tolist() 
X = [x.lower() for x in X]
Y = annotated_corpus['label_encoded'].tolist() 

# Initialize models
svc_model = LinearSVC(class_weight='balanced')

def calculate_metrics(model, X, y, cv= 5):
    y_pred = cross_val_predict(model, X, y, cv=cv)
    precision, recall, f1, _ = precision_recall_fscore_support(y, y_pred, average=None)
    
    print("Classification Report:")
    print(classification_report(y, y_pred))

    return precision, recall, f1

Default values for TF-IDF vectorizer.

In [53]:
# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer()

# Convert texts to TF-IDF matrix
X_tfidf = tfidf.fit_transform(X)

# Evaluate Linear SVC
print("Linear SVC Metrics:")
svc_precision, svc_recall, svc_f1 = calculate_metrics(svc_model, X_tfidf, Y)

Linear SVC Metrics:
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.94      0.82       218
           1       0.34      0.14      0.20        71
           2       0.82      0.34      0.48        41

    accuracy                           0.69       330
   macro avg       0.63      0.47      0.50       330
weighted avg       0.65      0.69      0.64       330



In [85]:
# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')  

# Convert texts to TF-IDF matrix
X_tfidf = tfidf.fit_transform(X)

# Evaluate Linear SVC
print("Linear SVC Metrics:")
svc_precision, svc_recall, svc_f1 = calculate_metrics(svc_model, X_tfidf, Y)

Linear SVC Metrics:
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.96      0.83       218
           1       0.52      0.20      0.29        71
           2       0.77      0.24      0.37        41

    accuracy                           0.71       330
   macro avg       0.67      0.47      0.49       330
weighted avg       0.69      0.71      0.65       330



#### BERT

In [72]:
annotated_corpus["cleaned_text"] = annotated_corpus["cleaned_text"].astype("string")
annotated_corpus["label"] = pd.Categorical(annotated_corpus["relevance_label"], ordered=True).codes

import datasets
from datasets import Dataset, DatasetDict
from datasets import ClassLabel

dataset = Dataset.from_pandas(annotated_corpus[["cleaned_text", "label"]])
dataset = dataset.train_test_split(test_size=0.2)
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['cleaned_text', 'label', '__index_level_0__'],
        num_rows: 264
    })
    test: Dataset({
        features: ['cleaned_text', 'label', '__index_level_0__'],
        num_rows: 66
    })
})


In [82]:
# pretrained_model = "bert-base-uncased"
pretrained_model = "roberta-base"

from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
def tokenize_function(batch):
    tokenized_batch = tokenizer(batch['cleaned_text'], padding=True, truncation=True, max_length=128)
    return tokenized_batch

tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model, num_labels=3)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
train_dataset = tokenized_datasets["train"].shuffle(seed=42)
eval_dataset = tokenized_datasets["test"].shuffle(seed=42)

loading configuration file config.json from cache at C:\Users\valentin/.cache\huggingface\hub\models--roberta-base\snapshots\e2da8e2f811d1448a5b465c236feacd80ffbac7b\config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at C:\Users\valentin/.cache\huggingface\hub\models--roberta-base\snapshots\e2da8e2f811d1448a5b465c236feacd80ffbac7b

Map:   0%|          | 0/264 [00:00<?, ? examples/s]

Map:   0%|          | 0/66 [00:00<?, ? examples/s]

In [83]:
# f1_metric = load_metric("f1")
from datasets import load_metric

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    my_metrics ={
        "accuracy": accuracy_score(y_pred=predictions, y_true=labels),
    "F1": f1_score(y_pred=predictions, y_true=labels, average = 'weighted')
        }

    classif_report = classification_report(predictions, labels, digits=2, output_dict = True)
    all_metrics = df_scores_small(classif_report)
    
    all_metrics.to_csv(f'{filename}.csv')
    return my_metrics

In [84]:
from transformers import TrainingArguments, Trainer
import numpy as np

def df_scores_small(classif_report):
    cls = [x for x in classif_report.keys() if x not in ['accuracy', 'macro avg', 'weighted avg']]
    dfs = []
    for key in cls:
        df = pd.DataFrame(classif_report[key], index=[key])
        dfs.append(df)

    final_df = pd.concat(dfs)
    for key in ['macro avg', 'weighted avg']:
        df = pd.DataFrame(classif_report[key], index=[key])
        dfs.append(df)

    final_df = pd.concat(dfs)
    final_df['accuracy'] = classif_report['accuracy']

    return(final_df)    


training_args = TrainingArguments(
    output_dir="test_trainer", 
    evaluation_strategy="epoch",
    num_train_epochs=5,
    #batch_size=16,
    seed=42
    )

filename = "cm_roberta"
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: cleaned_text, __index_level_0__. If cleaned_text, __index_level_0__ are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 264
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 165
  Number of trainable parameters = 124647939


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.656272,0.681818,0.620073
2,No log,0.64746,0.757576,0.665308
3,No log,0.521572,0.757576,0.770349
4,No log,0.752479,0.727273,0.734754
5,No log,0.714733,0.757576,0.754267


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: cleaned_text, __index_level_0__. If cleaned_text, __index_level_0__ are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 66
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: cleaned_text, __index_level_0__. If cleaned_text, __index_level_0__ are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 66
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
The following columns in the evaluation set don't have a corres

TrainOutput(global_step=165, training_loss=0.513590957179214, metrics={'train_runtime': 2299.7633, 'train_samples_per_second': 0.574, 'train_steps_per_second': 0.072, 'total_flos': 86827427850240.0, 'train_loss': 0.513590957179214, 'epoch': 5.0})

In [185]:
trainer.save_model("roberta_model")

Saving model checkpoint to roberta_model
Configuration saved in roberta_model\config.json
Model weights saved in roberta_model\pytorch_model.bin


In [187]:
from transformers import pipeline
classification_model = AutoModelForSequenceClassification.from_pretrained("roberta_model")
clf = pipeline("text-classification", model = classification_model, tokenizer=tokenizer, 
               truncation = True, max_length = 128, return_all_scores = True)



## 3. Segment retrieval

### 3.1. Random selection of segments

In [188]:
all_corpus_match = all_corpus[all_corpus['Match'] == 1]

In [189]:
import random
all_sentences_match = all_corpus_match["cleaned_text"].tolist()
random.shuffle(all_sentences_match)

In [190]:
all_sentences_match_eval = all_sentences_match[0:200]

### 3.2. Label predicition with custom trained RoBERTa model

In [198]:
rows = list()
for text in tqdm(all_sentences_match_eval, total=len(all_sentences_match_eval)):
    prediction = clf(text)[0]
    max_prob = max(prediction, key=lambda x: x['score'])
    label = prediction.index(max_prob)
    rows.append((label, prediction[1]['score'],prediction[2]['score']))
    
predictions_roberta  = pd.DataFrame(rows)


  0%|                                                                                          | 0/200 [00:00<?, ?it/s][A
  1%|▊                                                                                 | 2/200 [00:00<00:24,  8.03it/s][A
  2%|█▋                                                                                | 4/200 [00:00<00:23,  8.33it/s][A
  3%|██▍                                                                               | 6/200 [00:00<00:25,  7.66it/s][A
  4%|██▊                                                                               | 7/200 [00:00<00:27,  7.10it/s][A
  4%|███▋                                                                              | 9/200 [00:01<00:24,  7.71it/s][A
  5%|████                                                                             | 10/200 [00:01<00:24,  7.90it/s][A
  6%|████▊                                                                            | 12/200 [00:01<00:21,  8.87it/s][A
  6%|█████▎    

In [203]:
predictions_roberta.columns = ['pred_label', 'prob_label_1', 'prob_label_2']
predictions_roberta['pred_label'].value_counts()

0    118
1     49
2     33
Name: pred_label, dtype: int64

In [225]:
df_analysis = pd.DataFrame({"cleaned_text": all_sentences_match_eval})

In [226]:
df_analysis = pd.concat([df_analysis, predictions_roberta], axis = 1)

In [213]:
df_analysis

Unnamed: 0,cleaned_text,pred_label,prob_label_1,prob_label_2
0,The cause of land use and land cover changes i...,1,0.853471,0.120585
1,One quarter of the millipede species recorded ...,0,0.066776,0.003415
2,We started with the literature analysis to rec...,0,0.004605,0.001459
3,Even though open questions were intentionally ...,0,0.005202,0.001532
4,Next to this the historical time series of lan...,0,0.004718,0.001501
...,...,...,...,...
195,Finally we used the mosaics from the 2 differe...,0,0.004369,0.001450
196,Having explored the state of the literature an...,0,0.006992,0.001289
197,We use the term urban sprawl to describe a spe...,0,0.016153,0.001613
198,C 3 crops are expected to produce more however...,0,0.060429,0.003117


In [215]:
df_analysis_sorted = df_analysis.sort_values(by=['pred_label','prob_label_2'], ascending=False)
for i in range(20) :
    print("Label : {}".format(df_analysis_sorted.iloc[i]['pred_label']))
    print(df_analysis_sorted.iloc[i]['cleaned_text'])
    print('_____________________________________________')

Label : 2
The types of land on which bow occur farmland and degraded savanna increased in northern Benin by 5.4% per year during the period 1975 1990 and 9.5% per year during the periods 1990 2010 while the natural vegetation forest woodland and tree savanna decreased by the same amount
_____________________________________________
Label : 2
Thus farmland persists and increases each year at the expense of forest woodland and tree savanna . The area of natural vegetation forest woodland and tree savanna that was considered degraded increased by 4.1% between the first and the second time periods i.e
_____________________________________________
Label : 2
About 8362.44 ha of tree savanna was converted into shrub savanna and 2605.5 ha into woodland respectively
_____________________________________________
Label : 2
The decrease in rainfall has been associated with a concentration of the cultivation of the sandy soils on the dune
_____________________________________________
Label : 2
The 

### 3.3. Semantic similarity

In [104]:
relevant_sentences = annotated_corpus[annotated_corpus["label"]==2]["cleaned_text"].tolist()

In [106]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

In [217]:
encoding_model = SentenceTransformer(pretrained_model)
similarities_roberta = list()
for sentence in tqdm(all_sentences_match_eval) :
    sim = list()
    e1 = encoding_model.encode(sentence).reshape(1,-1)
    for i in range(len(relevant_sentences)) :
        e2 = encoding_model.encode(relevant_sentences[i]).reshape(1,-1)
        sim = sim + [cosine_similarity(e1, e2)[0][0]]
    similarities_roberta.append(sim)

No sentence-transformers model found with name C:\Users\valentin/.cache\torch\sentence_transformers\roberta-base. Creating a new one with MEAN pooling.
loading configuration file C:\Users\valentin/.cache\torch\sentence_transformers\roberta-base\config.json
Model config RobertaConfig {
  "_name_or_path": "C:\\Users\\valentin/.cache\\torch\\sentence_transformers\\roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weigh

 50%|████████████████████████████████████████                                        | 100/200 [05:45<05:46,  3.47s/it][A
 50%|████████████████████████████████████████▍                                       | 101/200 [05:48<05:41,  3.44s/it][A
 51%|████████████████████████████████████████▊                                       | 102/200 [05:52<05:31,  3.38s/it][A
 52%|█████████████████████████████████████████▏                                      | 103/200 [05:55<05:19,  3.29s/it][A
 52%|█████████████████████████████████████████▌                                      | 104/200 [05:58<05:19,  3.33s/it][A
 52%|██████████████████████████████████████████                                      | 105/200 [06:01<05:16,  3.33s/it][A
 53%|██████████████████████████████████████████▍                                     | 106/200 [06:05<05:14,  3.35s/it][A
 54%|██████████████████████████████████████████▊                                     | 107/200 [06:08<05:12,  3.36s/it][A
 54%|███████████

In [218]:
encoding_model = SentenceTransformer('all-MiniLM-L6-v2')
similarities_miniLM = list()
for sentence in tqdm(all_sentences_match_eval) :
    sim = list()
    e1 = encoding_model.encode(sentence).reshape(1,-1)
    for i in range(len(relevant_sentences)) :
        e2 = encoding_model.encode(relevant_sentences[i]).reshape(1,-1)
        sim = sim + [cosine_similarity(e1, e2)[0][0]]
    similarities_miniLM.append(sim)

loading configuration file C:\Users\valentin/.cache\torch\sentence_transformers\sentence-transformers_all-MiniLM-L6-v2\config.json
Model config BertConfig {
  "_name_or_path": "C:\\Users\\valentin/.cache\\torch\\sentence_transformers\\sentence-transformers_all-MiniLM-L6-v2\\",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file C:\Users\valentin/.cache\torch\sentence_transformers\sentence-transformers_all-MiniLM-L6-v2\pytorch_model.bin
All

 60%|████████████████████████████████████████████████                                | 120/200 [03:34<01:56,  1.46s/it][A
 60%|████████████████████████████████████████████████▍                               | 121/200 [03:35<02:06,  1.60s/it][A
 61%|████████████████████████████████████████████████▊                               | 122/200 [03:38<02:15,  1.73s/it][A
 62%|█████████████████████████████████████████████████▏                              | 123/200 [03:39<02:15,  1.76s/it][A
 62%|█████████████████████████████████████████████████▌                              | 124/200 [03:41<02:03,  1.62s/it][A
 62%|██████████████████████████████████████████████████                              | 125/200 [03:42<02:01,  1.62s/it][A
 63%|██████████████████████████████████████████████████▍                             | 126/200 [03:43<01:49,  1.48s/it][A
 64%|██████████████████████████████████████████████████▊                             | 127/200 [03:45<01:54,  1.57s/it][A
 64%|███████████

In [227]:
df_sim = pd.DataFrame({"sim_miniLM_mean" : [np.mean(x) for x in similarities_miniLM],
                      "sim_miniLM_max" : [np.max(x) for x in similarities_miniLM],
                      "sim_roberta" : [np.mean(x) for x in similarities_roberta],
                       "sim_roberta_max" : [np.max(x) for x in similarities_roberta]})

In [228]:
df_analysis = pd.concat([df_analysis, df_sim], axis = 1)

In [237]:
df_analysis.to_excel("df_analysis.xlsx", index = False)