# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put those the bottom of this ipynb file*

# 0.Setting Colab Method for future model developing
Firstly, run the following block to mount the drive to the colab. Then, drag the data folder/**eval.py** to the "Colab Folder Space" to ensure the code runs successfully.

If data folder updated, attempt to forcibly remount, call `drive.mount("/content/drive", force_remount=True)`.


In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

# 1.DataSet Processing

## 1.1 Reading and gathering data

Using `json` package reading and gathering claims and evidences, then print an output.

In [2]:
import json
from collections import Counter
from statistics import mean

with open('data/train-claims.json', 'r') as input_file:
    train_claim_data = json.load(input_file)

# Read in development data (claim)
with open('data/dev-claims.json', 'r') as input_file:
    dev_claim_data = json.load(input_file)

# Read in test data (claim)
with open('data/test-claims-unlabelled.json', 'r') as input_file:
    test_claim_data = json.load(input_file)

# Read in evidence data
with open('data/evidence.json', 'r') as input_file:
    evi_data = json.load(input_file)

#EDA


claim_count = 0
evi_count = 0
claim_length = []
evidence_count = []
evidence_length = []
labels = []

for key,value in train_claim_data.items():
    claim_count+=1
    claim_length.append(len(value["claim_text"]))
    evidence_count.append(len(value["evidences"]))
    evidence_length += [len(evi_data[x]) for x in value["evidences"]]
    labels.append(value["claim_label"])

for key,value in evi_data.items():
    evi_count+=1

print("claim count: ",claim_count)
print("evidence count: ",evi_count)
print("max claim length: ",max(claim_length))
print("min claim length: ",min(claim_length))
print("mean claim length: ",mean(claim_length))
print("max evidence count: ",max(evidence_count))
print("min evidence count: ",min(evidence_count))
print("mean evidence count: ",mean(evidence_count))
print("max evidence length: ",max(evidence_length))
print("min evidence length: ",min(evidence_length))
print("mean evidence length: ",mean(evidence_length))
print(Counter(labels))



inside = 0
outside = 0

train_evi_id = []
for claim_id,claim_value in train_claim_data.items():
    train_evi_id=train_evi_id+claim_value['evidences']

for claim_id,claim_value in dev_claim_data.items():
    test_evi_id=claim_value['evidences']
    for e in test_evi_id:
        if e in train_evi_id:
            inside += 1
        else:
            outside += 1
print("Dev evi inside train evi", inside)
print("Dev evi outside train evi", outside)

full_evidence_id = list(evi_data.keys())
full_evidence_text  = list(evi_data.values())
train_claim_id = list(train_claim_data.keys())
train_claim_text  = [ v["claim_text"] for v in train_claim_data.values()]


claim count:  1228
evidence count:  1208827
max claim length:  332
min claim length:  26
mean claim length:  122.95521172638436
max evidence count:  5
min evidence count:  1
mean evidence count:  3.3566775244299674
max evidence length:  1979
min evidence length:  13
mean evidence length:  173.5
Counter({'SUPPORTS': 519, 'NOT_ENOUGH_INFO': 386, 'REFUTES': 199, 'DISPUTED': 124})
Dev evi inside train evi 163
Dev evi outside train evi 328


## 1.2 Data preprocessing

### Implementing preprocessing fuctions

In [3]:
import nltk
import string
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
stopwords = set(stopwords.words('english'))

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word, 'v')
    return lemma if lemma != word else lemmatizer.lemmatize(word, 'n')

def is_pure_english(text):
    english_letters = set(string.ascii_letters)
    cleaned_text = ''.join(char for char in text if char.isalpha() or char.isspace())
    return all(char in english_letters or char.isspace() for char in cleaned_text)

def remove_non_eng(dictionary):
    eng_data = {}
    for key, value in dictionary.items():
        if is_pure_english(value):
            eng_data[key] = value
    return eng_data

def contains_climate_keywords(text, keywords):
    text = text.lower()
    for keyword in keywords:
        if re.search(r"\b" + re.escape(keyword) + r"\b", text):
            return True
    return False

def filter_climate_related(dictionary, keywords):
    cs_data = {}
    for key, value in dictionary.items():
        if contains_climate_keywords(value, keywords):
            cs_data[key] = value
    return cs_data

def text_preprocessing(text, remove_stopwords=False):
    words = [lemmatize(w) for w in text.lower().split()]
    if remove_stopwords:
        words = [w for w in words if w not in stopwords]
    return " ".join(words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ABC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ABC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Implementing **Claim data preprocessing** and **Evidence data preprocessing** functions

In [4]:
# 权威网站 https://www.ucdavis.edu/climate/definitions
climate_keywords = [
    "climate", "environment", "global warming", "greenhouse effect", "carbon", "co2", "carbon dioxide",
    "methane", "renewable energy", "sustainability", "ecology", "biodiversity", "fossil fuels",
    "emissions", "air quality", "ozone", "solar energy", "wind energy", "climate change", "climate crisis",
    "climate adaptation", "climate mitigation", "ocean", "sea levels", "ice melting", "deforestation",
    "reforestation", "pollution"
]

def filter_evidence_by_train(train_claim_data, evidence_data):

    # Collect all evidence ids in the training set
    train_evidence_ids = set()

    for claim in train_claim_data.values():
        train_evidence_ids.update(claim['evidences'])

    # filter evidence data by the evidence ids in the training set
    filtered_evidence_data = {key: value for key, value in evidence_data.items() if key in train_evidence_ids}

    return filtered_evidence_data

def preprocess_claim_data(claim_data):

    claim_data = remove_non_eng(claim_data)
    for key in claim_data.keys():
        claim_data[key]["claim_text"] = text_preprocessing(claim_data[key]["claim_text"])

    claim_data_text = [claim_data[key]["claim_text"] for key in claim_data.keys()]
    claim_data_id = list(claim_data.keys())

    return claim_data_text, claim_data_id

def preprocess_evi_data(evi_data, climate_keywords, train_claim_data):
    evi_data = remove_non_eng(evi_data)
    cs_evi_data = filter_climate_related(evi_data, climate_keywords)

    # filter evidence data by the evidence ids in the training set
    train_evi_data = filter_evidence_by_train(train_claim_data, cs_evi_data)

    for key in train_evi_data.keys():
        train_evi_data[key] = text_preprocessing(train_evi_data[key], remove_stopwords=True)

    cleaned_evidence_text = list(train_evi_data.values())
    cleaned_evidence_id = list(train_evi_data.keys())

    return cleaned_evidence_text, cleaned_evidence_id



### Start dataset preprocessing

In [5]:
train_claim_text, train_claim_id = preprocess_claim_data(train_claim_data)
dev_claim_text, dev_claim_id = preprocess_claim_data(dev_claim_data)
test_claim_text, test_claim_id = preprocess_claim_data(test_claim_data)
cleaned_evidence_text, cleaned_evidence_id = preprocess_evi_data(evi_data, climate_keywords, train_claim_data)

## 1.3 Development Set Prediction

In this section, we perform the main tasks of the project on the development set:

1. **Evidence Retrieval**: For each claim, find the most relevant evidence from the corpus.
2. **Claim Classification**: Predict the label for each claim based on the retrieved evidence and the claim's similarity to the training claims.

The code uses TF-IDF vectorization and cosine similarity to measure the relevance between claims and evidence, and between development and training claims. The most similar evidence and training claims are used for prediction.

In [7]:
import operator
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# dev remove stop word会好一点， test不remove反而好
evidence_tfidf_vectorizer = TfidfVectorizer(stop_words="english", use_idf=True)

# claim_tfidf_vectorizer = TfidfVectorizer(stop_words="english", use_idf=True)
claim_tfidf_vectorizer = TfidfVectorizer(use_idf=True)

evidence_tfidf_vectorizer.fit(train_claim_text+cleaned_evidence_text) #english_evidence#
train_claim_emb_list = claim_tfidf_vectorizer.fit_transform(train_claim_text)

full_evi_emb_list = evidence_tfidf_vectorizer.transform(cleaned_evidence_text) #english_evidence#


In [8]:
# evi_k=4
# claim_k=1

# with open('data/dev-claims.json', 'r') as input_file:
#     test_out_temp = json.load(input_file)

# for claim_id,claim_value in test_out_temp.items():
#     # Task1
#     # 把test claim转化成vector

#     test_claim_emb = evidence_tfidf_vectorizer.transform([claim_value['claim_text']])
#     evi_sim_dict = {}

#     # 计算出test claim和所有evidence的相似度
#     sim = cosine_similarity(test_claim_emb, full_evi_emb_list)[0]

#     for i in range(len(sim)):
#         evi_sim_dict[cleaned_evidence_id[i]] = sim[i]

#     # 对evidence根据和claim的相似度排序
#     s_sim = [(k, v) for k, v in sorted(evi_sim_dict.items(), key=lambda item: item[1],reverse=True)][:evi_k]
#     sel_sim = [k for k,v in s_sim]
#     # 把最相似的前k个evidence的id写入到test claim的evidence list
#     test_out_temp[claim_id]["evidences"] = sel_sim

#     # Task2
#     # 把test claim转化成vector 注意我这两问用了不同的vectorizer，因为有不同的预处理步骤，和应用数据目标
#     test_claim_emb = claim_tfidf_vectorizer.transform([claim_value['claim_text']])

#     # 计算出test claim和所有train claim的相似度
#     claim_sim_dict = {}
#     claim_sim = cosine_similarity(test_claim_emb, train_claim_emb_list)[0]
#     for i in range(len(claim_sim)):
#         claim_sim_dict[train_claim_id[i]] = claim_sim[i]

#     # 取最相似的k个train claim
#     most_sim_claims = [(k, v) for k, v in sorted(claim_sim_dict.items(), key=lambda item: item[1],reverse=True)]
#     # 我这里用的是k=1只考虑最相似的那一个，label拿出来
#     most_sim_claim = max(most_sim_claims, key=operator.itemgetter(1))[0]

#     test_out_temp[claim_id]["claim_label"] = train_claim_data[most_sim_claim]["claim_label"]



# # Writing to sample.json
# with open("data/dev_predict.json", "w") as outfile:
#     json.dump(test_out_temp, outfile)

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [9]:
print(next(iter(train_claim_data.values())))
print(len(train_claim_data))
print(train_claim_id[0])
print(train_claim_text[0])

{'claim_text': 'not only be there no scientific evidence that co2 be a pollutant, higher co2 concentration actually help ecosystem support more plant and animal life.', 'claim_label': 'DISPUTED', 'evidences': ['evidence-442946', 'evidence-1194317', 'evidence-12171']}
1228
claim-1937
not only be there no scientific evidence that co2 be a pollutant, higher co2 concentration actually help ecosystem support more plant and animal life.


In [14]:
dev_retrieval_cls_data = []
dev_retrieval_cls_label = []
cleaned_evidence = dict(zip(cleaned_evidence_id, cleaned_evidence_text))
cleaned_dev_claim = dict(zip(dev_claim_id, dev_claim_text))
get_max_len_evi = []

with open('data/dev-claims.json', 'r') as input_file:
    test_out_temp = json.load(input_file)

for claim_id,claim_value in cleaned_dev_claim.items():
    # Task1
    # 把test claim转化成vector
    print(claim_value)
    print(len(claim_value))
    get_max_len_evi.append(len(claim_value))
    
    test_claim_emb = evidence_tfidf_vectorizer.transform([claim_value])
    evi_sim_dict = {}
    
    # 计算出test claim和所有evidence的相似度
    sim = cosine_similarity(test_claim_emb, full_evi_emb_list)[0]

    for i in range(len(sim)):
        evi_sim_dict[cleaned_evidence_id[i]] = sim[i]
    s_sim = [(k, v) for k, v in sorted(evi_sim_dict.items(), key=lambda item: item[1])]
    s_sim_top = [(k, v) for k, v in sorted(evi_sim_dict.items(), key=lambda item: item[1],reverse=True)][:5]

    for i in s_sim:
        if i in s_sim_top:
            print(i)
            dev_retrieval_cls_label.append(1)
        else:
            dev_retrieval_cls_label.append(0)

    sum_s_sim = 0

    for i in s_sim:
        sum_s_sim += i[1]
        dev_retrieval_cls_data.append(claim_value + cleaned_evidence[i[0]])
        
    mean_s_sim = sum_s_sim/200

    # print(mean_s_sim)
    # print(s_sim)
    
    print(max(get_max_len_evi))

[south australia] have the most expensive electricity in the world.
0.029390009925948594
[('evidence-4241', 0.0), ('evidence-4674', 0.0), ('evidence-5055', 0.0), ('evidence-5116', 0.0), ('evidence-5251', 0.0), ('evidence-5381', 0.0), ('evidence-6590', 0.0), ('evidence-8822', 0.0), ('evidence-11688', 0.0), ('evidence-12171', 0.0), ('evidence-17867', 0.0), ('evidence-19522', 0.0), ('evidence-19964', 0.0), ('evidence-22100', 0.0), ('evidence-22475', 0.0), ('evidence-23743', 0.0), ('evidence-24687', 0.0), ('evidence-25019', 0.0), ('evidence-26359', 0.0), ('evidence-26773', 0.0), ('evidence-26809', 0.0), ('evidence-27377', 0.0), ('evidence-27584', 0.0), ('evidence-28478', 0.0), ('evidence-29224', 0.0), ('evidence-29528', 0.0), ('evidence-29963', 0.0), ('evidence-30044', 0.0), ('evidence-30352', 0.0), ('evidence-31308', 0.0), ('evidence-32112', 0.0), ('evidence-33883', 0.0), ('evidence-34792', 0.0), ('evidence-35538', 0.0), ('evidence-36052', 0.0), ('evidence-37319', 0.0), ('evidence-37705',

In [59]:
train_retrieval_cls_data = []
train_retrieval_cls_label = []
cleaned_train_claim = dict(zip(train_claim_id, train_claim_text))

with open('data/train-claims.json', 'r') as input_file:
    train_claims = json.load(input_file)

print(next(iter(cleaned_train_claim.items())))
print(train_claim_emb_list.shape)

('claim-1937', 'not only be there no scientific evidence that co2 be a pollutant, higher co2 concentration actually help ecosystem support more plant and animal life.')
(1228, 3423)


In [77]:
print(len(test_out_temp))
print(train_claim_id)
print(train_claim_text)
print(len(cleaned_train_claim))

154
['claim-1937', 'claim-126', 'claim-2510', 'claim-2021', 'claim-2449', 'claim-851', 'claim-2773', 'claim-949', 'claim-1019', 'claim-2834', 'claim-1441', 'claim-1181', 'claim-2417', 'claim-2152', 'claim-826', 'claim-1066', 'claim-3003', 'claim-3059', 'claim-568', 'claim-60', 'claim-378', 'claim-2486', 'claim-1782', 'claim-1235', 'claim-2065', 'claim-1390', 'claim-2789', 'claim-1414', 'claim-377', 'claim-558', 'claim-1447', 'claim-1038', 'claim-2794', 'claim-930', 'claim-2763', 'claim-1920', 'claim-2961', 'claim-1395', 'claim-1404', 'claim-2257', 'claim-646', 'claim-1545', 'claim-1317', 'claim-1547', 'claim-2280', 'claim-118', 'claim-1357', 'claim-69', 'claim-2849', 'claim-499', 'claim-749', 'claim-428', 'claim-1837', 'claim-1875', 'claim-1324', 'claim-1980', 'claim-409', 'claim-2193', 'claim-3084', 'claim-1227', 'claim-1499', 'claim-1790', 'claim-2090', 'claim-2199', 'claim-1498', 'claim-1792', 'claim-2540', 'claim-2474', 'claim-2741', 'claim-3102', 'claim-1112', 'claim-2488', 'claim

In [81]:
for claim_id, claim_data in train_claims.items():
    
    if claim_id in cleaned_train_claim:
        claim_text = cleaned_train_claim[claim_id]
       
        for evidence_id in claim_data['evidences']:
            
            if evidence_id in cleaned_evidence:
                train_retrieval_cls_data.append(claim_text + cleaned_evidence[evidence_id])
                train_retrieval_cls_label.append(1)

In [82]:
print(train_retrieval_cls_data[1])
print(len(train_retrieval_cls_label))
print(len(train_retrieval_cls_data))
print(train_retrieval_cls_label[1])

not only be there no scientific evidence that co2 be a pollutant, higher co2 concentration actually help ecosystem support more plant and animal life.plant grow much 50 percent faster concentration 1,000 ppm co 2 compare ambient conditions, though assume change climate limitation nutrients.
5487
5487
1


In [84]:
print(Counter(dev_retrieval_cls_label))
print(dev_retrieval_cls_data[1])
print(len(dev_retrieval_cls_label))
print(len(dev_retrieval_cls_data))
# data : claim+ 1 evi
# label: 1 or 0 means whether this evi is in top 200(hive high similarity so may right)

Counter({0: 190960, 1: 770})
[south australia] have the most expensive electricity in the world.[the] frequency, [the] ferocity untimely rain increases, [along with] erratic monsoons, drought floods; cause [by climate change]."
191730
191730


In [85]:
print(Counter(train_retrieval_cls_label))
print(train_retrieval_cls_data[1])
print(len(train_retrieval_cls_label))
print(len(train_retrieval_cls_data))

Counter({1: 5487})
not only be there no scientific evidence that co2 be a pollutant, higher co2 concentration actually help ecosystem support more plant and animal life.plant grow much 50 percent faster concentration 1,000 ppm co 2 compare ambient conditions, though assume change climate limitation nutrients.
5487
5487


In [86]:
# need to install
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(train_retrieval_cls_data)

In [87]:
vocab_size = len(tokenizer.word_index) + 1  # 0 is padding token
print(vocab_size)
print(tokenizer.word_index)

5529
{'<UNK>': 1, 'the': 2, 'be': 3, 'climate': 4, 'of': 5, 'global': 6, 'a': 7, 'change': 8, 'warm': 9, 'to': 10, 'and': 11, 'in': 12, 'carbon': 13, 'that': 14, 'increase': 15, 'temperature': 16, 'human': 17, 'greenhouse': 18, 'cause': 19, 'emission': 20, 'co2': 21, 'have': 22, 'gas': 23, 'dioxide': 24, 'rise': 25, 'ocean': 26, 'sea': 27, 'ice': 28, 'level': 29, 'year': 30, 'by': 31, 'atmosphere': 32, 'warming': 33, 'since': 34, 'effect': 35, '2': 36, 'for': 37, 'on': 38, 'more': 39, 'not': 40, 'scientist': 41, 'it': 42, 'surface': 43, '1': 44, 'scientific': 45, 'than': 46, 'century': 47, 'would': 48, 'atmospheric': 49, 'energy': 50, 'methane': 51, 'evidence': 52, 'from': 53, 'concentration': 54, 'time': 55, 'due': 56, 'activity': 57, 'heat': 58, 'co': 59, 'report': 60, 'could': 61, 'fossil': 62, 'water': 63, '0': 64, 'natural': 65, 'show': 66, 'model': 67, 'arctic': 68, 'lead': 69, 'there': 70, 'use': 71, 'consensus': 72, 'also': 73, 'recent': 74, 'ipcc': 75, 'per': 76, "earth's": 77

In [88]:
xseq_train = tokenizer.texts_to_sequences(train_retrieval_cls_data)
xseq_dev = tokenizer.texts_to_sequences(dev_retrieval_cls_data)

In [89]:
max_i = 0
for i in xseq_train:
    max_i = max(max_i, len(i))
print(max_i)

max_i = 0
for i in xseq_dev:
    max_i = max(max_i, len(i))
print(max_i)

236
266


In [90]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

maxlen = 320
xseq_train = pad_sequences(xseq_train, maxlen=maxlen, padding='post')
xseq_dev = pad_sequences(xseq_dev, maxlen=maxlen, padding='post')

In [91]:
# from workshop
import tensorflow as tf
from tensorflow.keras.layers import LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.regularizers import l2


embedding_dim = 200
hidden_dim = 400

#model definition
# feedforward network (MLP)
model = Sequential(name="retrieval_cls_lstm")
model.add(layers.Embedding(input_dim=vocab_size,
                           output_dim=embedding_dim,
                           input_length=maxlen))

model.add(layers.Dropout(0.1))
# model.add(LSTM(hidden_dim, return_sequences=True, dropout=0.1))
# model.add(LSTM(hidden_dim, dropout=0.1))

model.add(layers.Bidirectional(LSTM(hidden_dim, return_sequences=True, dropout=0.1)))
model.add(layers.Bidirectional(LSTM(hidden_dim, dropout=0.1)))

model.add(layers.Dropout(0.1))
model.add(layers.Dense(hidden_dim, activation='tanh'))
model.add(layers.Dropout(0.1))
model.add(layers.Dense(1, activation='sigmoid'))

#since it's a binary classification problem, we use a binary cross entropy loss here
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[keras.metrics.Recall()])
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.compile(loss='binary_crossentropy', optimizer='adam')

decay_steps = 3000
learning_rate = 1e-2
lr_schedule = tf.keras.optimizers.schedules.CosineDecay(
    learning_rate, decay_steps
)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
model.compile(loss='binary_crossentropy', optimizer=optimizer)
model.summary()

Model: "retrieval_cls_lstm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 320, 200)          1105800   
                                                                 
 dropout_6 (Dropout)         (None, 320, 200)          0         
                                                                 
 bidirectional_4 (Bidirectio  (None, 320, 800)         1923200   
 nal)                                                            
                                                                 
 bidirectional_5 (Bidirectio  (None, 800)              3843200   
 nal)                                                            
                                                                 
 dropout_7 (Dropout)         (None, 800)               0         
                                                                 
 dense_4 (Dense)             (None, 400)        

In [92]:
import numpy as np

train_retrieval_cls_label = np.array(train_retrieval_cls_label)
dev_retrieval_cls_label = np.array(dev_retrieval_cls_label)

In [93]:
# Train the model

model.fit(xseq_train,train_retrieval_cls_label,epochs=10,validation_data=(xseq_dev, dev_retrieval_cls_label))

Epoch 1/10
  4/172 [..............................] - ETA: 2:11:13 - loss: 0.1759

In [None]:
# Save the model
model.save('retrieval_cls_lstm')

# Load the model
# model = tf.keras.models.load_model('retrieval_cls_lstm')



INFO:tensorflow:Assets written to: retrieval_cls_lstm\assets


INFO:tensorflow:Assets written to: retrieval_cls_lstm\assets


# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [None]:
# %%cmd
# python eval.py --predictions dev-claims-baseline.json --groundtruth dev-claims.json
# python eval.py --predictions dev_predict.json --groundtruth dev-claims.json


import subprocess

# proc = subprocess.Popen(["python", "eval.py", "--predictions", "data\dev_predict.json", "--groundtruth", "data\dev-claims.json"
# ], stdout=subprocess.PIPE, shell=True)
# (out, err) = proc.communicate()
# print(str(out))

# 高自动化模型/预处理选择，可以自动读取准确度
output = subprocess.check_output("python eval.py --predictions data/dev_predict.json --groundtruth data/dev-claims.json", shell=True)
output_str = output.decode('utf-8')

# Split the output into lines
output_lines = output_str.strip().split('\n')

# Format the output
formatted_lines = []
for line in output_lines:
    metric, value = line.split('=')
    metric = metric.strip()
    value = value.strip()
    formatted_line = f"{metric}: {value}"
    formatted_lines.append(formatted_line)

# Join the formatted lines into a single string
formatted_output = '\n'.join(formatted_lines)
print(formatted_output)

Evidence Retrieval F-score (F): 0.04555246340960627
Claim Classification Accuracy (A): 0.38961038961038963
Harmonic Mean of F and A: 0.08156814348266166


## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*