## Tested Sustainable Finance on Testing Data

In [1]:
#import libraries
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification,TextClassificationPipeline
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
import numpy as np

In [2]:
# Load the training dataset into a pandas dataframe.
df = pd.read_csv("/content/test.csv")

# Report the number of sentences.
print('Number of validation sentences: {:,}\n'.format(df.shape[0]))

# Display 10 random rows from the data.
df.head()

Number of validation sentences: 11,857



Unnamed: 0,standard_type,document_title,document_text,label
0,un environment programme finance initiative,Practical Approaches to Applying the EU Taxono...,Practical Approaches to Applying the EU Taxono...,0
1,task force on climate-related financial disclo...,Climate-related Financial Disclosures,determine areas meriting further research and ...,0
2,task force on climate-related financial disclo...,Metric Climate-related Financial Disclosures,"Without the right information, investors and o...",0
3,Non-sustainable standards,ifrs-9-financial-instruments,active market for an identical asset or liabil...,1
4,un environment programme finance initiative,Common Framework OF Sustainable Finance Taxono...,Bank Group (2021). Energy Prices in Latin Amer...,0


In [3]:
# Get the lists of sentences and their standard type.
sentences = df.document_text.values
labels = df.label.values

In [4]:
#load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("Pelumioluwa/Sustainable-Finance-BERT")
model = AutoModelForSequenceClassification.from_pretrained("Pelumioluwa/Sustainable-Finance-BERT")
#pass both in a pipeline
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]



In [5]:
#make predictions on validation set
preds = []
max_score_item_list = []
for sentence in sentences:
    pred = pipe(sentence)
    # Flatten the list of lists.
    flat_data = [item for sublist in pred for item in sublist]

    # Find the dictionary with the highest score.
    max_score_item = max(flat_data, key=lambda x:x['score'])
    max_score_item_list.append(max_score_item)

    if max_score_item['label'] == 'LABEL_0':
        preds.append(0)
    else:
        preds.append(1)

In [6]:
len(preds)

11857

In [7]:
# Confusion Matrix
cm = confusion_matrix(labels, preds)
# Accuracy
accuracy = accuracy_score(labels, preds)
# Precision
precision = precision_score(labels, preds)
# Recall
recall = recall_score(labels, preds)
# F1-Score
f1 = f1_score(labels, preds)
# ROC Curve and AUC
fpr, tpr, thresholds = roc_curve(labels, preds)
roc_auc = auc(fpr, tpr)

print("Confusion Matrix:")
print(cm)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
print("ROC AUC:", roc_auc)

Confusion Matrix:
[[9985  117]
 [  11 1744]]
Accuracy: 0.9892046892131231
Precision: 0.9371305749596991
Recall: 0.9937321937321937
F1-Score: 0.9646017699115044
ROC AUC: 0.9910751643774807


In [8]:
# Count the number of 1s and 0s in labels
num_nonsustain = np.count_nonzero(labels == 1)
num_sustain = np.count_nonzero(labels == 0)

print('non_sustain is : ', num_nonsustain)
print('sustain is : ', num_sustain)

non_sustain is :  1755
sustain is :  10102


In [9]:
#count numbers of 1s and 0s in preds
num_nonsustain_pred = preds.count(1)
num_sustain_pred = preds.count(0)

print('non_sustain is : ', num_nonsustain_pred)
print('sustain is : ', num_sustain_pred)

non_sustain is :  1861
sustain is :  9996
