In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import classification_report, confusion_matrix
from sentence_transformers import SentenceTransformer


# Load the SDG keyword lists (a JSON object that later cells iterate with
# .items(), one entry per SDG). JSON text is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform default,
# which would mis-decode non-ASCII keywords on some systems.
with open('Data/sdg_keywords.json', 'r', encoding='utf-8') as file:
    sdg_keywords = json.load(file)

# Load the labelled test dataset: a 'Text' column plus the label columns.
df = pd.read_csv('Data/test_large.csv')
# Optional: uncomment to iterate on a small, reproducible subset.
# df = df.sample(n=25, random_state=42)
df

Unnamed: 0,Text,SDG 1,SDG 2,SDG 3,SDG 4,SDG 5,SDG 6,SDG 7,SDG 8,SDG 9,SDG 10,SDG 11,SDG 12,SDG 13,SDG 14,SDG 15,SDG 16,SDG 17
0,"Shape, Built Enviro Projects the built environ...",0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,PEP 5 this course is identified by rmit univer...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Comm Skills for Health Prof this course will e...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,SW Field Education A in this course you will u...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,LSC Work Integrated Learning 2 in this course ...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755,Policy analysis for growth prospects in region...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
756,Building an understanding of liveability acros...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
757,Stage 2 - Latrobe Valley Smart Specialisation ...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
758,Balance Victoria: Potential Impacts of a Form...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
# Sanity check: which Python types occur in the 'Text' column, and how many
# rows there are before cleaning.
unique_types = df['Text'].map(type).unique()
print(unique_types)
print(len(df))

# Keep only rows whose 'Text' is usable: discard float entries (presumably the
# NaN placeholders pandas uses for missing text), then any remaining missing
# values, and renumber the rows from zero.
is_text_row = df['Text'].map(lambda value: not isinstance(value, float))
df = (
    df.loc[is_text_row]
    .dropna(subset=['Text'])
    .reset_index(drop=True)
)

print(len(df))
texts = df['Text'].tolist()

[<class 'str'> <class 'float'>]
760
755


In [None]:
# Initialize SentenceTransformer model
model = SentenceTransformer('paraphrase-MiniLM-L3-v2')

# Create embeddings for text data (one embedding per entry of `texts`)
texts_embeddings = model.encode(texts, convert_to_numpy=True)


def _embed_keywords(words):
    """Return [(keyword, embedding), ...] for one SDG's keywords.

    All keywords of an SDG are encoded in a single batched ``model.encode``
    call instead of one call per keyword, which avoids a separate model
    forward pass for every keyword. An empty keyword collection yields an
    empty list without calling the model.
    """
    words = list(words)
    if not words:
        return []
    # Iterating the returned 2-D array yields one 1-D embedding per keyword,
    # in the same order as `words`.
    return list(zip(words, model.encode(words, convert_to_numpy=True)))


# Create individual embeddings for SDG keywords: {sdg: [(keyword, embedding), ...]}
sdg_keywords_embeddings = {sdg: _embed_keywords(words) for sdg, words in sdg_keywords.items()}

In [None]:
def assign_single_sdg(text_vector, current_sdg, threshold, top_n_keywords=5):
    """Decide whether one text is assigned to one SDG via keyword similarity.

    The text embedding is compared (cosine similarity) against every keyword
    embedding stored for ``current_sdg`` in the module-level
    ``sdg_keywords_embeddings``. The SDG is assigned when the best keyword
    similarity is strictly greater than ``threshold``.

    Args:
        text_vector: 1-D embedding of the text.
        current_sdg: Key into ``sdg_keywords_embeddings``.
        threshold: Similarity the best-matching keyword must exceed.
        top_n_keywords: How many of the most similar keywords to report.

    Returns:
        ``(sdg_assignment, top_keywords)`` where ``sdg_assignment`` is 1 or 0
        and ``top_keywords`` is ``{current_sdg: [keyword, ...]}`` (most similar
        first) when assigned, otherwise an empty dict.

    Raises:
        KeyError: if ``current_sdg`` has no entry in ``sdg_keywords_embeddings``.
    """
    keyword_embeddings = sdg_keywords_embeddings[current_sdg]
    if len(keyword_embeddings) == 0:
        # No keywords means nothing to match against; previously this case
        # crashed with IndexError when reading the best similarity.
        return 0, {}

    keywords = [keyword for keyword, _ in keyword_embeddings]
    keyword_matrix = np.asarray([embedding for _, embedding in keyword_embeddings])

    # One vectorised call for all keywords instead of one call per keyword;
    # the result has shape (n_keywords, 1), so take its single column.
    similarities = cosine_similarity(keyword_matrix, [text_vector])[:, 0]

    # sorted() is stable, so keywords with equal similarity keep their
    # original relative order.
    ranked = sorted(zip(keywords, similarities), key=lambda pair: pair[1], reverse=True)
    max_sim = ranked[0][1]
    if not max_sim > threshold:
        return 0, {}

    return 1, {current_sdg: [keyword for keyword, _ in ranked[:top_n_keywords]]}

In [None]:
# Process texts and predict SDGs
threshold = 0.60  # a keyword similarity must exceed this for an SDG to be assigned

# Every column except the raw text is treated as a ground-truth label column.
true_labels = df.drop('Text', axis=1)

# One record per text; 'Predicted_SDGs' is created now so it keeps its column
# position and is filled in once all SDGs have been processed.
results = [{'Text': text, 'Predicted_SDGs': ''} for text in texts]
predicted_sdgs = [[] for _ in texts]  # assigned SDG names per text, in processing order
predictions_df = pd.DataFrame({'Text': texts})

for sdg in true_labels.columns:
    if sdg not in sdg_keywords:
        print(f"Warning: {sdg} is not found in sdg_keywords. Skipping...")
        continue

    # (assignment, top_keywords) for every text, in the same order as `texts`.
    sdg_outcomes = [assign_single_sdg(text_vector, sdg, threshold) for text_vector in texts_embeddings]

    for idx, (assigned, top_keywords) in enumerate(sdg_outcomes):
        if assigned == 1:
            predicted_sdgs[idx].append(sdg)
            results[idx][f'Top_Keywords_for_{sdg}'] = ', '.join(top_keywords[sdg])

    # Built with a comprehension rather than zip(*...) unpacking so that an
    # empty `texts` list produces an empty column instead of a ValueError.
    predictions_df[sdg] = [assigned for assigned, _ in sdg_outcomes]
    print(f"Finished processing {sdg}.")

# Join the collected names once, so no trailing separator needs trimming.
for row, names in zip(results, predicted_sdgs):
    row['Predicted_SDGs'] = ', '.join(names)

results_df = pd.DataFrame(results)
# Texts without a given SDG have no keyword entry for it; show blanks instead of NaN.
results_df = results_df.fillna("")
results_df

In [None]:
# Print one classification report per SDG, comparing the ground-truth column
# with the predicted column of the same name.
# (classification_report is imported with the other dependencies in the first cell.)
for sdg in true_labels.columns:
    if sdg not in predictions_df.columns:
        # No predictions were produced for this column, so there is nothing to compare.
        continue
    print(f"Classification Report for {sdg}:")
    print(classification_report(true_labels[sdg], predictions_df[sdg]))
    print("------------------------------------------------------")

In [None]:
# Overall multi-label report across all SDGs.
# Only columns present on both sides are compared: if the prediction cell
# skipped a label column (no keywords for it), passing the full label frame
# would make the two inputs differ in shape and classification_report would raise.
shared_sdgs = [sdg for sdg in true_labels.columns if sdg in predictions_df.columns]
preds_df = predictions_df[shared_sdgs]
# target_names labels each row of the report with its SDG name instead of a bare index.
print(classification_report(true_labels[shared_sdgs], preds_df, target_names=shared_sdgs))

In [None]:
# Persist the per-text predictions and a plain-text evaluation report.
# (Path, classification_report and confusion_matrix are imported in the first cell.)
report_dir = Path("Reports/Version4")
# Create the output folder up front so the first write does not fail with
# FileNotFoundError when the folder does not exist yet.
report_dir.mkdir(parents=True, exist_ok=True)

results_df.to_csv(report_dir / "Results.csv")

# Evaluate only SDGs that have both ground truth and predictions; a label
# column skipped during prediction would otherwise make the overall report
# fail on mismatched shapes.
evaluated_sdgs = [sdg for sdg in true_labels.columns if sdg in predictions_df.columns]
eval_true = true_labels[evaluated_sdgs]
preds_df = predictions_df[evaluated_sdgs]

with open(report_dir / "evaluation_report.txt", "w", encoding="utf-8") as f:
    # Write overall classification report for all SDGs
    f.write("Overall Classification Report:\n")
    f.write(classification_report(eval_true, preds_df, target_names=evaluated_sdgs))
    f.write("------------------------------------------------------\n\n")

    # Write classification reports for each individual SDG
    for sdg in evaluated_sdgs:
        f.write(f"Classification Report for {sdg}:\n")
        f.write(classification_report(eval_true[sdg], preds_df[sdg]) + "\n")
        f.write("------------------------------------------------------\n")

    # Write confusion matrices for each SDG
    f.write("Confusion Matrices:\n")
    for sdg in evaluated_sdgs:
        # labels=[0, 1] pins a 2x2 layout (rows = true 0/1, columns = predicted
        # 0/1) even when only one class occurs for this SDG; without it the
        # matrix would shrink to 1x1 and be ambiguous to read. The truth
        # column is cast to int so its values match those labels.
        matrix = confusion_matrix(eval_true[sdg].astype(int), preds_df[sdg], labels=[0, 1])
        f.write(f"Confusion Matrix for {sdg}:\n")
        f.write(str(matrix) + "\n")
        f.write("------------------------------------------------------\n")
