In [9]:
!pip install gensim
!pip install -U spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m


Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [169]:
import json
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
import spacy
# Shared spaCy pipeline; extract_nouns() below reads token.pos_ from the documents it produces.
nlp = spacy.load("en_core_web_sm")


# Load SDG keywords (mapping: SDG label -> list of keyword phrases).
# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's default text encoding.
with open('Data/sdg_keywords.json', 'r', encoding='utf-8') as file:
    sdg_keywords = json.load(file)

# Load test dataset (a 'Text' column plus one 0/1 column per SDG label)
df = pd.read_csv('Data/bal_test.csv')
#df = df.sample(n=50, random_state=42)
df

Unnamed: 0,Text,SDG 1,SDG 2,SDG 3,SDG 4,SDG 5,SDG 6,SDG 7,SDG 8,SDG 9,SDG 10,SDG 11,SDG 12,SDG 13,SDG 14,SDG 15,SDG 16,SDG 17
0,ARC Centre of Excellence for Environmental Dec...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,Biofabricating complex synthetic skin structur...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,Eco-Bio sanitary pads to end period poverty Th...,1,0,1,0,1,0,0,0,0,1,0,1,1,0,0,0,0
3,Meat Industry Efficiency and Innovation Capaci...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,Corporate offshore strategy-led Korean communi...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,Working with Disability Suppor in this course ...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
496,Ecotoxicology this course introduces you to th...,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
497,Languages and Literacies in this course you wi...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
498,Luxury Fashion Business Mngt The luxury segmen...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [170]:
# Sanity check: which Python types occur in the 'Text' column, and how many rows do we start with?
unique_types = df['Text'].apply(type).unique()
print(unique_types)
print(len(df))

# Keep only rows whose 'Text' is not a float (NaN is a float), drop any
# remaining missing values, and renumber the index from zero.
is_usable_text = df['Text'].apply(lambda value: not isinstance(value, float))
df = (
    df[is_usable_text]
    .dropna(subset=['Text'])
    .reset_index(drop=True)
)

print(len(df))
texts = df['Text'].tolist()

[<class 'str'>]
500
500


In [171]:
def extract_nouns(text):
    """Return the text of every token tagged NOUN or PROPN, in document order.

    The text is run through the module-level spaCy pipeline ``nlp``; the
    result is a plain list of strings suitable as one Word2Vec "sentence".
    """
    wanted_tags = ("NOUN", "PROPN")
    selected = []
    for token in nlp(text):
        if token.pos_ in wanted_tags:
            selected.append(token.text)
    return selected

# One list of nouns / proper nouns per input text, in the same order as `texts`.
texts_with_nouns = list(map(extract_nouns, texts))

In [172]:
# Fit a TF-IDF vectorizer on the noun lists. The documents are already
# tokenized, so the analyzer hands each token list through unchanged.
vectorizer = TfidfVectorizer(analyzer=lambda tokens: tokens)
tfidf_matrix = vectorizer.fit_transform(texts_with_nouns)
# Despite its name, word2tfidf maps each vocabulary term to its IDF weight (vectorizer.idf_).
word2tfidf = {
    term: idf_weight
    for term, idf_weight in zip(vectorizer.get_feature_names_out(), vectorizer.idf_)
}

def get_top_n_words(text, n=5, scores=None):
    """Return up to ``n`` distinct words of ``text`` with the highest scores.

    Args:
        text: Iterable of word tokens (duplicates allowed).
        n: Maximum number of words to return.
        scores: Optional mapping ``word -> score``. When omitted, the
            module-level ``word2tfidf`` mapping is used. That mapping is built
            from ``vectorizer.idf_``, i.e. it holds inverse-document-frequency
            weights only; term frequency within ``text`` is not considered.

    Returns:
        A list of at most ``n`` words, highest score first. Words missing from
        the mapping score 0. Ties keep the order of first occurrence in
        ``text`` so the result is reproducible across runs (iterating a
        ``set`` of strings is not).
    """
    if scores is None:
        scores = word2tfidf
    # dict.fromkeys de-duplicates while preserving first-occurrence order.
    distinct_words = dict.fromkeys(text)
    # sorted() is stable (also with reverse=True), so equal scores keep text order.
    ranked = sorted(distinct_words, key=lambda word: scores.get(word, 0), reverse=True)
    return ranked[:n]

# Only pick top N words from each text for embedding comparison
N = 15
texts_with_top_n_words = [get_top_n_words(text, N) for text in texts_with_nouns]
# Preview a handful of documents instead of echoing every list into the cell output.
texts_with_top_n_words[:5]

  idf = np.log(n_samples / df) + 1


[['Queensland',
  'calibre',
  'Decisions',
  'track',
  'CEED',
  'fact',
  'journals',
  'ecology',
  'centre',
  'Excellence',
  'leaders',
  'goal',
  'leader',
  'vision',
  'actions'],
 ['Replace',
  '3R',
  'stimuli',
  'Emily',
  'corrosivity',
  'survival',
  's3862245',
  'vasculature',
  'Refine',
  'Reduce',
  'Liu',
  'layering',
  'Skin',
  'pharmaceutical',
  'breadth'],
 ['disposable',
  'absorption',
  'silicone',
  'petrochemical',
  'gels',
  'pad',
  'glues',
  'impermeability',
  'superabsorbent',
  'shock',
  'syndrome',
  'LMIC',
  'MHH',
  'pads',
  'eco'],
 ['bank',
  'Meat',
  'margin',
  'volume-',
  'Enhancement',
  'Efficiency',
  'pioneer',
  'Automotive',
  'Benchmarking',
  'potentials',
  'automation',
  'Capacity',
  'meat',
  'handling',
  'assurance'],
 ['Suzhou',
  'Hanoi',
  'Relocation',
  'establishment',
  'labor',
  'Foreign',
  'Korea',
  'Direct',
  'FDI',
  'Investment',
  'attention',
  'Case',
  'flows',
  'transformation',
  'Vietnam'],
 

In [173]:
# # Train Word2Vec model on the extracted nouns
# model = Word2Vec(sentences=texts_with_nouns, vector_size=100, window=5, min_count=1, workers=4)
# model.train(texts_with_nouns, total_examples=model.corpus_count, epochs=10)


# # Function to get the average word vector for a list of words
# def get_avg_word_vector(words_list, model, vector_size):
#     avg_vector = np.zeros((vector_size,), dtype="float32")
#     num_words = 0
#     for word in words_list:
#         if word in model.wv:
#             avg_vector = np.add(avg_vector, model.wv[word])
#             num_words += 1
#     if num_words:
#         avg_vector = np.divide(avg_vector, num_words)
#     return avg_vector

# # Create embeddings for the extracted nouns from text data
# texts_embeddings = np.array([get_avg_word_vector(words, model, 100) for words in texts_with_nouns])

# # Create individual embeddings for SDG keywords
# sdg_keywords_embeddings = {sdg: [(word, get_avg_word_vector([word], model, 100)) for word in words] for sdg, words in sdg_keywords.items()}


In [174]:
def phrase_embedding(phrase, model):
    """Embed an underscore-joined phrase as the mean of its known word vectors.

    The phrase is split on "_" and every part found in ``model.wv`` contributes
    its vector to an element-wise average. If none of the parts is in the
    vocabulary, a zero vector of length ``model.vector_size`` is returned.
    """
    keyed_vectors = model.wv

    # Collect the vectors of the in-vocabulary parts only.
    known_vectors = []
    for part in phrase.split("_"):
        if part in keyed_vectors:
            known_vectors.append(keyed_vectors[part])

    # Nothing recognised: fall back to an all-zero embedding.
    if len(known_vectors) == 0:
        return np.zeros(model.vector_size)

    return np.mean(known_vectors, axis=0)

In [175]:


# Train Word2Vec model on the extracted nouns.
# NOTE: passing `sentences` to the constructor already trains the model; the
# explicit train() call continues training for 10 further epochs.
model = Word2Vec(sentences=texts_with_nouns, vector_size=300, window=4, min_count=1, workers=4)
model.train(texts_with_nouns, total_examples=model.corpus_count, epochs=10)

# Flatten every SDG's keyword list and tokenize each keyword on whitespace.
all_sdg_keywords = [item for sublist in sdg_keywords.values() for item in sublist]

tokenized_sdg_keywords = [keyword.split() for keyword in all_sdg_keywords]

# Phrase models fitted on the keywords themselves (bigrams, then trigrams).
bigram_model = Phrases(tokenized_sdg_keywords, min_count=1, threshold=10)
bigram_phraser = Phraser(bigram_model)
bigram_keywords = [bigram_phraser[keyword] for keyword in tokenized_sdg_keywords]

trigram_model = Phrases(bigram_keywords, min_count=1, threshold=10)
trigram_phraser = Phraser(trigram_model)
trigram_keywords = ['_'.join(trigram_phraser[keyword]) for keyword in bigram_keywords]

# Updated embeddings computation.
# Each SDG maps to (phrase, embedding) pairs for ITS OWN keywords only. The
# previous comprehension ignored `words` and stored the embeddings of every
# keyword of every SDG under each SDG key.
sdg_keywords_embeddings = {
    sdg: [
        (phrase, phrase_embedding(phrase, model))
        for phrase in (
            '_'.join(trigram_phraser[bigram_phraser[word.split()]]) for word in words
        )
    ]
    for sdg, words in sdg_keywords.items()
}
#sdg_keywords_embeddings
#sdg_keywords_embeddings



In [176]:
# def assign_single_sdg(text_vector, current_sdg, threshold, top_n_keywords=5):
#     # Retrieve the embeddings for the current SDG's keywords
#     keyword_embeddings = sdg_keywords_embeddings[current_sdg]
    
#     # Calculate cosine similarities between the text_vector and each keyword embedding
#     individual_word_similarities = [(keyword, cosine_similarity([embedding], [text_vector])[0][0]) for keyword, embedding in keyword_embeddings]
    
#     # Sort the keywords based on their similarity to the text_vector
#     individual_word_similarities.sort(key=lambda x: x[1], reverse=True)
    
#     # Get the highest similarity score
#     max_sim = individual_word_similarities[0][1]
#     #print(max_sim)
    
#     # Assign the SDG if the highest similarity score is greater than the threshold
#     sdg_assignment = 1 if max_sim > threshold else 0
    
#     top_keywords = {}
    
#     # If the SDG was assigned, get the top_n_keywords most similar to the text_vector
#     if sdg_assignment == 1:
#         top_keywords_for_sdg = [word[0] for word in individual_word_similarities[:top_n_keywords]]
#         top_keywords[current_sdg] = top_keywords_for_sdg
        
#     return sdg_assignment, top_keywords

In [179]:
def assign_single_sdg(text_words, current_sdg, threshold, top_n_keywords=5, verbose=True):
    """Decide whether ``current_sdg`` applies to a text and list its closest keywords.

    Every in-vocabulary word of the text is compared (cosine similarity) with
    every keyword embedding of ``current_sdg``. The SDG is assigned when the
    single best word/keyword similarity is strictly greater than ``threshold``.

    Reads the module-level ``sdg_keywords``, ``sdg_keywords_embeddings`` and
    ``model``.

    Args:
        text_words: Iterable of word tokens representing the text.
        current_sdg: Key into ``sdg_keywords`` / ``sdg_keywords_embeddings``.
        threshold: Similarity that the best match must exceed.
        top_n_keywords: Number of best-matching keywords to report.
        verbose: When True (default), print the best-matching text word,
            keyword and score for every assignment.

    Returns:
        ``(assignment, top_keywords)`` where ``assignment`` is 1 or 0 and
        ``top_keywords`` is ``{current_sdg: [keyword, ...]}`` when assigned,
        otherwise an empty dict. Texts without any in-vocabulary word, or SDGs
        without usable keyword embeddings, yield ``(0, {})``.
    """
    # Keywords of this SDG in the underscore-joined form used by the embeddings.
    # A set makes the membership filter below linear instead of quadratic.
    sdg_phrases = {kw.replace(" ", "_") for kw in sdg_keywords[current_sdg]}

    # Keep only embeddings whose phrase belongs to the current SDG.
    valid_keyword_embeddings = [
        (kw, emb) for kw, emb in sdg_keywords_embeddings[current_sdg] if kw in sdg_phrases
    ]
    adjusted_keywords = [kw for kw, _ in valid_keyword_embeddings]
    keyword_embeddings = [emb for _, emb in valid_keyword_embeddings]

    # Extract valid word vectors from the text
    valid_words = [word for word in text_words if word in model.wv]

    # Nothing to compare: cosine_similarity raises on empty input, so bail out.
    if not valid_words or not keyword_embeddings:
        return 0, {}

    valid_word_vectors = [model.wv[word] for word in valid_words]

    # Rows are text words, columns are keywords.
    similarities = cosine_similarity(valid_word_vectors, keyword_embeddings)

    # Location and value of the single best word/keyword match.
    max_sim_row_index, max_sim_col_index = np.unravel_index(
        np.argmax(similarities, axis=None), similarities.shape
    )
    max_sim = similarities[max_sim_row_index, max_sim_col_index]

    if not (max_sim > threshold):
        return 0, {}

    if verbose:
        print(f"Text Word: {valid_words[max_sim_row_index]}")
        print(f"Keyword: {adjusted_keywords[max_sim_col_index]}")
        print(f"Similarity Score: {max_sim}")
        print("----------")

    # Best similarity per keyword over all text words (column-wise maximum).
    # Duplicate keyword entries are merged, keeping their highest score.
    best_per_keyword = {}
    for keyword, similarity in zip(adjusted_keywords, similarities.max(axis=0)):
        if keyword not in best_per_keyword or best_per_keyword[keyword] < similarity:
            best_per_keyword[keyword] = similarity

    sorted_keywords = sorted(best_per_keyword.items(), key=lambda x: x[1], reverse=True)
    top_keywords = {current_sdg: [keyword for keyword, _ in sorted_keywords[:top_n_keywords]]}

    return 1, top_keywords



In [180]:
# Predict SDGs for every text, one SDG label at a time.
threshold = 0.9999999999999
true_labels = df.drop(columns=['Text'])
predictions_df = pd.DataFrame({'Text': texts})
results = [dict(Text=text, Predicted_SDGs='') for text in texts]

for sdg in true_labels.columns:
    if sdg in sdg_keywords:
        # The inputs are the top-N word lists (texts_with_top_n_words), not the full noun lists.
        assigned_sdgs_keywords = [
            assign_single_sdg(top_words, sdg, threshold) for top_words in texts_with_top_n_words
        ]
        assigned_sdgs, top_keywords_list = zip(*assigned_sdgs_keywords)

        # Record the label and its best-matching keywords on every text it was assigned to.
        for row, assigned, keywords in zip(results, assigned_sdgs, top_keywords_list):
            if assigned != 1:
                continue
            row['Predicted_SDGs'] = row['Predicted_SDGs'] + sdg + ', '
            row[f'Top_Keywords_for_{sdg}'] = ', '.join(keywords[sdg])

        predictions_df[sdg] = assigned_sdgs
        print(f"Finished processing {sdg}.")
    else:
        print(f"Warning: {sdg} is not found in sdg_keywords. Skipping...")

# Remove the trailing ", " separator left behind by the accumulation above
# (a no-op for rows without any predicted SDG).
for row in results:
    row['Predicted_SDGs'] = row['Predicted_SDGs'].rstrip(', ')

results_df = pd.DataFrame(results).fillna("")
results_df

Text Word: resilience
Keyword: poor_resilience
Similarity Score: 1.0000000000000004
----------
Text Word: poverty
Keyword: absolute_poverty
Similarity Score: 1.0
----------
Text Word: resilience
Keyword: poor_resilience
Similarity Score: 1.0000000000000004
----------
Text Word: poverty
Keyword: absolute_poverty
Similarity Score: 1.0
----------
Text Word: poverty
Keyword: absolute_poverty
Similarity Score: 1.0
----------
Text Word: children
Keyword: counting_marginalised_children
Similarity Score: 1.0000000000000004
----------
Text Word: economic
Keyword: economic_marginalisation
Similarity Score: 1.0
----------
Text Word: children
Keyword: counting_marginalised_children
Similarity Score: 1.0000000000000004
----------
Finished processing SDG 1.
Text Word: targets
Keyword: hunger_targets
Similarity Score: 1.0000000000000002
----------
Text Word: resource
Keyword: resource_poor
Similarity Score: 0.9999999999999998
----------
Text Word: energy
Keyword: energy_poor
Similarity Score: 1.00000

Text Word: resistance
Keyword: antibiotic_resistance
Similarity Score: 1.0000000000000009
----------
Text Word: distress
Keyword: debt_distress
Similarity Score: 1.0000000000000009
----------
Text Word: vaccines
Keyword: antipoverty_vaccines
Similarity Score: 1.0000000000000009
----------
Text Word: ethics
Keyword: medical_ethics
Similarity Score: 1.0
----------
Text Word: wealth
Keyword: wealth
Similarity Score: 1.0000000000000002
----------
Text Word: practices
Keyword: newborn_practices
Similarity Score: 1.000000000000001
----------
Text Word: ethics
Keyword: medical_ethics
Similarity Score: 1.0
----------
Text Word: model
Keyword: meikirch_model
Similarity Score: 1.0000000000000009
----------
Text Word: model
Keyword: meikirch_model
Similarity Score: 1.0000000000000009
----------
Text Word: cigarette
Keyword: cigarette_smoke
Similarity Score: 1.0000000000000007
----------
Text Word: prevention
Keyword: community-based_prevention
Similarity Score: 1.0000000000000009
----------
Text 

Text Word: facility
Keyword: facility_birth
Similarity Score: 1.0000000000000009
----------
Text Word: weight
Keyword: low_birth_weight
Similarity Score: 1.0000000000000009
----------
Text Word: distribution
Keyword: contraceptive_distribution
Similarity Score: 1.0000000000000004
----------
Text Word: system
Keyword: caste_system
Similarity Score: 1.0000000000000004
----------
Text Word: tumors
Keyword: malignant_tumors
Similarity Score: 1.0000000000000004
----------
Text Word: delivery
Keyword: delivery
Similarity Score: 1.0000000000000009
----------
Text Word: women
Keyword: pregnant_women
Similarity Score: 1.0000000000000009
----------
Text Word: initiation
Keyword: early_initiation
Similarity Score: 1.0000000000000004
----------
Text Word: quality
Keyword: quality_of_health-care
Similarity Score: 1.0000000000000002
----------
Text Word: emergency
Keyword: obstetric_emergency
Similarity Score: 1.0000000000000004
----------
Text Word: development
Keyword: african_development
Similari

Text Word: asia
Keyword: central_asia
Similarity Score: 1.0000000000000004
----------
Text Word: rehabilitation
Keyword: community-based_rehabilitation
Similarity Score: 1.0000000000000004
----------
Text Word: children
Keyword: hiv-exposed_uninfected_children
Similarity Score: 1.0000000000000004
----------
Text Word: ehealth
Keyword: ehealth
Similarity Score: 1.0000000000000007
----------
Text Word: professional
Keyword: professional_birth_attendants
Similarity Score: 1.0000000000000002
----------
Text Word: campaigns
Keyword: immunization_campaigns
Similarity Score: 1.0000000000000002
----------
Text Word: relations
Keyword: church-state_relations_in_melanesia
Similarity Score: 1.0000000000000004
----------
Text Word: practices
Keyword: newborn_practices
Similarity Score: 1.000000000000001
----------
Text Word: adults
Keyword: adults
Similarity Score: 1.0000000000000009
----------
Text Word: medicines
Keyword: essential_medicines
Similarity Score: 1.0000000000000002
----------
Text W

Text Word: literacy
Keyword: basic_literacy
Similarity Score: 1.0000000000000009
----------
Finished processing SDG 4.
Text Word: experience
Keyword: unwanted_sexual_experience
Similarity Score: 1.0000000000000009
----------
Text Word: households
Keyword: female-headed_households
Similarity Score: 1.0000000000000007
----------
Text Word: communities
Keyword: andean_communities
Similarity Score: 1.0000000000000004
----------
Text Word: groups
Keyword: womens_groups
Similarity Score: 1.0000000000000004
----------
Text Word: exploitation
Keyword: sexual_exploitation
Similarity Score: 1.0000000000000002
----------
Text Word: retention
Keyword: girls_retention
Similarity Score: 1.0000000000000004
----------
Text Word: groups
Keyword: womens_groups
Similarity Score: 1.0000000000000004
----------
Text Word: factors
Keyword: maternal_factors
Similarity Score: 1.0000000000000009
----------
Text Word: violence
Keyword: sexual_violence
Similarity Score: 1.0000000000000004
----------
Text Word: ri

Text Word: project
Keyword: aquapot_project
Similarity Score: 1.0000000000000002
----------
Text Word: water
Keyword: accessible_water
Similarity Score: 1.0000000000000004
----------
Text Word: contamination
Keyword: fecal_contamination
Similarity Score: 1.000000000000001
----------
Text Word: flood
Keyword: flood
Similarity Score: 1.0000000000000004
----------
Text Word: project
Keyword: aquapot_project
Similarity Score: 1.0000000000000002
----------
Text Word: floods
Keyword: floods
Similarity Score: 1.0000000000000004
----------
Text Word: adoption
Keyword: sanitation_adoption
Similarity Score: 1.0000000000000004
----------
Text Word: supply
Keyword: drinking-water_supply
Similarity Score: 1.0000000000000004
----------
Text Word: system
Keyword: sanitation_system
Similarity Score: 1.0000000000000004
----------
Text Word: adoption
Keyword: sanitation_adoption
Similarity Score: 1.0000000000000004
----------
Text Word: schools
Keyword: wash_in_schools
Similarity Score: 1.00000000000000

Text Word: incentives
Keyword: financial_incentives
Similarity Score: 1.0000000000000007
----------
Text Word: growth
Keyword: gdp_growth
Similarity Score: 1.000000000000001
----------
Text Word: change
Keyword: occupational_change
Similarity Score: 1.0000000000000009
----------
Text Word: economy
Keyword: economy
Similarity Score: 1.0000000000000004
----------
Text Word: enterprise
Keyword: enterprise
Similarity Score: 1.0000000000000004
----------
Text Word: investment
Keyword: sustainable_return_on_investment
Similarity Score: 1.0
----------
Text Word: evaluation
Keyword: evaluation_economique
Similarity Score: 1.0000000000000009
----------
Text Word: patterns
Keyword: expenditure_patterns
Similarity Score: 1.0000000000000007
----------
Text Word: workforce
Keyword: midwifery_workforce
Similarity Score: 1.0000000000000004
----------
Text Word: security
Keyword: financial_security
Similarity Score: 1.0000000000000002
----------
Text Word: change
Keyword: occupational_change
Similarit

Text Word: infrastructure
Keyword: ecological_infrastructure
Similarity Score: 1.0000000000000009
----------
Text Word: buildings
Keyword: industrial_buildings
Similarity Score: 1.0000000000000004
----------
Text Word: innovation
Keyword: inclusive_innovation
Similarity Score: 1.0000000000000004
----------
Text Word: markets
Keyword: markets
Similarity Score: 1.0000000000000004
----------
Text Word: buildings
Keyword: industrial_buildings
Similarity Score: 1.0000000000000004
----------
Text Word: markets
Keyword: markets
Similarity Score: 1.0000000000000004
----------
Text Word: peacebuilding
Keyword: peacebuilding
Similarity Score: 1.0000000000000007
----------
Text Word: innovations
Keyword: technological_innovations
Similarity Score: 1.0000000000000004
----------
Text Word: systems
Keyword: traditional_agroforestry_systems
Similarity Score: 1.0000000000000004
----------
Text Word: laboratory
Keyword: laboratory
Similarity Score: 1.0
----------
Text Word: internet
Keyword: internet
S

Text Word: nations
Keyword: least_developed_nations
Similarity Score: 1.0000000000000004
----------
Text Word: trade
Keyword: free_trade
Similarity Score: 1.0000000000000002
----------
Text Word: cycle
Keyword: adaptive_cycle
Similarity Score: 1.0000000000000009
----------
Text Word: assistance
Keyword: foreign_assistance
Similarity Score: 1.0000000000000009
----------
Text Word: development
Keyword: development
Similarity Score: 1.0000000000000002
----------
Text Word: divisions
Keyword: ethnic_divisions
Similarity Score: 1.0000000000000004
----------
Text Word: economic
Keyword: economic_disparity
Similarity Score: 1.0
----------
Text Word: cycle
Keyword: adaptive_cycle
Similarity Score: 1.0000000000000009
----------
Text Word: states
Keyword: fragile_states
Similarity Score: 1.0000000000000007
----------
Text Word: discrimination
Keyword: discrimination
Similarity Score: 1.0000000000000009
----------
Text Word: cooperation
Keyword: eu-gcc_cooperation
Similarity Score: 1.000000000000

Text Word: travel
Keyword: travel
Similarity Score: 1.000000000000001
----------
Text Word: migration
Keyword: migration
Similarity Score: 1.0000000000000007
----------
Text Word: racism
Keyword: racism
Similarity Score: 1.0000000000000002
----------
Text Word: systems
Keyword: small_systems
Similarity Score: 1.0000000000000004
----------
Text Word: contamination
Keyword: faecal_contamination
Similarity Score: 1.000000000000001
----------
Text Word: settlements
Keyword: urban_settlements
Similarity Score: 1.0000000000000004
----------
Text Word: human
Keyword: human_dignity
Similarity Score: 1.0000000000000009
----------
Text Word: urbanization
Keyword: sustainable_urbanization
Similarity Score: 1.0000000000000002
----------
Text Word: humanities
Keyword: humanities
Similarity Score: 1.0000000000000002
----------
Text Word: biomass
Keyword: faecal_biomass
Similarity Score: 1.0000000000000009
----------
Text Word: society
Keyword: civil_society
Similarity Score: 1.0000000000000009
-----

Text Word: pollutants
Keyword: emerging_pollutants
Similarity Score: 1.0000000000000004
----------
Text Word: change
Keyword: environmental_change
Similarity Score: 1.0000000000000009
----------
Text Word: pollution
Keyword: diffuse_pollution
Similarity Score: 1.0000000000000004
----------
Text Word: change
Keyword: environmental_change
Similarity Score: 1.0000000000000009
----------
Text Word: emissions
Keyword: cumulative_emissions
Similarity Score: 1.0000000000000007
----------
Text Word: changes
Keyword: environmental_changes
Similarity Score: 1.0000000000000004
----------
Text Word: emission
Keyword: emission
Similarity Score: 1.0000000000000009
----------
Text Word: climate
Keyword: climate_forestatiion
Similarity Score: 1.0000000000000002
----------
Text Word: agreement
Keyword: paris_agreement
Similarity Score: 1.0000000000000004
----------
Text Word: declaration
Keyword: paris_declaration
Similarity Score: 1.0000000000000009
----------
Text Word: principles
Keyword: paris_prin

Text Word: asthma
Keyword: asthma
Similarity Score: 1.0000000000000007
----------
Text Word: inventory
Keyword: tree_inventory
Similarity Score: 1.0000000000000002
----------
Text Word: agriculture
Keyword: irrigated_agriculture
Similarity Score: 1.0000000000000002
----------
Text Word: production
Keyword: net_cereal_production
Similarity Score: 1.0000000000000002
----------
Text Word: diversity
Keyword: convention_on_biological_diversity
Similarity Score: 1.0000000000000009
----------
Text Word: minerals
Keyword: minerals
Similarity Score: 1.0000000000000004
----------
Text Word: inventory
Keyword: tree_inventory
Similarity Score: 1.0000000000000002
----------
Text Word: soil
Keyword: soil
Similarity Score: 1.0000000000000007
----------
Text Word: products
Keyword: tree_products
Similarity Score: 1.0000000000000009
----------
Text Word: asthma
Keyword: asthma
Similarity Score: 1.0000000000000007
----------
Text Word: production
Keyword: net_cereal_production
Similarity Score: 1.000000

Text Word: human
Keyword: human_trafficking
Similarity Score: 1.0000000000000009
----------
Text Word: nations
Keyword: the_united_nations_(un)
Similarity Score: 1.0000000000000004
----------
Text Word: morbidity
Keyword: maternal_morbidity
Similarity Score: 1.0000000000000009
----------
Text Word: child
Keyword: child
Similarity Score: 1.0000000000000009
----------
Text Word: administration
Keyword: international_administration
Similarity Score: 1.0000000000000009
----------
Text Word: ethics
Keyword: ethics
Similarity Score: 1.0
----------
Text Word: accountability
Keyword: accountability
Similarity Score: 1.0000000000000002
----------
Text Word: death
Keyword: death_clustering
Similarity Score: 1.0000000000000002
----------
Text Word: order
Keyword: birth_order
Similarity Score: 1.0000000000000004
----------
Text Word: abuse
Keyword: disrespect_and_abuse
Similarity Score: 1.0000000000000004
----------
Text Word: preparedness
Keyword: birth_preparedness
Similarity Score: 1.0000000000

Text Word: policy
Keyword: coordinated_policy
Similarity Score: 1.0000000000000002
----------
Text Word: framework
Keyword: the_integrative_worldview_framework_(iwf)
Similarity Score: 1.0000000000000009
----------
Text Word: partner
Keyword: sustainable_partner
Similarity Score: 1.0000000000000004
----------
Text Word: change
Keyword: global_change
Similarity Score: 1.0000000000000009
----------
Text Word: strategy
Keyword: global_strategy
Similarity Score: 1.0000000000000004
----------
Text Word: support
Keyword: international_support
Similarity Score: 1.0000000000000009
----------
Text Word: progress
Keyword: progress_of_mdg4
Similarity Score: 1.0000000000000009
----------
Text Word: relevance
Keyword: indicatorsi_relevance
Similarity Score: 1.0000000000000007
----------
Text Word: policies
Keyword: coordinated_policies
Similarity Score: 1.0000000000000004
----------
Text Word: aids
Keyword: aids
Similarity Score: 1.0000000000000007
----------
Text Word: system
Keyword: trading_syste

Unnamed: 0,Text,Predicted_SDGs,Top_Keywords_for_SDG 16,Top_Keywords_for_SDG 17,Top_Keywords_for_SDG 3,Top_Keywords_for_SDG 15,Top_Keywords_for_SDG 8,Top_Keywords_for_SDG 11,Top_Keywords_for_SDG 2,Top_Keywords_for_SDG 4,Top_Keywords_for_SDG 1,Top_Keywords_for_SDG 6,Top_Keywords_for_SDG 5,Top_Keywords_for_SDG 12,Top_Keywords_for_SDG 14,Top_Keywords_for_SDG 10,Top_Keywords_for_SDG 9,Top_Keywords_for_SDG 7,Top_Keywords_for_SDG 13
0,ARC Centre of Excellence for Environmental Dec...,"SDG 16, SDG 17","who_kobe_centre, political_ecology, world_wate...","goal_#5, millennium_goal, sustainable_developm...",,,,,,,,,,,,,,,
1,Biofabricating complex synthetic skin structur...,"SDG 3, SDG 16","neonatal_survival, newborn_survival, hyogo_fra...",,"maternal_survival, survival_of_recent_births, ...",,,,,,,,,,,,,,
2,Eco-Bio sanitary pads to end period poverty Th...,"SDG 3, SDG 15",,,"copenhagen_syndrome, guillian-barre_syndrome, ...","eco_restoration, land_degradation, land_degrad...",,,,,,,,,,,,,
3,Meat Industry Efficiency and Innovation Capaci...,,,,,,,,,,,,,,,,,,
4,Corporate offshore strategy-led Korean communi...,"SDG 3, SDG 8, SDG 11",,,"genetic_transformation, disability-adjusted_li...",,"labor, obstetric_labor, international_financia...","urban_transformation, model_villages, urban_de...",,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,Working with Disability Suppor in this course ...,"SDG 2, SDG 3, SDG 4, SDG 5, SDG 11, SDG 16","child, united_nations_disability_convention, m...",,"child_feeding, child_malnutrition, child_stunt...",,,"family, family_farming, family_poultry, small_...","child_hunger, child_survival, child_nutrition,...","child_diarrhea, family_literacy, education_and...",,,"child_maltreatment, girl_child, child_marriage...",,,,,,
496,Ecotoxicology this course introduces you to th...,"SDG 6, SDG 9, SDG 13",,,,,,,,,,"pollution, water_footprint_assessment, wastewa...",,,,,"laboratory, technology_assessment, market_perf...",,"diffuse_pollution, pollution, indoor_air_pollu..."
497,Languages and Literacies in this course you wi...,SDG 11,,,,,,"language, language_and_culture, language_of_in...",,,,,,,,,,,
498,Luxury Fashion Business Mngt The luxury segmen...,,,,,,,,,,,,,,,,,,


In [95]:
#results_df.iloc[0]

In [181]:
from sklearn.metrics import classification_report

# Print one binary classification report per SDG label; labels that have no
# prediction column are skipped.
for sdg in true_labels.columns:
    if sdg not in predictions_df.columns:
        continue
    print(f"Classification Report for {sdg}:")
    print(classification_report(true_labels[sdg], predictions_df[sdg]))
    print("------------------------------------------------------")

Classification Report for SDG 1:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       475
           1       0.50      0.16      0.24        25

    accuracy                           0.95       500
   macro avg       0.73      0.58      0.61       500
weighted avg       0.93      0.95      0.94       500

------------------------------------------------------
Classification Report for SDG 2:
              precision    recall  f1-score   support

           0       0.96      0.86      0.91       474
           1       0.12      0.35      0.17        26

    accuracy                           0.83       500
   macro avg       0.54      0.60      0.54       500
weighted avg       0.92      0.83      0.87       500

------------------------------------------------------
Classification Report for SDG 3:
              precision    recall  f1-score   support

           0       0.84      0.65      0.73       374
           1       0.38     

In [182]:
# Overall multilabel report across all predicted SDGs.
preds_df = predictions_df.drop('Text', axis=1)
# Align the label columns with the predicted columns (any label skipped during
# prediction has no prediction column) and name the report rows after the SDGs
# instead of 0..16.
print(classification_report(true_labels[preds_df.columns], preds_df, target_names=list(preds_df.columns)))

              precision    recall  f1-score   support

           0       0.50      0.16      0.24        25
           1       0.12      0.35      0.17        26
           2       0.38      0.63      0.47       126
           3       0.48      0.26      0.34       125
           4       0.09      0.27      0.13        22
           5       0.16      0.42      0.23        24
           6       0.52      0.36      0.42        39
           7       0.30      0.39      0.34        66
           8       0.21      0.16      0.18        74
           9       0.19      0.18      0.19        66
          10       0.27      0.47      0.34        73
          11       0.27      0.12      0.17        75
          12       0.27      0.17      0.21        48
          13       0.14      0.24      0.18        21
          14       0.14      0.28      0.18        25
          15       0.24      0.43      0.31        77
          16       0.16      0.17      0.17        53

   micro avg       0.26   

  _warn_prf(average, modifier, msg_start, len(result))


In [183]:
import os

from sklearn.metrics import classification_report, confusion_matrix

# Make sure the output directory exists before anything is written to it.
# NOTE(review): the file names say "Thresh099" while the prediction cell sets
# threshold = 0.9999999999999 -- confirm the intended naming.
os.makedirs("Reports/WordEmbedding", exist_ok=True)

results_df.to_csv("Reports/WordEmbedding/Thresh099_Results.csv")

preds_df = predictions_df.drop(columns=['Text'])
# Only labels that actually received predictions can be evaluated; computing
# the list once keeps all three report sections consistent.
evaluated_sdgs = [sdg for sdg in true_labels.columns if sdg in preds_df.columns]

with open("Reports/WordEmbedding/Thresh099_eval_report.txt", "w") as f:
    # Write overall classification report for all SDGs
    f.write("Overall Classification Report:\n")
    f.write(classification_report(true_labels[evaluated_sdgs], preds_df[evaluated_sdgs], target_names=evaluated_sdgs))
    f.write("------------------------------------------------------\n\n")

    # Write classification reports for each individual SDG
    for sdg in evaluated_sdgs:
        y_true = true_labels[sdg]
        y_pred = preds_df[sdg]
        f.write(f"Classification Report for {sdg}:\n")
        f.write(classification_report(y_true, y_pred) + "\n")
        f.write("------------------------------------------------------\n")

    # Write confusion matrices for each SDG
    f.write("Confusion Matrices:\n")
    for sdg in evaluated_sdgs:
        y_true = true_labels[sdg]
        y_pred = preds_df[sdg]
        mcm = confusion_matrix(y_true, y_pred)
        f.write(f"Confusion Matrix for {sdg}:\n")
        f.write(str(mcm) + "\n")
        f.write("------------------------------------------------------\n")


  _warn_prf(average, modifier, msg_start, len(result))
