# **Dictionary Generation**

## Homework 2: Mini Research Project

### *Introduction to Text Mining and Natural Language Processing*

#### Authors:
- Anastasiia Chernavskaia
- Marvin Ernst
- Viktoria Gagua  

**Importing relevant libraries and packages:**

In [1]:
# Standard Library Imports
import os
import sys
import re
import subprocess
import json
import warnings
from collections import Counter, defaultdict

# Data Handling
import pandas as pd
import numpy as np

# Text Processing (NLTK)
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer, PorterStemmer
from nltk.corpus import stopwords
from nltk.util import ngrams

# Text Processing (spaCy)
import spacy

# Machine Learning & NLP
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Visualization
import matplotlib.pyplot as plt
from matplotlib import pyplot as plt

# Progress Bar
from tqdm import tqdm

# Hugging Face Datasets
from datasets import load_dataset, get_dataset_infos

# Suppress Warnings
# NOTE(review): with no category/module argument this silences every warning for
# the rest of the notebook, including pandas deprecation and copy warnings.
warnings.filterwarnings('ignore')

Download Resources:

In [2]:
# Function to download NLTK resources
def download_nltk_resources():
    """Download the NLTK data packages this notebook needs, skipping those already present.

    Each package is looked up in the local NLTK data path first; only a
    failed lookup (LookupError) triggers a download.
    """
    # Package name -> location inside the NLTK data directory.
    required_resources = {
        'wordnet': 'corpora/wordnet',
        'stopwords': 'corpora/stopwords',
        'punkt': 'tokenizers/punkt',
        # Recent NLTK releases load the tokenizer models from 'punkt_tab'
        # instead of 'punkt'; without it word_tokenize raises LookupError.
        'punkt_tab': 'tokenizers/punkt_tab',
    }
    for resource, data_path in required_resources.items():
        try:
            nltk.data.find(data_path)
        except LookupError:
            nltk.download(resource)

download_nltk_resources()

# Function to install and load spaCy model
def install_spacy_model(model_name):
    """Return the loaded spaCy pipeline `model_name`, installing it on first use.

    If loading fails with OSError, the model is downloaded with
    `python -m spacy download <model_name>` (using the current interpreter)
    and loaded again. A failing download raises
    subprocess.CalledProcessError.
    """
    try:
        model = spacy.load(model_name)
    except OSError:
        print(f"Downloading spaCy model: {model_name}")
        download_cmd = [sys.executable, "-m", "spacy", "download", model_name]
        subprocess.check_call(download_cmd)
        model = spacy.load(model_name)
    return model

sp = install_spacy_model('en_core_web_sm')

# Enable tqdm for pandas
# (registers `progress_apply`, which the preprocessing loop further down relies on)
tqdm.pandas()

# Initialize stemmers and lemmatizer
# NOTE: despite its name, `porter` holds a SnowballStemmer, not a PorterStemmer;
# it is not referenced in the cells visible in this notebook.
porter = SnowballStemmer("english")
lmtzr = WordNetLemmatizer()
# NLTK's English stop-word list as a set for fast membership tests
STOP_WORDS = set(stopwords.words('english'))

[nltk_data] Downloading package wordnet to /Users/newmac/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Get the corpus for creating the dictionaries: it's a dataset of UNDP reports on SDG progress, available through the `datasets` library.

In [3]:
# Load the UNDP SDGi corpus through the Hugging Face `datasets` library
ds = load_dataset("UNDP/sdgi-corpus")

# Step 1: Exploring the data

Fetch dataset metadata:

In [4]:
# Dataset-level metadata (features, splits, sizes), keyed by configuration name
info = get_dataset_infos("UNDP/sdgi-corpus")
print(info)

{'default': DatasetInfo(description='', citation='', homepage='', license='', features={'text': Value(dtype='string', id=None), 'embedding': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'metadata': {'country': Value(dtype='string', id=None), 'file_id': Value(dtype='string', id=None), 'language': Value(dtype='string', id=None), 'locality': Value(dtype='string', id=None), 'size': Value(dtype='string', id=None), 'type': Value(dtype='string', id=None), 'year': Value(dtype='int64', id=None)}}, post_processed=None, supervised_keys=None, builder_name='parquet', dataset_name='sdgi-corpus', config_name='default', version=0.0.0, splits={'train': SplitInfo(name='train', num_bytes=125183220, num_examples=5880, shard_lengths=None, dataset_name='sdgi-corpus'), 'test': SplitInfo(name='test', num_bytes=37231371, num_examples=1470, shard_lengths=None, dataset_name='sdgi-corpus')}, download_checksums

In [5]:
# Show the available splits with their columns and row counts
print(ds)

DatasetDict({
    train: Dataset({
        features: ['text', 'embedding', 'labels', 'metadata'],
        num_rows: 5880
    })
    test: Dataset({
        features: ['text', 'embedding', 'labels', 'metadata'],
        num_rows: 1470
    })
})


Because the dataset already came pre-split into train and test, we combine them to have a larger sample of texts: 5880 reports in Train, 1470 in Test, so 7350 in the Combined dataset.

In [128]:
# Convert train split to dataframe:
train_df = ds['train'].to_pandas()

# Convert test split to dataframe:
test_df = ds['test'].to_pandas()

# Combine both splits into one dataframe:
# (the 'split' column keeps each row's origin traceable after concatenation)
train_df['split'] = 'train'
test_df['split'] = 'test'
full_df = pd.concat([train_df, test_df], ignore_index=True)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" after every report.
for frame in (train_df, test_df, full_df):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5880 entries, 0 to 5879
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       5880 non-null   object
 1   embedding  5880 non-null   object
 2   labels     5880 non-null   object
 3   metadata   5880 non-null   object
 4   split      5880 non-null   object
dtypes: object(5)
memory usage: 229.8+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       1470 non-null   object
 1   embedding  1470 non-null   object
 2   labels     1470 non-null   object
 3   metadata   1470 non-null   object
 4   split      1470 non-null   object
dtypes: object(5)
memory usage: 57.6+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7350 entries, 0 to 7349
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
--- 

In [7]:
# Preview the first rows of the combined frame
full_df.head(20)

Unnamed: 0,text,embedding,labels,metadata,split
0,END POVERTY\r\nIN ALL ITS FORMS \r\nEVERYWHERE...,"[-0.0066694277, -0.0135167073, 0.0032988016000...",[1],"{'country': 'arg', 'file_id': 'sdg-01/vlr-repo...",train
1,PONER FIN A LA POBREZA\r\nEN TODAS SUS FORMAS ...,"[-0.0120671522, -0.0182979926, 0.0129378373000...",[1],"{'country': 'arg', 'file_id': 'sdg-01/vlr-repo...",train
2,End poverty in all its \r\nforms everywhere\r\...,"[-0.0063659865, -0.0047052945, 0.0053374511000...",[1],"{'country': 'arg', 'file_id': 'sdg-01/vlr-repo...",train
3,Poner fin a la pobreza\r\nen todas sus formas\...,"[-0.012985382200000001, -0.0147539768, 0.00048...",[1],"{'country': 'arg', 'file_id': 'sdg-01/vlr-repo...",train
4,"15\r\nWith regard to this SDG in particular, i...","[-0.007488854200000001, -0.0136874933, 0.02816...",[1],"{'country': 'arg', 'file_id': 'sdg-01/vlr-repo...",train
5,Informe Voluntario Local CÓRDOBA 2022 | Provin...,"[-0.0177864786, -0.0321719944, 0.0175944883, -...",[1],"{'country': 'arg', 'file_id': 'sdg-01/vlr-repo...",train
6,9\r\n4. PROGRESO DE LOS OBJETIVOS\r\n \r\n1. F...,"[-0.014082906800000001, -0.0218088981, 0.01269...",[1],"{'country': 'arg', 'file_id': 'sdg-01/vlr-repo...",train
7,ODS 1. Poner fin a la pobreza en todas sus for...,"[-0.0198593847, -0.014584235800000001, -0.0031...",[1],"{'country': 'arg', 'file_id': 'sdg-01/vlr-repo...",train
8,"ࠀ߿ |Fin de la pobreza\r\nࠀ.ࠃ De aquí a ߿ࠂ߿ࠁ, g...","[-0.0218512956, -0.0250046607, -0.0008152561, ...",[1],"{'country': 'arg', 'file_id': 'sdg-01/vlr-repo...",train
9,22 melbourne.vic.gov.au\r\nContext\r\nPoverty ...,"[0.0011064174, -0.0195970125, 0.0097718257, -0...",[1],"{'country': 'aus', 'file_id': 'sdg-01/vlr-repo...",train


In [None]:
# Drop the split column. errors='ignore' keeps the cell re-runnable: without it
# a second execution raises KeyError because 'split' is already gone.
full_df = full_df.drop(columns=['split'], errors='ignore')

Checking unique labels for SDGs:

In [9]:
# Flatten the per-document label arrays into a single array ...
all_labels = np.concatenate(full_df['labels'].values)

# ... and collect the distinct SDG ids that occur in it (np.unique sorts them)
unique_labels = np.unique(all_labels)
num_unique_labels = unique_labels.size if False else len(unique_labels)

print(f"Number of unique labels: {num_unique_labels}")
print("Unique labels:", unique_labels)

Number of unique labels: 17
Unique labels: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]


Checking for the reports with more than one SDG label:

In [10]:
# Number of SDG labels attached to each document
labels_per_doc = full_df['labels'].apply(len)
num_multi_label = (labels_per_doc > 1).sum()
print(f"Number of documents with more than one label: {num_multi_label}")

Number of documents with more than one label: 809


We decided to keep the multi-labeled documents, since we want to recognize that documents can be relevant to multiple SDGs.

# Step 2: Filtering the dataset

As UNDP reports come in six official UN languages, we have to filter and only keep the documents that have text data in English:

In [11]:
# Boolean mask: True where the document's metadata marks it as English
is_english = full_df['metadata'].apply(lambda meta: meta['language'] == 'en')
df_english = full_df[is_english]

Check the shape before and after filtering:

In [12]:
# Compare row counts before and after the language filter
print(f"Original shape: {full_df.shape}")
print(f"After filtering for English: {df_english.shape}")

Original shape: (7350, 4)
After filtering for English: (5282, 4)


# Step 3: Creating Separate DataFrames - one per each SDG

In [13]:
sdg_dfs = {}

# The SDG a report is filed under is the prefix of its file_id ('sdg-01/...'),
# so every document ends up in at most one per-SDG frame; documents whose
# file_id has no 'sdg-NN' prefix are not assigned to any frame.
file_ids = df_english['metadata'].apply(lambda meta: meta['file_id'])

for i in range(1, 18):
    sdg_num = f"sdg-{i:02d}"
    df_name = f"df_sdg{i:02d}"

    # .copy() makes each frame independent of df_english, so the
    # 'processed_text' column added later is not written into a slice.
    in_this_sdg = file_ids.apply(lambda file_id: file_id.startswith(sdg_num))
    sdg_dfs[df_name] = df_english[in_this_sdg].copy()

    print(f"{df_name}: {len(sdg_dfs[df_name])} documents")

# Also expose every frame as a module-level variable (df_sdg01 ... df_sdg17)
globals().update(sdg_dfs)

df_sdg01: 284 documents
df_sdg02: 265 documents
df_sdg03: 317 documents
df_sdg04: 302 documents
df_sdg05: 290 documents
df_sdg06: 251 documents
df_sdg07: 245 documents
df_sdg08: 312 documents
df_sdg09: 262 documents
df_sdg10: 265 documents
df_sdg11: 300 documents
df_sdg12: 261 documents
df_sdg13: 279 documents
df_sdg14: 212 documents
df_sdg15: 249 documents
df_sdg16: 275 documents
df_sdg17: 310 documents


# Step 4: Text Preprocessing

In [None]:
# Function to preprocess text: remove special symbols, punctuation and numbers, lowercase, remove stopwords, and lemmatize
def preprocess_text(text, remove_stopwords=True, lemmatize=True, use_spacy=False):
    """Normalise one raw document into a space-joined string of tokens.

    Steps, in order: lowercase; strip URLs, punctuation and digits; collapse
    whitespace; tokenize; optionally drop stop words; optionally lemmatize.

    Parameters
    ----------
    text : str
        Raw document text. Anything that is not a non-blank string yields "".
    remove_stopwords : bool, default True
        Drop the NLTK English stop words plus the custom list below.
    lemmatize : bool, default True
        Lemmatize every remaining token with the WordNet lemmatizer `lmtzr`.
    use_spacy : bool, default False
        Tokenize with the preloaded spaCy model `sp` instead of NLTK's
        `word_tokenize`.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    text = text.lower()
    text = re.sub(r'http\S+|www\S+', '', text)   # URLs
    text = re.sub(r'[^\w\s]', '', text)          # punctuation / special symbols
    text = re.sub(r'\d+', '', text)              # numbers
    text = re.sub(r'\s+', ' ', text).strip()     # collapse whitespace

    if use_spacy:
        # Only token boundaries are needed, so run the tokenizer alone instead
        # of the whole pipeline: the stock en_core_web_sm components only add
        # annotations, so the token texts are unchanged and this is far faster.
        tokens = [token.text for token in sp.tokenizer(text)]
    else:
        tokens = word_tokenize(text)  # Default: Use NLTK tokenization

    if remove_stopwords:
        # Generic report vocabulary that carries no SDG-specific signal.
        # NOTE(review): matching happens on surface forms *before*
        # lemmatization, so inflected forms of these entries (e.g.
        # 'countries') are not removed here.
        custom_stopwords = {'per', 'also', 'percent', 'target', 'sdg', 'country', 'national', 'government', 'development', 'year', 'system', 'sector',
            'policy', 'support', 'access', 'public', 'new', 'level', 'sdgs', 'people'}
        stopwords_set = STOP_WORDS.union(custom_stopwords)
        tokens = [token for token in tokens if token not in stopwords_set]

    if lemmatize:
        tokens = [lmtzr.lemmatize(token) for token in tokens]

    return ' '.join(tokens)

Apply preprocessing to all SDG DataFrames:

In [15]:
def _clean_document(raw_text):
    """Apply the notebook's standard cleaning (stop words, lemmas, spaCy tokens) to one text."""
    return preprocess_text(raw_text, remove_stopwords=True, lemmatize=True, use_spacy=True)

for sdg, df in sdg_dfs.items():
    print(f"Preprocessing {sdg} documents...")
    raw_texts = df["text"].astype(str)
    # progress_apply is the tqdm-wrapped apply enabled by tqdm.pandas()
    df["processed_text"] = raw_texts.progress_apply(_clean_document)

Preprocessing df_sdg01 documents...


100%|██████████| 284/284 [01:37<00:00,  2.92it/s]


Preprocessing df_sdg02 documents...


100%|██████████| 265/265 [01:16<00:00,  3.49it/s]


Preprocessing df_sdg03 documents...


100%|██████████| 317/317 [01:44<00:00,  3.03it/s]


Preprocessing df_sdg04 documents...


100%|██████████| 302/302 [02:00<00:00,  2.52it/s]


Preprocessing df_sdg05 documents...


100%|██████████| 290/290 [01:54<00:00,  2.54it/s]


Preprocessing df_sdg06 documents...


100%|██████████| 251/251 [01:12<00:00,  3.46it/s]


Preprocessing df_sdg07 documents...


100%|██████████| 245/245 [01:10<00:00,  3.49it/s]


Preprocessing df_sdg08 documents...


100%|██████████| 312/312 [02:03<00:00,  2.53it/s]


Preprocessing df_sdg09 documents...


100%|██████████| 262/262 [01:35<00:00,  2.74it/s]


Preprocessing df_sdg10 documents...


100%|██████████| 265/265 [01:17<00:00,  3.41it/s]


Preprocessing df_sdg11 documents...


100%|██████████| 300/300 [01:54<00:00,  2.61it/s]


Preprocessing df_sdg12 documents...


100%|██████████| 261/261 [01:07<00:00,  3.85it/s]


Preprocessing df_sdg13 documents...


100%|██████████| 279/279 [01:32<00:00,  3.01it/s]


Preprocessing df_sdg14 documents...


100%|██████████| 212/212 [00:54<00:00,  3.90it/s]


Preprocessing df_sdg15 documents...


100%|██████████| 249/249 [01:23<00:00,  2.96it/s]


Preprocessing df_sdg16 documents...


100%|██████████| 275/275 [01:55<00:00,  2.38it/s]


Preprocessing df_sdg17 documents...


100%|██████████| 310/310 [02:01<00:00,  2.55it/s]


Preprocessing Results:

In [None]:
# Display a sample of preprocessed text
# Three reproducible random rows (raw vs. processed) from every non-empty SDG frame
sdg_samples = {
    sdg: df[['text', 'processed_text']].sample(3, random_state=42)
    for sdg, df in sdg_dfs.items()
    if not df.empty
}

# Stack the samples; the outer index level (the frame name) becomes the 'SDG' column
stacked = pd.concat(sdg_samples.values(), keys=sdg_samples.keys())
sample_df = stacked.reset_index(level=0).rename(columns={'level_0': 'SDG'})

display(sample_df)

Unnamed: 0,SDG,text,processed_text
22,df_sdg01,109\r\nsdg 1. no poVertY \r\nEnd poverty in al...,poverty end poverty form everywhere eradicate ...
5918,df_sdg01,37\r\nData deployment\r\nOne weakness of publi...,data deployment one weakness policy availabili...
231,df_sdg01,Voluntary National Review | 20\r\nSDG 1 – No P...,voluntary review poverty general situation sou...
587,df_sdg02,2019 RWANDA VOLUNTARY NATIONAL REVIEW (VNR) RE...,rwanda voluntary review vnr report end hunger ...
508,df_sdg02,60 VOLUNTARY NATIONAL REVIEW | ICELAND 2023\r\...,voluntary review iceland goal zero hunger supp...
481,df_sdg02,24 | VOLUNTARY NATIONAL REVIEW • FIJI’S PROGRE...,voluntary review fiji progress implementation ...
741,df_sdg03,14\r\nAccording to the OECD \r\nBetter Life in...,according oecd better life index belgium rank ...
6045,df_sdg03,From Agenda to Action — 31\r\nHealth and welfa...,agenda action health welfare promotion hyte in...
679,df_sdg03,5th Monitoring Report - 23 \r\nSDG 3 – Summary...,th monitoring report summary content ensure he...
1261,df_sdg04,100 KINGDOM OF THE NETHERLANDS - VNR 2022\r\nS...,kingdom netherlands vnr quality education seve...


# Step 5: **TF-IDF** on the UNDP reports to generate dictionaries

Prepare SDG text for TF-IDF:

In [17]:
# One pseudo-document per SDG: all of its processed reports joined by spaces
sdg_corpus = {
    frame_name: " ".join(frame['processed_text'])
    for frame_name, frame in sdg_dfs.items()
}

Define TF-IDF Vectorizer with unigrams & bigrams:

We experimented with min_df and max_df and discovered that very generous bounds (0.9 and 0.1) give us the most meaningful dictionaries.

In [108]:
# Float min_df / max_df are document-frequency proportions; the corpus fitted
# below has one pseudo-document per SDG, so they are fractions of those documents.
# max_features keeps only the 500 most frequent remaining terms.
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=0.1, max_features=500) 

Fit TF-IDF on the SDG corpus:

In [109]:
# Rows of the matrix follow the iteration order of sdg_corpus (one row per SDG)
tfidf_matrix = vectorizer.fit_transform(sdg_corpus.values())

Extract feature names (terms):

In [110]:
# Vocabulary terms, positionally aligned with the columns of tfidf_matrix
feature_names = vectorizer.get_feature_names_out()

SDG names:

In [54]:
# Human-readable goal titles, keyed by the per-SDG frame names (df_sdg01 ... df_sdg17)
sdg_names = {
    'df_sdg01': 'No Poverty',
    'df_sdg02': 'Zero Hunger',
    'df_sdg03': 'Good Health and Well-being',
    'df_sdg04': 'Quality Education',
    'df_sdg05': 'Gender Equality',
    'df_sdg06': 'Clean Water and Sanitation',
    'df_sdg07': 'Affordable and Clean Energy',
    'df_sdg08': 'Decent Work and Economic Growth',
    'df_sdg09': 'Industry, Innovation and Infrastructure',
    'df_sdg10': 'Reduced Inequality',
    'df_sdg11': 'Sustainable Cities and Communities',
    'df_sdg12': 'Responsible Consumption and Production',
    'df_sdg13': 'Climate Action',
    'df_sdg14': 'Life Below Water',
    'df_sdg15': 'Life on Land',
    'df_sdg16': 'Peace, Justice and Strong Institutions',
    'df_sdg17': 'Partnerships for the Goals'
}

Compute TF-IDF Scores:

Previously we tried with 20 terms, and the performance was already good; we now increase it to 50 and also keep the TF-IDF scores to get even better performance, i.e. we weight terms that might be rare higher when they appear in a speech.

In [111]:
n_top_terms = 50

# SDG frame name -> list of (term, TF-IDF score), highest score first
tfidf_sdg_dicts = {}

for row_idx, sdg in enumerate(sdg_corpus):
    # Dense TF-IDF weights of every vocabulary term for this SDG's pseudo-document
    tfidf_scores = tfidf_matrix[row_idx].toarray().ravel()
    # argsort is ascending: take the last n_top_terms positions and reverse them
    top_indices = tfidf_scores.argsort()[-n_top_terms:][::-1]
    tfidf_sdg_dicts[sdg] = [
        (feature_names[term_idx], tfidf_scores[term_idx]) for term_idx in top_indices
    ]

#### Looking at the dictionaries, evaluating the results:

In [112]:
# Inspect the raw (term, score) lists per SDG
tfidf_sdg_dicts

{'df_sdg01': [('social protection', 0.5204973493351615),
  ('poverty line', 0.32045497673694034),
  ('poverty rate', 0.2727177739462925),
  ('poverty form', 0.2475779639972804),
  ('end poverty', 0.21796592149320282),
  ('extreme poverty', 0.2092362398848441),
  ('pension', 0.1806588660838548),
  ('living poverty', 0.1587353082558311),
  ('social assistance', 0.15814739970673544),
  ('risk poverty', 0.1538577271900391),
  ('elderly', 0.14914859874364758),
  ('everywhere', 0.14179620303093257),
  ('social service', 0.13444380731821753),
  ('exclusion', 0.1318179517065336),
  ('basic service', 0.12026418701512429),
  ('poor vulnerable', 0.11763833140344034),
  ('social exclusion', 0.11346634997546445),
  ('deprivation', 0.1129170444184407),
  ('person disability', 0.09610631538763208),
  ('wage', 0.08822874855258026),
  ('population living', 0.08665697598322859),
  ('proportion population', 0.08507772181855953),
  ('selangor', 0.08095449757449326),
  ('parent', 0.07554710726743005),
  ('

# Step 6: Adjusting weights

Previously we encountered the situation that a bigram and the unigrams it consists of all received very high TF-IDF scores, e.g. "energy", "renewable", and "renewable energy". If we keep the unigrams only, we miss the context of phrases like “climate change”. On the other hand, if we keep the bigrams only, matching fails when the words appear separately in different contexts. Thus, we decided to keep both bigrams and unigrams. (However, keeping both might slightly increase computational cost.) We decided to take the following step to handle this tradeoff.

First, we store bigrams for each SDG:

In [113]:
# Per SDG: the set of top terms that are bigrams (i.e. contain a space)
bigram_sets = {
    sdg: {term for term, _score in terms if " " in term}
    for sdg, terms in tfidf_sdg_dicts.items()
}

*Adjust unigram scores if a related bigram exists:*

Keeps unigrams if they are useful but lowers their importance if they are part of a bigram.

Allows identifying “climate change” as a phrase, but still recognizes “climate” and “change” separately.

Balances unigrams and bigrams dynamically rather than removing one or the other.

In [114]:
# NOTE: this cell rewrites tfidf_sdg_dicts in place, so running it twice halves
# the affected unigram scores twice; re-run the TF-IDF scoring cell first.
for sdg, terms in tfidf_sdg_dicts.items():
    # Individual words that occur inside any of this SDG's top bigrams
    bigram_words = {word for bigram in bigram_sets[sdg] for word in bigram.split()}

    adjusted_terms = []

    for term, score in terms:
        # Whole-word membership: a plain substring test (`term in bigram`) would
        # also penalise unrelated unigrams, e.g. 'rm' because of 'poverty form'.
        if " " not in term and term in bigram_words:
            score *= 0.5  # Reduce importance of unigram if it's part of a bigram
        adjusted_terms.append((term, score))

    tfidf_sdg_dicts[sdg] = adjusted_terms

# Readable overview: one row per SDG with its comma-joined top terms
df_tfidf = pd.DataFrame({
    "SDG Name": [sdg_names.get(sdg, sdg) for sdg in tfidf_sdg_dicts.keys()],  # Use full SDG names
    "Top Terms": [", ".join([term[0] for term in terms]) for terms in tfidf_sdg_dicts.values()]
})

Inspecting the results:

In [115]:
# Print each SDG title followed by its comma-separated term list
for sdg_title, top_terms in zip(df_tfidf['SDG Name'], df_tfidf['Top Terms']):
    print(f"{sdg_title}:\n{top_terms}\n")

No Poverty:
social protection, poverty line, poverty rate, poverty form, end poverty, extreme poverty, pension, living poverty, social assistance, risk poverty, elderly, everywhere, social service, exclusion, basic service, poor vulnerable, social exclusion, deprivation, person disability, wage, population living, proportion population, selangor, parent, homeless, men woman, homelessness, debt, unemployed, labour market, rm, healthcare, older, household income, child age, aged year, woman child, health insurance, education health, eradicate, disabled, violence, equal right, age group, gini, sex, monetary, health education, child aged, minimum wage

Zero Hunger:
stunting, malnutrition, sustainable agriculture, end hunger, obesity, food insecurity, overweight, food production, genetic, nutritional, genetic resource, vegetable, agricultural production, nutritious, zero hunger, meal, seed, ane, soil, food nutrition, diet, agricultural land, hectare, child year, rice, irrigation, pregnant, 

# Step 7: Storing the preliminary results for further comparison

*Store top TF-IDF terms per SDG:*

(Save to JSON file.)

In [116]:
# Convert the (term, score) lists into plain {term: float} mappings so that the
# scores are JSON-serialisable
sdg_dict_json = {}
for sdg, terms in tfidf_sdg_dicts.items():
    sdg_dict_json[sdg] = {term: float(score) for term, score in terms}

with open("sdg_dictionaries.json", "w") as out_file:
    json.dump(sdg_dict_json, out_file, indent=4)

print(" SDG dictionaries saved as 'sdg_dictionaries.json'")

 SDG dictionaries saved as 'sdg_dictionaries.json'


We also create a dictionary, that only stores terms with high TF-IDF scores:

In [117]:
# Keep only the terms whose (adjusted) TF-IDF score exceeds 0.1
filtered_sdg_dicts = {}
for sdg, terms in sdg_dict_json.items():
    filtered_sdg_dicts[sdg] = {
        term: score for term, score in terms.items() if score > 0.1
    }

with open("filtered_sdg_dictionaries.json", "w") as out_file:
    json.dump(filtered_sdg_dicts, out_file, indent=4)

# Step 8: Creating a second set of dictionaries (we'll refer to them as "Description dictionaries")

To further improve our dictionaries, we create 17 separate dictionaries on the basis of SDG descriptions from the SDG website: https://www.globalgoals.org/

Each entry consists of the SDG number and the corresponding textual description, which is highly indicative of its respective SDG. For example, the textual description of SDG 1 "No poverty" is very densely loaded with terms like "poverty", "extreme poverty", "poor", "vulnerable", etc. These texts will help us identify specific terms that are indicative of each SDG. After preprocessing, each SDG description is processed separately, and a term frequency (TF) dictionary is generated for each SDG, keeping only the top 10 most frequent unigrams and bigrams.

Following the creation of the Description dictionaries, the next step is a comparison with the dictionaries created earlier using the TF-IDF method applied to the corpus of pre-labeled UNDP text. The goal of comparing these two sets of dictionaries is to upweight words that appear in both (UNDP dataset and Descriptions dataset) while still keeping the words that appear in only one of them. This is achieved by iterating through both dictionaries and adjusting word importance based on overlap. Words that appear in both dictionaries receive a higher weight: their TF-IDF score from the UNDP dictionary is multiplied by (1 + the relative frequency of the word in the Descriptions dataset). Words that appear only in the UNDP dictionary retain their TF-IDF score, and words that appear only in the Description dictionary enter with their relative frequency (their count divided by the total count in that SDG's Description dictionary). This step ensures that the most relevant words receive higher importance in subsequent analyses.

In [118]:
# NOTE: this rebinds `full_df`, which held the combined UNDP corpus earlier in
# the notebook; from here on it is the SDG description table.
full_df = pd.read_csv("sdg_key.csv")


def _to_british_programme(value):
    """Rewrite the whole words 'program'/'programs' to 'programme'/'programmes'.

    Non-string cells are returned unchanged. The word boundaries matter: a
    plain str.replace also hits words that already contain "program"
    ("programme" -> "programmeme", "programming" -> "programmeming").
    """
    if isinstance(value, str):
        return re.sub(r'\bprogram(s?)\b', r'programme\1', value)
    return value


# Replace occurrences of "program" in all string columns
# (column-wise Series.map instead of the deprecated DataFrame.applymap)
full_df = full_df.apply(lambda column: column.map(_to_british_programme))

# Use the same keys as the UNDP dictionaries: df_sdg01 ... df_sdg17
full_df['sdg'] = full_df['sdg'].apply(lambda x: f"df_sdg{int(x):02d}")

full_df.head(20)

Unnamed: 0,sdg,text
0,df_sdg01,NO POVERTY\n\nEND POVERTY IN ALL ITS FORMS EVE...
1,df_sdg02,"ZERO HUNGER\n\nEND HUNGER, ACHIEVE FOOD SECURI..."
2,df_sdg03,GOOD HEALTYH AND WELL-BEING\nENSURE HEALTHY LI...
3,df_sdg04,QUALITY EDUCATION\n\nENSURE INCLUSIVE AND EQUI...
4,df_sdg05,GENDER EQUALITY\n\nACHIEVE GENDER EQUALITY AND...
5,df_sdg06,CLEAN WATER AND SANITATION\n\nENSURE AVAILABIL...
6,df_sdg07,AFFORDABLE AND CELAN ENERGY\n\nENSURE ACCESS T...
7,df_sdg08,DECENT WORK AND ECONOMIC GROWTH\n\nPROMOTE SUS...
8,df_sdg09,INDUSTRY INNOVATION AND INFRASTRUCTURE\n\nBUIL...
9,df_sdg10,REDUCED INEQUALITITES\n\nREDUCE INEQUALITY WIT...


We use the same preprocessing steps as for the previous set of UNDP-based dictionaries, but with an extended custom stop-word list and with stop words removed after lemmatization:

In [119]:
# Initialize Lemmatizer
# (rebinds the module-level `lmtzr` created in the setup cell)
lmtzr = nltk.WordNetLemmatizer()

# Custom Stopwords
# NOTE(review): the multi-word entries ('developing country', 'developed country',
# 'least country') are compared against individual tokens by the filter below,
# so they are not expected to ever match; 'level' is listed twice (harmless in a set).
custom_stopwords = {'per', 'cent', 'also', 'percent', 'target', 'sdg', 'country', 'national', 'government', 'implement', 'extreme', 'supply', 'level', 'enhance', 'world',
                    'development', 'year', 'system', 'sector', 'policy', 'support', 'access', 'end', 'plant', 'global', 'universal', 'use', 'campaign', 'promote',
                    'public', 'new', 'level', 'sdgs', 'people', 'reduce', 'developing country', 'developing', 'developed country', 'developed', 'least', 'least country', 'programme'}
# NLTK English stop words plus the custom list above
stopwords_set = set(stopwords.words("english")).union(custom_stopwords)

# Preprocessing Function
def preprocess_text(text, remove_stopwords=True, lemmatize=True, use_spacy=False):
    """Normalise one SDG description into a space-joined string of tokens.

    This redefinition replaces the earlier `preprocess_text` of the notebook.
    It differs from it in three ways: 'program(s)' is rewritten to
    'programme(s)', stop words are removed *after* lemmatization, and the
    module-level `stopwords_set` is used.

    Parameters
    ----------
    text : str
        Raw text. Anything that is not a non-blank string yields "".
    remove_stopwords : bool, default True
        Drop every token contained in `stopwords_set`.
    lemmatize : bool, default True
        Lemmatize every token with the WordNet lemmatizer `lmtzr`.
    use_spacy : bool, default False
        Tokenize with the preloaded spaCy model `sp` instead of NLTK's
        `word_tokenize`.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    text = text.lower()
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    # Whole words only: an unanchored 'program' pattern would turn an existing
    # 'programme' into 'programmeme', which the 'programme' stop word misses.
    text = re.sub(r'\bprogram(s?)\b', r'programme\1', text)
    text = re.sub(r'\s+', ' ', text).strip()

    if use_spacy:
        # The spaCy model is bound to `sp` in the setup cell (there is no `nlp`);
        # only token boundaries are needed, so the tokenizer alone is enough.
        tokens = [token.text for token in sp.tokenizer(text)]
    else:
        tokens = word_tokenize(text)  # Default: Use NLTK tokenization

    if lemmatize:
        tokens = [lmtzr.lemmatize(token) for token in tokens]

    if remove_stopwords:
        # NOTE(review): filtering lemmas means a stop word whose lemma differs
        # from its surface form is only removed if the lemma is also listed.
        tokens = [token for token in tokens if token not in stopwords_set]

    return ' '.join(tokens)

# Process text for each SDG and store in dictionaries
# (keyed by SDG frame name; a repeated key would overwrite the earlier entry)
sdg_dicts = defaultdict(dict)

for sdg_num, raw_text in zip(full_df["sdg"], full_df["text"]):
    sdg_dicts[sdg_num] = {
        "sdg": sdg_num,
        "processed_text": preprocess_text(raw_text),
    }

# Convert dictionary to a list of dictionaries, ordered by SDG key
sdg_list = [sdg_dicts[sdg_key] for sdg_key in sorted(sdg_dicts)]

# Convert to pandas df for easier display
processed_df = pd.DataFrame(sdg_list)
display(processed_df)


Unnamed: 0,sdg,processed_text
0,df_sdg01,poverty poverty form everywhere eradicating po...
1,df_sdg02,zero hunger hunger achieve food security impro...
2,df_sdg03,good healtyh wellbeing ensure healthy life wel...
3,df_sdg04,quality education ensure inclusive equitable q...
4,df_sdg05,gender equality achieve gender equality empowe...
5,df_sdg06,clean water sanitation ensure availability sus...
6,df_sdg07,affordable celan energy ensure affordable reli...
7,df_sdg08,decent work economic growth sustained inclusiv...
8,df_sdg09,industry innovation infrastructure build resil...
9,df_sdg10,reduced inequalitites inequality within among ...


Instead of TF-IDF method, we just count the TF in our texts

In [120]:
# Dictionary to store term frequency for each SDG
sdg_def_dict = {}

# Process term frequency for each SDG separately
for sdg in sdg_list:
    sdg_num = sdg["sdg"]

    # Count unigrams and bigrams (ngram_range=(1, 2)); max_features keeps the 10
    # most frequent terms. Dedicated names are used so that the TF-IDF
    # `vectorizer` / `feature_names` from Step 5 are not overwritten.
    count_vectorizer = CountVectorizer(ngram_range=(1,2), max_features=10)
    term_counts = count_vectorizer.fit_transform([sdg["processed_text"]])

    # Convert to dictionary format (plain Python ints instead of numpy scalars)
    count_terms = count_vectorizer.get_feature_names_out()
    term_freq_dict = {
        term: int(count)
        for term, count in zip(count_terms, term_counts.toarray().ravel())
    }

    # Sort dictionary by frequency
    sorted_term_freq_dict = dict(sorted(term_freq_dict.items(), key=lambda item: item[1], reverse=True))

    # Store in main dictionary
    sdg_def_dict[sdg_num] = sorted_term_freq_dict

sdg_def_dict

{'df_sdg01': {'poverty': 14,
  'resource': 5,
  'economic': 4,
  'social': 4,
  'including': 3,
  'life': 3,
  'poor': 3,
  'poor vulnerable': 3,
  'service': 3,
  'vulnerable': 3},
 'df_sdg02': {'food': 16,
  'agricultural': 9,
  'including': 7,
  'market': 6,
  'hunger': 5,
  'production': 5,
  'ensure': 4,
  'sustainable': 4,
  'export': 3,
  'technology': 3},
 'df_sdg03': {'health': 16,
  'disease': 9,
  'death': 7,
  'mortality': 6,
  'medicine': 5,
  'risk': 4,
  'vaccine': 4,
  'abuse': 3,
  'live': 3,
  'noncommunicable': 3},
 'df_sdg04': {'education': 19,
  'ensure': 7,
  'including': 6,
  'quality': 6,
  'education ensure': 5,
  'sustainable': 5,
  'vocational': 5,
  'equal': 4,
  'increase': 4,
  'school': 4},
 'df_sdg05': {'woman': 14,
  'equal': 7,
  'gender': 7,
  'right': 7,
  'equality': 6,
  'girl': 6,
  'woman girl': 6,
  'gender equality': 5,
  'work': 5,
  'equal right': 4},
 'df_sdg06': {'water': 21,
  'sanitation': 10,
  'hygiene': 5,
  'management': 5,
  'water s

We compare the Description dictionaries with the previously created UNDP dictionaries.

In [121]:
with open(r"filtered_sdg_dictionaries.json") as file:
    sdg_undp_dict = json.load(file)

# SDG key -> {term: combined weight}, sorted by weight in descending order
weighted_sdg_dicts = {}

for sdg_key, tf_dict in sdg_def_dict.items():
    # Only SDGs present in both sources are combined
    if sdg_key not in sdg_undp_dict:
        continue

    json_dict = sdg_undp_dict[sdg_key]      # filtered UNDP TF-IDF scores
    total_sdg_freq = sum(tf_dict.values())  # normaliser for description counts

    weighted_dict = {}
    for word in set(json_dict.keys()).union(set(tf_dict.keys())):
        if word not in tf_dict:
            # UNDP-only term: keep its TF-IDF score
            weighted_dict[word] = json_dict[word]
            continue

        relative_weight = tf_dict[word] / total_sdg_freq
        if word in json_dict:
            # Term in both sources: boost the TF-IDF score
            weighted_dict[word] = json_dict[word] * (1 + relative_weight)
        else:
            # Description-only term: enters with its relative frequency
            weighted_dict[word] = relative_weight

    ranked_items = sorted(weighted_dict.items(), key=lambda item: item[1], reverse=True)
    weighted_sdg_dicts[sdg_key] = dict(ranked_items)

weighted_sdg_dicts


{'df_sdg01': {'social protection': 0.5204973493351615,
  'poverty line': 0.32045497673694034,
  'poverty': 0.3111111111111111,
  'poverty rate': 0.2727177739462925,
  'poverty form': 0.2475779639972804,
  'end poverty': 0.21796592149320282,
  'extreme poverty': 0.2092362398848441,
  'pension': 0.1806588660838548,
  'living poverty': 0.1587353082558311,
  'social assistance': 0.15814739970673544,
  'risk poverty': 0.1538577271900391,
  'elderly': 0.14914859874364758,
  'everywhere': 0.14179620303093257,
  'social service': 0.13444380731821753,
  'poor vulnerable': 0.12548088683033637,
  'basic service': 0.12026418701512429,
  'social exclusion': 0.11346634997546445,
  'deprivation': 0.1129170444184407,
  'resource': 0.1111111111111111,
  'social': 0.08888888888888889,
  'economic': 0.08888888888888889,
  'service': 0.06666666666666667,
  'poor': 0.06666666666666667,
  'life': 0.06666666666666667,
  'including': 0.06666666666666667,
  'vulnerable': 0.06666666666666667},
 'df_sdg02': {'st

Let's see how creating an interaction of these two sets of terms has improved our final dictionaries:

In [122]:
# One row per (SDG, term): the filtered UNDP TF-IDF score next to the combined
# weight. NOTE: the 'def_dict' column holds the combined (weighted) score, not
# the raw description counts.
data = []

# Iterate through SDG keys in both dictionaries
for sdg_key in set(weighted_sdg_dicts.keys()).union(sdg_undp_dict.keys()):
    undp_dict = sdg_undp_dict.get(sdg_key, {})
    def_dict = weighted_sdg_dicts.get(sdg_key, {})

    # Get all unique words across both dictionaries
    all_words = set(undp_dict.keys()).union(set(def_dict.keys()))

    # Collect data for each word (0 when a term is missing from one side)
    for word in all_words:
        data.append([sdg_key, word, undp_dict.get(word, 0), def_dict.get(word, 0)])

# Create DataFrame
df_sdg_terms = pd.DataFrame(data, columns=["sdg", "term", "undp_dict", "def_dict"])

# Convert SDG labels into sortable numerical values.
# Raw string: "\d" is an invalid escape sequence in a normal string literal.
# expand=False returns a Series, so a Series (not a DataFrame) is assigned.
df_sdg_terms["sdg_numeric"] = df_sdg_terms["sdg"].str.extract(r"(\d+)", expand=False).astype(int)

# Sort DataFrame by SDG number (ascending), then by combined weight (descending)
df_sdg_terms = df_sdg_terms.sort_values(by=["sdg_numeric", "def_dict"], ascending=[True, False])

# Drop the temporary numeric column
df_sdg_terms = df_sdg_terms.drop(columns=["sdg_numeric"])

# Display sorted DataFrame
df_sdg_terms


Unnamed: 0,sdg,term,undp_dict,def_dict
430,df_sdg01,social protection,0.520497,0.520497
439,df_sdg01,poverty line,0.320455,0.320455
453,df_sdg01,poverty,0.000000,0.311111
445,df_sdg01,poverty rate,0.272718,0.272718
433,df_sdg01,poverty form,0.247578,0.247578
...,...,...,...,...
233,df_sdg17,goal,0.000000,0.098592
230,df_sdg17,trade,0.000000,0.070423
234,df_sdg17,resource,0.000000,0.070423
237,df_sdg17,innovation,0.000000,0.056338


### We are satisfied with the results, so we store the Final Dictionary that we will use for analyzing our main corpus in the next notebook named "NLP_2_Analysis_Final"

In [123]:
# Cast every weight to a built-in float so the structure is JSON-serialisable
sdg_dict_w_json = {}
for sdg, terms in weighted_sdg_dicts.items():
    sdg_dict_w_json[sdg] = {term: float(score) for term, score in terms.items()}

with open("sdg_dict_w_json.json", "w") as out_file:
    json.dump(sdg_dict_w_json, out_file, indent=4)

print("SDG dictionaries saved as 'sdg_dict_w_json.json'")

SDG dictionaries saved as 'sdg_dict_w_json.json'
