# Part 3: Generating Vector Embeddings

In [59]:
import pandas as pd

import nltk
from nltk.corpus import stopwords
import string

In [60]:
df_UoA = pd.read_csv('../data/givenData/UoA-SDG-Keyword-List-Ver.-1.1.xlsx - SDG Keywords (initial ver.).csv')
df_UoA_keywords_sdg = df_UoA[["SDG Keywords", "SDG"]]
df_UoA_alt_sdg = df_UoA[["Alternatives", "SDG"]]
df_UoA_alt_sdg = df_UoA_alt_sdg.rename(columns={"Alternatives": "SDG Keywords"})
df_UoA = pd.concat([df_UoA_keywords_sdg, df_UoA_alt_sdg], axis=0, ignore_index=True)
df_UoA.dropna(inplace=True, ignore_index=True)
df_keywords = df_UoA.copy()




In [61]:
df_keywords

Unnamed: 0,SDG Keywords,SDG
0,Child Labor Laws,SDG1
1,Child Labour,SDG1
2,Child Support Grant,SDG1
3,Child Welfare,SDG1
4,Conditional Cash Transfer,SDG1
...,...,...
2953,Legal Remedies,SDG16
2954,(Policies AND Sustainable Development),SDG16
2955,Responsive Institutions,SDG16
2956,Transparent Institutions,SDG16


In [62]:
df_USC = pd.read_csv('../data/givenData/USC-Compiled-Keywords-for-SDG-Mapping_Final_17-05-10.xlsx - Compiled SDG Keywords.csv')
df_USC = df_USC.rename(columns={'SDG3': 'SDG 3'})
for col_name in df_USC.columns:
    col = df_USC[col_name].tolist()
    for phrase in col:
        new_row = [phrase, col_name]
        df_keywords.loc[len(df_keywords)] = new_row


        
df_keywords.dropna(inplace=True, ignore_index=True)
df_keywords


Unnamed: 0,SDG Keywords,SDG
0,Child Labor Laws,SDG1
1,Child Labour,SDG1
2,Child Support Grant,SDG1
3,Child Welfare,SDG1
4,Conditional Cash Transfer,SDG1
...,...,...
3868,Technology for sustainable development,Misc
3869,Tele-working,Misc
3870,Transboundary cooperation,Misc
3871,Water sensitive revitalisation,Misc


In [63]:
df_unique = df_keywords.drop_duplicates(subset='SDG Keywords', keep='first', ignore_index=True)
df_unique


Unnamed: 0,SDG Keywords,SDG
0,Child Labor Laws,SDG1
1,Child Labour,SDG1
2,Child Support Grant,SDG1
3,Child Welfare,SDG1
4,Conditional Cash Transfer,SDG1
...,...,...
3460,Technology for sustainable development,Misc
3461,Tele-working,Misc
3462,Transboundary cooperation,Misc
3463,Water sensitive revitalisation,Misc


In [64]:
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    stop_words = set(stopwords.words('english'))
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize and remove stop words
    tokens = [word for word in text.split() if word.lower() not in stop_words]
    # Join tokens back into a single string
    cleaned_text = ' '.join(tokens)
    return cleaned_text

df_unique['SDG Keywords'] = df_unique['SDG Keywords'].apply(clean_text)
df_unique['SDG'] = df_unique['SDG'].str.replace(' ', '')
df_unique


[nltk_data] Downloading package punkt to /home/safeduck/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/safeduck/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unique['SDG Keywords'] = df_unique['SDG Keywords'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unique['SDG'] = df_unique['SDG'].str.replace(' ', '')


Unnamed: 0,SDG Keywords,SDG
0,Child Labor Laws,SDG1
1,Child Labour,SDG1
2,Child Support Grant,SDG1
3,Child Welfare,SDG1
4,Conditional Cash Transfer,SDG1
...,...,...
3460,Technology sustainable development,Misc
3461,Teleworking,Misc
3462,Transboundary cooperation,Misc
3463,Water sensitive revitalisation,Misc


In [65]:
df_unique.to_csv('keyword2.csv', index=False)


In [31]:
sdg_defs = {
    'SDG 1' : 'End poverty in all its forms everywhere',
    'SDG 2' : 'End hunger, achieve food security and improved nutrition, and promote sustainable agriculture',
    'SDG 3' : 'Ensure healthy lives and promote well-being for all at all ages',
    'SDG 4' : 'Ensure inclusive and equitable quality education and promote lifelong learning opportunities for all',
    'SDG 5' : 'Achieve gender equality and empower all women and girls',
    'SDG 6' : 'Ensure availability and sustainable management of water and sanitation for all',
    'SDG 7' : 'Ensure access to affordable, reliable, sustainable and modern energy for all',
    'SDG 8' : 'Promote sustained, inclusive and sustainable economic growth, full and productive employment and decent work for all',
    'SDG 9' : 'Build resilient infrastructure, promote inclusive and sustainable industrialization, and foster innovation',
    'SDG 10' : 'Reduce inequality within and among countries',
    'SDG 11' : 'Make cities and human settlements inclusive, safe, resilient, and sustainable',
    'SDG 12' : 'Ensure sustainable consumption and production patterns',
    'SDG 13' : 'Take urgent action to combat climate change and its impacts by regulating emissions and promoting developments in renewable energy',
    'SDG 14' : 'Conserve and sustainably use the oceans, seas and marine resources for sustainable development',
    'SDG 15' : 'Protect, restore and promote sustainable use of terrestrial ecosystems, sustainably manage forests, combat desertification, and halt and reverse land degradation and halt biodiversity loss',
    'SDG 16' : 'Promote peaceful and inclusive societies for sustainable development, provide access to justice for all and build effective, accountable and inclusive institutions at all levels',
    'SDG 17' : 'Strengthen the means of implementation and revitalize the global partnership for sustainable development'
}

In [1]:
import numpy as np

In [2]:
y_true = np.random.randint(2, size=(30, 5))
y_true

array([[1, 0, 1, 1, 1],
       [1, 1, 0, 0, 1],
       [0, 1, 1, 0, 0],
       [0, 0, 0, 1, 1],
       [1, 1, 0, 0, 1],
       [1, 1, 1, 0, 0],
       [0, 1, 0, 1, 0],
       [1, 1, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 1, 1, 0, 0],
       [1, 0, 1, 0, 1],
       [0, 0, 1, 0, 1],
       [0, 1, 1, 1, 1],
       [1, 1, 0, 0, 1],
       [1, 1, 0, 1, 1],
       [0, 0, 1, 1, 0],
       [1, 0, 1, 1, 1],
       [1, 0, 0, 1, 0],
       [0, 1, 1, 1, 1],
       [0, 0, 0, 0, 0],
       [1, 1, 0, 1, 1],
       [1, 1, 1, 1, 1],
       [1, 0, 1, 0, 1],
       [1, 0, 0, 1, 0],
       [0, 1, 1, 1, 0],
       [0, 0, 0, 0, 0],
       [1, 1, 1, 0, 0],
       [1, 0, 0, 1, 1]])