### Text retriever

In [1]:
from bs4 import BeautifulSoup
import requests
import json

def fillURLs():
    with open('urlsList.json') as f:
        return json.load(f)

def file(url):
    #print("is a file", url)
    response = requests.get(str(url))
    text = response.text
    #print(text)
    return text
    
def site(url):
    #print("is a site", url)
    response = requests.get(str(url))
    html = response.text
    soup = BeautifulSoup(html, features="html.parser")
    text = soup.get_text()
    #print(text)
    return text

options = {
    "F": file,
    "S": site
}

def retrieve():
    elements = fillURLs()
    texts = []
    for element in elements:
        if( (element["type"] == "F" or element["type"] == "S") and  element["url"] != ""):
            texts.append(options[element["type"]](element["url"]))
        else:
            print("Error during URL list analysis")
    return texts

### Generating data set

In [2]:
## importing libraries
import stanza
#from stanza.server import CoreNLPClient
import os, nltk, re, random, time
from nltk.parse import CoreNLPDependencyParser
from nltk.corpus import wordnet as wn

In [4]:
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma,depparse,constituency')

with open('./data/options/blacklist.json') as f:
    blObj = json.load(f)
    blacklistedVerbs = blObj["blacklistedVerbs"]
    blacklistedNouns = blObj["blacklistedNouns"]
    blacklistedCouples = blObj["blacklistedCouples"]

def orderTuples(allPairs):
    tmp = []
    for pairs in allPairs:
        tmp = tmp + pairs
    tmp.sort(key=lambda x:x[1])
    tmp.sort(key=lambda x:x[0])
    return tmp

def writePairsForSDG(sdg, pairs):
    stringnum = ""
    if(sdg < 10):
        stringnum = "0"
    stringnum = stringnum + str(sdg) 
    with open('./data/dataset/'+stringnum+'pairs.txt','w') as f:
        for tup in pairs:
            text = str(str(tup[0])+" "+str(tup[1])+" "+str(tup[2])+"\n")
            f.write(text)
        

def generateDatasetFor(sdgNum, texts):
    allPairs = []
    for text in texts:
        pairs = vrbobj_pairs(text)
        allPairs.append(pairs)
    return allPairs


def vrbobj_pairs(text):
    doc = nlp(text)
    allPairs = []
    for sentence in doc.sentences:
        pairs = extrapolatePairs(sentence.words)
        #print(sentence.text)
        allPairs = allPairs + pairs
    return allPairs

def extrapolatePairs(words):
    pairs = []
    nouns = getNouns(words)
    for noun in nouns:
        verb = goBackToVerb(noun, words)
        if(verb != -1 and validate(verb.lemma, noun.lemma)):
            pairs.append((verb.lemma, noun.lemma,getWeightFor(verb.lemma,noun.lemma)))
    return pairs

def goBackToVerb(word, words):
    while word.deprel != "root":
        word = words[word.head-1]
        #This is an extra filter, verify if necessary
        if(word.upos == "NOUN"):
            return -1
        if(word.upos == "VERB"):
            return word;
    return -1

def getNouns(words):
    toReturn = []
    for word in words:
        if(word.upos == "NOUN"):
            toReturn.append(word)
    return toReturn

def validate(verb,noun):
    if(verb in blacklistedVerbs):
        return 0
    if(noun in blacklistedNouns):
        return 0
    for couple in blacklistedCouples:
        if(couple["verb"] == verb and couple["noun"] == noun):
            return 0
    return 1

def getWeightFor(verb,noun):
    return 1

2022-03-06 20:14:36 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| lemma        | combined |
| depparse     | combined |
| constituency | wsj      |

2022-03-06 20:14:36 INFO: Use device: cpu
2022-03-06 20:14:36 INFO: Loading: tokenize
2022-03-06 20:14:36 INFO: Loading: pos
2022-03-06 20:14:37 INFO: Loading: lemma
2022-03-06 20:14:37 INFO: Loading: depparse
2022-03-06 20:14:37 INFO: Loading: constituency
2022-03-06 20:14:38 INFO: Done loading processors!


### Classifier

In [5]:
## importing libraries
import os, nltk, re, random, time
from nltk.parse import CoreNLPDependencyParser
from nltk.corpus import wordnet as wn

In [19]:
# loading goals and targets
# goal regex: Goal ([0-9]+): ([a-zA-Z0-9-,.:! ]+) /// g1 = goal number /// g2 = goal text
# target regex: [0-9]+.[0-9]+: ([a-zA-Z0-9-,.:! ]+) /// g1 = target text
sdgir = dict() # SDG info raw list
classifier = {} # dictionary of classifiers goal(key)->classifier(entry)
tpairs = dict() # the storage of verb-object pairs for targets 
tdict = {} # the storage of verb-object pairs for sentences in text

def initialize():
    preload()
    init_classifiers()
    #printGeneratedCouples()
    print("\n INITIALIZATION COMPLETED \n")
            
def preload():
    for entry in os.listdir('./data/sdgs'):
        file = open('./data/sdgs/' + entry)
        line = file.readline()
        gm = re.match(r'Goal ([0-9]+): ([^\n]+)', line)
        goal = int(gm.group(1))
        sdgir[goal] = gm.group(2)
    for entry in os.listdir('./data/dataset'):
        file = open('./data/dataset/' + entry)
        goal = int(entry[0:2])
        line = file.readline()
        tpairs[goal] = []
        while line:
            tpairs[goal].append((line.split()[0], line.split()[1]))
            line = file.readline()
        file.close()

# creating feature extractor based on verb-object pair overlap
def feature_extractor(goal, text):
    features = {} # features
    fc = 0
    pairs = []
    if text in tdict.keys():
        pairs = tdict[text]
    else:
        tdict[text] = pairs = vrbobj_pairs(text)
    for target in tpairs[goal]:
        features['contains(%s)' % str(target)] = False
        for p in pairs:
            vflag, oflag = False, False
            for ss in wn.synsets(target[0]):
                if p[0] in ss.lemma_names():
                    vflag = True
                    break
            if not vflag:
                continue
            for ss in wn.synsets(target[1]):
                if p[1] in ss.lemma_names():
                    oflag = True
                    break
            if vflag and oflag:
                features['contains(%s)' % str(target)] = True
                break
    return features

In [7]:
# defining and training classifier
def init_classifiers():
    # defining classifier
    labeled_sent = [("We want to " + target.lower(), goal) for goal in sdgir.keys() for target in sdgir[goal][1]]
    random.shuffle(labeled_sent)
    tdict.clear()
    print("generating feature sets...")
    for goal in sdgir.keys():
        featuresets = [(feature_extractor(goal, e), g == goal) for (e, g) in labeled_sent]
        print('Feature sets generated for goal {}'.format(goal))
        train_set = featuresets
        classifier[goal] = nltk.NaiveBayesClassifier.train(train_set)

In [28]:
def check_sdg(text):   
    tdict.clear() 
    for goal in sdgir.keys():
        ans = classifier[goal].classify(feature_extractor(goal, text))
        if ans:
            print("{}: {}".format(goal, sdgir[goal]))

### Code testing

In [29]:
texts = retrieve()
initialize()

generating feature sets...
Feature sets generated for goal 2
Feature sets generated for goal 15
Feature sets generated for goal 4
Feature sets generated for goal 9
Feature sets generated for goal 14
Feature sets generated for goal 16
Feature sets generated for goal 3
Feature sets generated for goal 6
Feature sets generated for goal 11
Feature sets generated for goal 7
Feature sets generated for goal 12
Feature sets generated for goal 1
Feature sets generated for goal 5
Feature sets generated for goal 17
Feature sets generated for goal 13
Feature sets generated for goal 10
Feature sets generated for goal 8

 INITIALIZATION COMPLETED 



In [30]:
for text in texts:
    check_sdg(text)
    print("\n SINGLE TASK COMPLETED \n ")
print("\n THE ANALYZES HAVE BEEN COMPLETED \n")

2: End hunger, achieve food security and improved nutrition and promote sustainable agriculture
15: Protect, restore and promote sustainable use of terrestrial ecosystems, sustainably manage forests, combat desertification, and halt and reverse land degradation and halt biodiversity loss
4: Ensure inclusive and equitable quality education and promote lifelong learning opportunities for all
9: Build resilient infrastructure, promote inclusive and sustainable industrialization and foster innovation
14: Conserve and sustainably use the oceans, seas and marine resources for sustainable development 14.1
16: Promote peaceful and inclusive societies for sustainable development, provide access to justice for all and build effective, accountable and inclusive institutions at all levels
3: Ensure healthy lives and promote well-being for all at all ages
6: Ensure availability and sustainable management of water and sanitation for all
7: Ensure access to affordable, reliable, sustainable and moder