### Text retriever

In [11]:
from bs4 import BeautifulSoup
import requests
import json

def fillURLs(path='urlsList.json'):
    """Load the list of sources to retrieve from a JSON file.

    Args:
        path: Location of the JSON file. Defaults to 'urlsList.json' in the
            current working directory.

    Returns:
        The decoded JSON content. `retrieve` iterates over it and reads the
        "type" and "url" keys of every element.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default
    # encoding, which differs between operating systems.
    with open(path, encoding='utf-8') as f:
        return json.load(f)

def file(url, timeout=30):
    """Download a plain-text resource and return its body.

    Args:
        url: Address of the resource; converted with `str` before the request.
        timeout: Seconds passed to `requests.get` as its timeout. Without a
            timeout a stalled server would block the notebook indefinitely.

    Returns:
        The response body decoded as text (`response.text`).

    Raises:
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    response = requests.get(str(url), timeout=timeout)
    return response.text
    
def site(url, timeout=30):
    """Download an HTML page and return its visible text.

    Args:
        url: Address of the page; converted with `str` before the request.
        timeout: Seconds passed to `requests.get` as its timeout. Without a
            timeout a stalled server would block the notebook indefinitely.

    Returns:
        The text extracted from the page by BeautifulSoup's `get_text()`,
        using the built-in "html.parser" backend.

    Raises:
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    response = requests.get(str(url), timeout=timeout)
    soup = BeautifulSoup(response.text, features="html.parser")
    return soup.get_text()

# Dispatch table: the "type" code of a URL list entry selects the downloader
# ("F" -> raw text file, "S" -> HTML site).
options = dict(F=file, S=site)

def retrieve():
    """Download the text of every valid entry in the URL list.

    Each element returned by `fillURLs` is expected to be a mapping with a
    "type" key (one of the keys of `options`) and a non-empty "url" key.
    Entries that do not satisfy this are reported and skipped.

    Returns:
        A list with one text per valid entry, in list order.
    """
    texts = []
    for element in fillURLs():
        # .get() keeps a malformed entry (missing key) on the error path
        # instead of aborting the whole run with a KeyError.
        kind = element.get("type")
        url = element.get("url")
        # Looking the handler up in `options` keeps the accepted type codes
        # in a single place.
        handler = options.get(kind) if isinstance(kind, str) else None
        if handler is not None and url not in ("", None):
            texts.append(handler(url))
        else:
            print("Error during URL list analysis")
    return texts

### Generating data set

In [2]:
## importing libraries
import stanza
#from stanza.server import CoreNLPClient
import os, nltk, re, random, time
from nltk.parse import CoreNLPDependencyParser
from nltk.corpus import wordnet as wn

In [3]:
# Shared English Stanza pipeline used by vrbobj_pairs; built once because
# constructing it loads every listed processor model.
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,mwt,pos,lemma,depparse,constituency',
)

def vrbobj_pairs(text):
    """Extract (verb lemma, noun lemma, weight) triples from a text.

    The text is parsed with the shared `nlp` pipeline and the pairs found in
    every sentence are concatenated.

    Args:
        text: Raw text to analyse.

    Returns:
        A list of triples as produced by `extrapolatePairs`; empty when the
        text contains no sentence or no pair.
    """
    doc = nlp(text)
    pairs = []
    # Collect from all sentences: returning inside the loop would stop after
    # the first sentence and yield None for an empty document.
    for sentence in doc.sentences:
        pairs.extend(extrapolatePairs(sentence.words))
    return pairs

def extrapolatePairs(words):
    """Build the verb/noun triples of one sentence.

    For every noun in `words`, the closest verb above it in the dependency
    tree is looked up; accepted combinations are emitted as
    (verb lemma, noun lemma, weight).

    Args:
        words: The word objects of a single parsed sentence.

    Returns:
        A list of (verb lemma, noun lemma, weight) tuples.
    """
    found = []
    for noun in getNouns(words):
        governor = goBackToVerb(noun, words)
        # goBackToVerb signals "no verb above this noun" with -1.
        if governor != -1 and validate(governor.lemma, noun.lemma):
            weight = getWeightFor(governor.lemma, noun.lemma)
            found.append((governor.lemma, noun.lemma, weight))
    return found

def goBackToVerb(word, words):
    """Climb the dependency tree from `word` to its nearest verb ancestor.

    Args:
        word: Starting word; must expose `deprel`, `head` and `upos`.
        words: All words of the sentence; `head` is a 1-based index into it.

    Returns:
        The first ancestor whose `upos` is "VERB", or -1 when the root is
        reached without meeting one (the starting word itself is never
        returned).
    """
    current = word
    while True:
        if current.deprel == "root":
            # Nothing above the root: no governing verb was found.
            return -1
        current = words[current.head - 1]
        if current.upos == "VERB":
            return current

def getNouns(words):
    """Return, in original order, the words whose `upos` tag is "NOUN"."""
    return [candidate for candidate in words if candidate.upos == "NOUN"]

def validate(verb, noun):
    """Decide whether a verb/noun combination is kept.

    Placeholder: both arguments are currently ignored and every combination
    is accepted (returns 1, which callers treat as true).
    """
    return 1

def getWeightFor(verb, noun):
    """Return the weight attached to a verb/noun combination.

    Placeholder: both arguments are currently ignored and the weight is
    always 1.
    """
    return 1

2022-03-03 23:10:57 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| lemma        | combined |
| depparse     | combined |
| constituency | wsj      |

2022-03-03 23:10:57 INFO: Use device: cpu
2022-03-03 23:10:57 INFO: Loading: tokenize
2022-03-03 23:10:57 INFO: Loading: pos
2022-03-03 23:10:57 INFO: Loading: lemma
2022-03-03 23:10:57 INFO: Loading: depparse
2022-03-03 23:10:58 INFO: Loading: constituency
2022-03-03 23:10:59 INFO: Done loading processors!


### Classifier

In [41]:
## importing libraries
import os, nltk, re, random, time
from nltk.parse import CoreNLPDependencyParser
from nltk.corpus import wordnet as wn

In [60]:
# Module-level stores shared by the loading, training and classification steps.
# Data files are read by preload():
#   first line   -> "Goal <number>: <title>"
#   target lines -> "<number>.<number>: <target text>"
sdgir = {}       # raw SDG info: goal number -> (goal title, [target texts])
classifier = {}  # goal number -> trained classifier for that goal
tpairs = {}      # goal number -> verb-object pairs extracted from its targets
tdict = {}       # text -> verb-object pairs (cache filled by feature_extractor)

def initialize():
    """Load the SDG definitions, train one classifier per goal, then report.

    Runs `preload` followed by `init_classifiers` and prints a completion
    banner once both have finished.
    """
    preload()
    init_classifiers()
    print("\n INITIALIZATION COMPLETED \n")

def printGeneratedCouples():
    """Debug helper: print the verb-object pairs extracted for every target.

    Each target in `sdgir` is turned into the same "We want to ..." sentence
    used for training and the resulting pairs are printed between blank
    lines.
    """
    for goal in sdgir.keys():
        for target in sdgir[goal][1]:
            print("\n")
            # The extracted pairs must be printed explicitly; calling
            # vrbobj_pairs alone produces no output.
            print(vrbobj_pairs("We want to " + target.lower()))
            print("\n")

def preload():
    """Read every SDG description file under ./data/sdgs into `sdgir`.

    Expected file layout: the first line is "Goal <number>: <title>"; every
    later line that starts with "<number>.<number>: " contributes its
    remaining text as a target. The result is stored as
    sdgir[goal number] = (title, [target texts]).

    Directory entries that are not regular files, or whose first line is not
    a goal header, are skipped with a message instead of aborting the load.
    """
    for entry in os.listdir('./data/sdgs'):
        path = './data/sdgs/' + entry
        # Notebook folders often contain sub-directories such as
        # .ipynb_checkpoints; open() would fail on those.
        if not os.path.isfile(path):
            continue
        # `with` guarantees the handle is closed even if parsing raises.
        with open(path) as handle:
            header = handle.readline()
            gm = re.match(r'Goal ([0-9]+): ([^\n]+)', header)
            if gm is None:
                print("Skipping", path, "- first line is not a 'Goal N: title' header")
                continue
            goal = int(gm.group(1))
            targets = []
            # The line right after the header is discarded, as in the
            # original implementation (presumably a blank separator - verify
            # against the data files).
            handle.readline()
            for line in handle:
                tm = re.match(r'[0-9]+.[0-9]+: ([^\n]+)', line)
                if tm:
                    targets.append(tm.group(1))
            sdgir[goal] = (gm.group(2), targets)

def _synonym_lemmas(word):
    """Return the set of lemma names found in any WordNet synset of `word`."""
    return {name for ss in wn.synsets(word) for name in ss.lemma_names()}


# creating feature extractor based on verb-object pair overlap
def feature_extractor(goal, text):
    """Build the boolean feature dict of `text` for one goal.

    For every verb-object pair stored in `tpairs[goal]` a feature named
    'contains(<pair>)' is produced. It is True when some pair extracted from
    `text` has a verb that is a WordNet lemma of the target verb AND a noun
    that is a WordNet lemma of the target noun.

    Args:
        goal: Key into `tpairs`.
        text: Text to analyse; its pairs are cached in `tdict`.

    Returns:
        dict mapping feature name -> bool.
    """
    features = {}
    # Pairs of the text are computed once and cached across goals.
    pairs = tdict.get(text)
    if pairs is None:
        # `or []` guards against a parser result of None (no sentences).
        pairs = tdict[text] = vrbobj_pairs(text) or []
    for target in tpairs[goal]:
        key = 'contains(%s)' % str(target)
        features[key] = False
        if not pairs:
            continue
        # The WordNet lookups depend only on the target, so they are done
        # once per target instead of once per (target, pair) combination.
        verb_lemmas = _synonym_lemmas(target[0])
        noun_lemmas = None  # computed lazily: only needed after a verb match
        for p in pairs:
            if p[0] not in verb_lemmas:
                continue
            if noun_lemmas is None:
                noun_lemmas = _synonym_lemmas(target[1])
            if p[1] in noun_lemmas:
                features[key] = True
                break
    return features

In [61]:
# defining and training classifier
def init_classifiers(train_size=70):
    """Extract target pairs and train one Naive Bayes classifier per goal.

    Steps:
      1. Rebuild `tpairs`: for each goal, the verb-object pairs of all its
         targets (each phrased as "We want to <target>").
      2. Build the labelled sentences (sentence, goal), shuffle them and keep
         the first `train_size` as training data.
      3. For every goal, train a binary classifier ("does this sentence
         belong to the goal?") and store it in `classifier[goal]`.

    Args:
        train_size: Number of shuffled sentences used for training. Defaults
            to 70, the value previously hard-coded. Pass None to train on all
            sentences.
    """
    # initialization of the storage of verb-object pairs for targets
    tpairs.clear()
    for goal in sdgir.keys():
        tpairs[goal] = []
        print("generating tpairs for goal",goal,"...")
        for target in sdgir[goal][1]:
            # `or []` guards against a parser result of None.
            tpairs[goal] += (vrbobj_pairs("We want to " + target.lower()) or [])
    # defining classifier
    labeled_sent = [("We want to " + target.lower(), goal) for goal in sdgir.keys() for target in sdgir[goal][1]]
    random.shuffle(labeled_sent)
    # Only the training slice is ever used, so features are extracted for
    # that slice alone instead of for every sentence.
    train_sent = labeled_sent[:train_size]
    tdict.clear()
    print("generating feature sets...")
    for goal in sdgir.keys():
        train_set = [(feature_extractor(goal, e), g == goal) for (e, g) in train_sent]
        print('Feature sets generated for goal {}'.format(goal))
        classifier[goal] = nltk.NaiveBayesClassifier.train(train_set)

In [62]:
def check_sdg(text):
    """Print every goal whose classifier accepts `text`.

    The pair cache `tdict` is emptied first, then each goal's classifier is
    applied to the features of `text`; matching goals are printed as
    "<goal number>: <goal title>".
    """
    tdict.clear()
    for goal, info in sdgir.items():
        features = feature_extractor(goal, text)
        if classifier[goal].classify(features):
            print("{}: {}".format(goal, info[0]))

### Code testing

In [63]:
# Download the texts listed in the URL list, then load the SDG data and
# train the per-goal classifiers (preload + init_classifiers).
texts = retrieve()
initialize()

generating tpairs for goal 2 ...
generating tpairs for goal 15 ...
generating tpairs for goal 4 ...
generating tpairs for goal 9 ...
generating tpairs for goal 14 ...
generating tpairs for goal 16 ...
generating tpairs for goal 3 ...
generating tpairs for goal 6 ...
generating tpairs for goal 11 ...
generating tpairs for goal 7 ...
generating tpairs for goal 12 ...
generating tpairs for goal 1 ...
generating tpairs for goal 5 ...
generating tpairs for goal 17 ...
generating tpairs for goal 13 ...
generating tpairs for goal 10 ...
generating tpairs for goal 8 ...
generating feature sets...
Feature sets generated for goal 2
Feature sets generated for goal 15
Feature sets generated for goal 4
Feature sets generated for goal 9
Feature sets generated for goal 14
Feature sets generated for goal 16
Feature sets generated for goal 3
Feature sets generated for goal 6
Feature sets generated for goal 11
Feature sets generated for goal 7
Feature sets generated for goal 12
Feature sets generated fo

In [64]:
# Classify every retrieved text; check_sdg prints one line per goal whose
# classifier accepts the text, so a text with no match prints nothing here.
for text in texts:
    check_sdg(text)
    print("\n SINGLE TASK COMPLETED \n ")
print("\n THE ANALYZES HAVE BEEN COMPLETED \n")


 SINGLE TASK COMPLETED 
 

 SINGLE TASK COMPLETED 
 

 SINGLE TASK COMPLETED 
 

 THE ANALYZES HAVE BEEN COMPLETED 

