### Text retriever
part written by Stefano

In [1]:
from bs4 import BeautifulSoup
import requests
import json

def fillURLs():
    with open('urlsList.json') as f:
        return json.load(f)

def file(url):
    #print("is a file", url)
    response = requests.get(str(url))
    text = response.text
    return text
    #print(text)
    
def site(url):
    #print("is a site", url)
    response = requests.get(str(url))
    html = response.text
    soup = BeautifulSoup(html, features="html.parser")
    text = soup.get_text()
    return text
    #print(text)

options = {
    "F": file,
    "S": site
}

def text_retriever():
    elements = fillURLs()
    for element in elements:
        if( (element["type"] == "F" or element["type"] == "S") and  element["url"] != ""):
            return options[element["type"]](element["url"])
        else:
            print("Error during URL list analysis")

### Classifier
part written by Filip

In [18]:
## importing libraries
import os, nltk, re, random, time
from nltk.parse import CoreNLPDependencyParser
from nltk.corpus import wordnet as wn

In [3]:
# loading goals and targets
# goal regex: Goal ([0-9]+): ([a-zA-Z0-9-,.:! ]+) /// g1 = goal number /// g2 = goal text
# target regex: [0-9]+.[0-9]+: ([a-zA-Z0-9-,.:! ]+) /// g1 = target text
sdgir = dict() # SDG info raw list
for entry in os.listdir('./data/sdgs'):
    file = open('./data/sdgs/' + entry)
    line = file.readline()
    gm = re.match(r'Goal ([0-9]+): ([^\n]+)', line)
    goal = int(gm.group(1))
    sdgir[goal] = (gm.group(2), [])
    file.readline()
    while line:
        tm = re.match(r'[0-9]+.[0-9]+: ([^\n]+)', line)
        if tm:
            sdgir[goal][1].append(tm.group(1))
        line = file.readline()
    file.close()    

#### Version 1
The version using just standard parser  
*deprecated*

In [29]:
# utils for features extractor

grammar = r"""
  PP: {<IN><DT|JJ|NN.*>+}
  NP: {<DT|JJ|NN.*|CD>+}
  VP: {<RB>*<VB.*>(<TO|VB.*>|<NP|PP|,>+)*}
  """
cp = nltk.RegexpParser(grammar)

def vrbobj_pairs(text):
    sent = nltk.pos_tag(nltk.word_tokenize(text))
    tree = cp.parse(sent)
    ans = []
    for subtree in tree.subtrees():
        if subtree.label() == 'VP':
            current_vb = None
            last = ""
            for st in subtree:
                if type(st) is nltk.tree.Tree and st.label() == 'NP' and current_vb:
                    np = ""
                    for leave in st.leaves():
                        np += leave[0] + " "
                    ans.append((current_vb, np[:-1]))
                    current_vb = None
                elif type(st) is tuple and st[1].startswith('VB'):
                    current_vb = st[0]
                last = st[0]
    return ans

#### Version 2
The version using Stanford CoreNLP parser

In [30]:
def vrbobj_pairs(text):
    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
    sent = nltk.word_tokenize(text)
    parse, = dep_parser.parse(sent)
    ans = []
    for governor, dep, dependent in parse.triples():
        if dep == 'obj':
            ans.append((governor[0], dependent[0]))
    return ans

def selfreference(text):
    parser = CoreNLPParser(url='http://localhost:9000')

#### Classifier

In [34]:
# creating feature extractor based on verb-object pair overlap
tpairs = {} # the storage of verb-object pairs for targets 
for goal in sdgir.keys():
    tpairs[goal] = []
    for target in sdgir[goal][1]:
        tpairs[goal] += vrbobj_pairs("We want to " + target.lower())
        
tdict = {} # the storage of verb-object pairs for sentences in text
        
def feature_extractor(goal, text):
    features = {} # features
    fc = 0
    pairs = []
    if text in tdict.keys():
        pairs = tdict[text]
    else:
        tdict[text] = pairs = vrbobj_pairs(text)
    for target in tpairs[goal]:
        for p in pairs:
            vflag, oflag = False, False
            for ss in wn.synsets(target[0]):
                if p[0] in ss.lemma_names():
                    vflag = True
                    break
            if not vflag:
                continue
            for ss in wn.synsets(target[1]):
                if p[1] in ss.lemma_names():
                    oflag = True
                    break
            if vflag and oflag:
                fc += 1
    features['vrbobj_pair_overlap'] = fc
    return features

In [35]:
# creating and training classifier
classifier = {}
labeled_sent = [("We want to " + target.lower(), goal) for goal in sdgir.keys() for target in sdgir[goal][1]]
random.shuffle(labeled_sent)
tdict.clear()
for goal in sdgir.keys():
    featuresets = [(feature_extractor(goal, e), g == goal) for (e, g) in labeled_sent]
    print('Feature sets generated for goal {}'.format(goal))
    train_set = featuresets[:70]
    classifier[goal] = nltk.NaiveBayesClassifier.train(train_set)

Feature sets generated for goal 2
Feature sets generated for goal 15
Feature sets generated for goal 4
Feature sets generated for goal 9
Feature sets generated for goal 14
Feature sets generated for goal 16
Feature sets generated for goal 3
Feature sets generated for goal 6
Feature sets generated for goal 11
Feature sets generated for goal 7
Feature sets generated for goal 12
Feature sets generated for goal 1
Feature sets generated for goal 5
Feature sets generated for goal 17
Feature sets generated for goal 13
Feature sets generated for goal 10
Feature sets generated for goal 8


In [25]:
def check_sdg(text):
    tdict.clear()
    for goal in sdgir.keys():
        for sent in nltk.sent_tokenize(text):
            ans = classifier[goal].classify(feature_extractor(goal, sent))
            if ans:
                print("{}: {}".format(goal, sdgir[goal][0]))

# check_sdg('We want to improve education!')

### Code testing

In [36]:
def test():
    tic = time.perf_counter()
    text = text_retriever()
    toc = time.perf_counter()
    print("Text retrieved in {} seconds".format(toc - tic))
    tic = time.perf_counter()
    check_sdg(text)
    toc = time.perf_counter()
    print("Text checked in {} seconds".format(toc - tic))
    
test()

Text retrieved in 1.1769111429998702 seconds
16: Promote peaceful and inclusive societies for sustainable development, provide access to justice for all and build effective, accountable and inclusive institutions at all levels
13: Take urgent action to combat climate change and its impacts
Text checked in 10.456094774999883 seconds
