### Text retriever
part written by Stefano

In [4]:
from bs4 import BeautifulSoup
import requests

urls = []

def fillURLs():
    with open('urlsList.txt') as f:
        lines = f.readlines()
        for line in lines:
            urls.append(line)


def text_retriever():
    fillURLs()
    for url in urls:
        response = requests.get(str(url))
        html = response.text
        soup = BeautifulSoup(html, features="html.parser")
        return soup.get_text()

### Classifier
part written by Filip

In [5]:
## importing libraries
import os, nltk, re, random

In [6]:
# loading goals and targets
# goal regex: Goal ([0-9]+): ([a-zA-Z0-9-,.:! ]+) /// g1 = goal number /// g2 = goal text
# target regex: [0-9]+.[0-9]+: ([a-zA-Z0-9-,.:! ]+) /// g1 = target text
sdgir = dict() # SDG info raw list
for entry in os.listdir('./data/sdgs'):
    file = open('./data/sdgs/' + entry)
    line = file.readline()
    gm = re.match(r'Goal ([0-9]+): ([^\n]+)', line)
    goal = int(gm.group(1))
    sdgir[goal] = (gm.group(2), [])
    file.readline()
    while line:
        tm = re.match(r'[0-9]+.[0-9]+: ([^\n]+)', line)
        if tm:
            sdgir[goal][1].append(tm.group(1))
        line = file.readline()
    file.close()    

In [7]:
# utils for features extractor

grammar = r"""
  PP: {<IN><DT|JJ|NN.*>+}
  NP: {<DT|JJ|NN.*|CD>+}
  VP: {<RB>*<VB.*>(<TO|VB.*>|<NP|PP|,>+)*}
  """
cp = nltk.RegexpParser(grammar)

def vbnp_pairs(text):
    sent = nltk.pos_tag(nltk.word_tokenize(text))
    tree = cp.parse(sent)
    ans = []
    for subtree in tree.subtrees():
        if subtree.label() == 'VP':
            current_vb = None
            last = ""
            for st in subtree:
                if type(st) is nltk.tree.Tree and st.label() == 'NP' and current_vb:
                    np = ""
                    for leave in st.leaves():
                        np += leave[0] + " "
                    ans.append((current_vb, np[:-1]))
                    current_vb = None
                elif type(st) is tuple and st[1].startswith('VB'):
                    current_vb = st[0]
                last = st[0]
    return ans

In [8]:
# creating feature extractor based on vbnp pair overlap
def feature_extractor(goal, text):
    features = {} # features
    fc = 0
    pairs = vbnp_pairs(text)
    for target in sdgir[goal][1]:
        tpairs = vbnp_pairs("We want to " + target.lower())
        for tp in tpairs:
            if tp in pairs:
                fc += 1
                break
    features['vbnp_pair_overlap'] = fc
    return features

In [9]:
# creating classifier
classifier = {}
labeled_sent = [("We want to " + target.lower(), goal) for goal in sdgir.keys() for target in sdgir[goal][1]]
random.shuffle(labeled_sent)
for goal in sdgir.keys():
    featuresets = [(feature_extractor(goal, e), g == goal) for (e, g) in labeled_sent]
    print('Feature sets generated for goal {}'.format(goal))
    train_set = featuresets[:100]
    classifier[goal] = nltk.NaiveBayesClassifier.train(train_set)

Feature sets generated for goal 2
Feature sets generated for goal 15
Feature sets generated for goal 4
Feature sets generated for goal 9
Feature sets generated for goal 14
Feature sets generated for goal 16
Feature sets generated for goal 3
Feature sets generated for goal 6
Feature sets generated for goal 11
Feature sets generated for goal 7
Feature sets generated for goal 12
Feature sets generated for goal 1
Feature sets generated for goal 5
Feature sets generated for goal 17
Feature sets generated for goal 13
Feature sets generated for goal 10
Feature sets generated for goal 8


In [10]:
def check_sdg(text):
    for goal in sdgir.keys():
        ans = classifier[goal].classify(feature_extractor(goal, text))
        if ans:
            print("{}: {}".format(goal, sdgir[goal][0]))

# check_sdg('We want to eradicate extreme poverty!')

### Code testing

In [11]:
def test():
    text = text_retriever()
    check_sdg(text)
    
test()

17: Strengthen the means of implementation and revitalize the Global Partnership for Sustainable Development
13: Take urgent action to combat climate change and its impacts
