# Supervised Labeling

In [None]:
import pymongo
import random
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from collections import Counter
%run Unsupervised.ipynb
%run Analyzers.ipynb

## Get Data 

In [None]:
# Id of the document to process; matched against the 'doc' field of the dataset.
DOC = 0
# Arguments forwarded to Unsupervised(); their meaning is defined by that routine.
n, t = 13, 1

In [None]:
# Connect to the MongoDB server at 127.0.0.1, port 27017.
client = pymongo.MongoClient("127.0.0.1:27017")

db = client['IRsegmentationDB3']
# Collection queried by 'doc' id; the record read below exposes 'annotations' and 'text'.
docsDataset = db['dataset']

# relDataset contains the relatedness score for each sentence combination in each doc
relDataset = db['relatedness']

# Load the record of the document selected by DOC.
# NOTE(review): find_one returns None when no record matches, in which case the
# two lookups below raise TypeError -- confirm DOC always exists in the dataset.
item = docsDataset.find_one({'doc': DOC})
annot = item['annotations']
doc = item['text']

## Supervised Labeling

In [None]:
def SupervisedLabeling(SG, S, DOC):
    """Label the segments in SG and merge neighbours that share a label.

    Args:
        SG: iterable of segments, each an iterable of sentence ids; every id
            is converted with ``int`` and used as a position in S.
        S: sequence of sentences.
        DOC: document id, passed through to the feature extractor and to the
            analyzers.

    Returns:
        A tuple ``(labeling, S, SG)``. ``labeling`` is a list of the result
        dicts ordered by ``int(entry['index'])``; runs of consecutive entries
        with the same ``'label'`` are collapsed into the first entry of the
        run, whose ``'index'`` becomes the list of all indices in the run.

    Note:
        The dicts returned by the analyzers are modified in place (their
        ``'index'`` value is replaced by a list).
    """
    # Rebuild each segment as the list of its sentences.
    segments = [[S[int(snt)] for snt in sg] for sg in SG]

    fx = FeatureExtraction(segments, DOC)
    fv = fx.get_feature_vector()

    # Functional part analyzer: introduction, background and footnotes.
    fpa = functionalPartAnalyzer(segments, fv, DOC)
    fpa.introduction()
    fpa.background()
    fpa.footnotes()
    results = fpa.getClassification()

    # Conclusion recognizer, run on every entry of the filtered feature
    # vector; the segment id is the text after "=" in the entry's element 1.
    pending = [entry[1].split("=")[1] for entry in fpa.getFilteredFeatureVector()]
    for seg_id in pending:
        sentence_fv = fx.get_feature_vector_for_sentence(seg_id)
        recognizer = conclusionRecognizer(segments[int(seg_id)], seg_id, sentence_fv, DOC)
        results.append(recognizer.recognizer())

    ordered = sorted(results, key=lambda entry: int(entry['index']))

    # Collapse consecutive entries carrying the same label into one entry.
    labeling = []
    for entry in ordered:
        if labeling and entry['label'] == labeling[-1]['label']:
            labeling[-1]['index'].append(entry['index'])
            continue
        entry['index'] = [entry['index']]
        labeling.append(entry)

    return labeling, S, SG

In [None]:
# Run the unsupervised step for the selected document; the returned pair
# (SG, S) is what SupervisedLabeling takes as its first two arguments.
SG, S = Unsupervised(DOC, n, t)

In [None]:
# r is the tuple (labeling, S, SG) returned by SupervisedLabeling.
r = SupervisedLabeling(SG, S, DOC)

# Evaluation 

### Unsupervised Evaluation

In [None]:
def isInTheSameSegment(ns1, ns2, SG):
    """Return True when at least one segment of SG contains both ns1 and ns2.

    Membership is tested with ``in`` on each segment; an empty SG yields False.
    """
    return any(ns1 in segment and ns2 in segment for segment in SG)

def isInTheSameGoldSegment(d, s1, s2, S):
    """Return True when s1 and s2 both occur in one gold paragraph of doc d.

    The document text is read from ``docsDataset`` and split on "¶"; each
    paragraph has its newlines removed and is stripped before the two
    sentences are searched in it as substrings.

    Args:
        d: document id used for the 'doc' lookup.
        s1: first sentence text.
        s2: second sentence text.
        S: not used by this function.
    """
    record = docsDataset.find_one({'doc': d})
    paragraphs = (
        chunk.replace("\n", "").strip() for chunk in record['text'].split("¶")
    )
    return any(s1 in paragraph and s2 in paragraph for paragraph in paragraphs)

def unsupervisedEvaluation(d, k, SG, S, trials=100):
    """Count how often predicted and gold segmentation agree on a sentence pair.

    Each trial draws a random position ``p`` in ``[0, len(S) - k)`` and checks
    whether sentences ``p`` and ``p + k`` are placed together by SG
    (``isInTheSameSegment``) and by the gold paragraphs
    (``isInTheSameGoldSegment``). A trial counts when both answers match.

    Args:
        d: document id.
        k: offset between the two sampled sentence positions.
        SG: predicted segments (iterables of sentence positions).
        S: sequence of sentences.
        trials: number of random pairs to draw. The summary cells of this
            notebook assume the default of 100.

    Returns:
        The number of trials (0..trials) in which the two answers agree.

    Raises:
        ValueError: if S does not hold more than k sentences, so no pair
            can be sampled.
    """
    upper = len(S) - k
    if upper <= 0:
        # randrange would raise an opaque "empty range" error; say why instead.
        raise ValueError(
            "need more than k={} sentences to sample a pair, got {}".format(k, len(S))
        )

    counter = 0
    for _ in range(trials):
        first = random.randrange(0, upper)
        second = first + k

        predicted_same = isInTheSameSegment(first, second, SG)
        gold_same = isInTheSameGoldSegment(d, S[first], S[second], S)

        if predicted_same == gold_same:
            counter += 1

    return counter

### Supervised Evaluation

In [None]:
def supervisedEvaluation(d, r, k, trials=100):
    """Run ``haveSameLabel`` repeatedly and collect the outcomes.

    Args:
        d: document id.
        r: the tuple returned by SupervisedLabeling.
        k: offset forwarded to ``haveSameLabel``.
        trials: number of random samples to draw. The summary cells of this
            notebook assume the default of 100.

    Returns:
        A list of ``trials`` values, one per call to ``haveSameLabel``.
    """
    return [haveSameLabel(d, r, k) for _ in range(trials)]

def haveSameLabel(d, r, k):
    """Check on one random sentence whether predicted and gold labels agree.

    A position is drawn from ``[0, len(S) - k)`` where ``S = r[1]``. Its
    predicted label comes from ``getLabel``; its gold label is the 'type' of
    the annotation paired (by position) with the paragraph that contains the
    sentence. On both sides 'Analysis' is treated as 'Conclusions'.

    Args:
        d: document id used for the 'doc' lookup in ``docsDataset``.
        r: the tuple ``(labeling, S, SG)`` returned by SupervisedLabeling.
        k: only narrows the sampling range; the sentence at offset k is not
            compared.

    Returns:
        1 when both labels are found and equal, otherwise 0.

    Raises:
        ValueError: from ``random.randrange`` when ``len(S) <= k``.
    """
    # One query serves both the text and the annotations.
    record = docsDataset.find_one({'doc': d})
    paragraphs = record['text'].split("¶")
    annots = record['annotations']

    S = r[1]
    position = random.randrange(0, len(S) - k)
    sentence = S[position]

    # Predicted label; "NF" means the sentence is in no labelled segment.
    pLabel1 = getLabel(position, r)
    if pLabel1 == "NF":
        return 0

    # Gold label. No early exit: when the sentence occurs in several
    # paragraphs, the last matching paragraph decides.
    rLabel1 = "NF"
    for paragraph, annotation in zip(paragraphs, annots):
        if sentence in paragraph.replace("\n", "").strip():
            rLabel1 = annotation['type']

    if rLabel1 == "NF":
        return 0
    if rLabel1 == 'Analysis':
        rLabel1 = 'Conclusions'

    return 1 if pLabel1 == rLabel1 else 0
    
def getLabel(ns, r):
    """Return the predicted label of the sentence at position ns.

    Args:
        ns: sentence position, looked up with ``in`` in each segment.
        r: tuple ``(labeling, S, SG)``; ``r[2]`` holds the segments and
            ``r[0]`` the merged labelling, a list of dicts with an 'index'
            list of segment positions and a 'label'.

    Returns:
        The label of the merged entry covering the segment that contains ns,
        with 'Analysis' reported as "Conclusions"; "NF" when no entry covers
        any segment containing ns.
    """
    labeling, segments = r[0], r[2]

    for seg_pos, segment in enumerate(segments):
        # Only segments that hold the sentence are of interest.
        if ns not in segment:
            continue

        # Find the merged entry that covers this segment.
        for group in labeling:
            # Entries of 'index' may be strings (SupervisedLabeling parses them
            # from text and sorts them through int()), so compare as integers.
            if any(int(value) == seg_pos for value in group['index']):
                label = group['label']
                return "Conclusions" if label == 'Analysis' else label

    return "NF"

# Running the Evaluation

In [None]:
# Per-document results: UE holds (doc, agreement count) pairs, SU holds
# (doc, list of 0/1 outcomes) pairs.
UE = []
SU = []

# Unique document ids in ascending order. Deduplicate first and sort last:
# sorting before building a set would throw the ordering away.
DOCS = sorted({rec['doc'] for rec in relDataset.find()})
print(DOCS)

for d in DOCS:
    annots = docsDataset.find_one({'doc': d})['annotations']

    # Unsupervised segmentation of the document.
    SG, S = Unsupervised(d, n, t)

    # k is the offset between the two sentence positions sampled by the
    # evaluation functions: the rounded ratio of predicted segments to gold
    # annotations.
    # NOTE(review): k is 0 when SG has fewer than half as many segments as
    # there are annotations, so each sentence is then compared with itself --
    # confirm this is intended.
    k = round(len(SG) / len(annots))

    UE.append((d, unsupervisedEvaluation(d, k, SG, S)))

    r = SupervisedLabeling(SG, S, d)
    SU.append((d, supervisedEvaluation(d, r, k)))

In [None]:
# Supervised accuracy: total number of successful trials over all documents,
# expressed against 100 trials per document.
acc = 0
for doc_id, outcomes in SU:
    print("DOC", doc_id)
    acc += sum(outcomes)

print("ACCURATEZZA SUPERVISED", round((acc * 100) / (len(SU) * 100), 2))

In [None]:
# Unsupervised accuracy: total number of agreeing trials over all documents,
# expressed against 100 trials per document.
acc = 0
for doc_id, agreements in UE:
    print("DOC", doc_id)
    print(agreements)
    acc += agreements

# Normalise by the number of entries in UE, the list being summed here
# (the original divided by len(SU)).
print("ACCURATEZZA UNSUPERVISED", round((acc * 100) / (len(UE) * 100), 2))