# Unsupervised Classification

In [None]:
import pymongo
from nltk.corpus import stopwords
from itertools import combinations
import nltk
from nltk import sent_tokenize, word_tokenize
import string
import numpy as np
import networkx as nx
from gensim.models import Word2Vec
from tqdm.notebook import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import PunktSentenceTokenizer
from collections import defaultdict
# Fetch the NLTK resources at import time. "punkt" backs the sentence/word
# tokenizers used below; the "stopwords" corpus is downloaded as well although
# the code visible in this file does not consult it.
nltk.download("stopwords")
nltk.download("punkt")
# Punctuation characters as a list; tokens equal to one of these are dropped
# after word tokenization.
punctuations = list(string.punctuation)

In [None]:
# Handle on the local MongoDB server and on the project database.
client = pymongo.MongoClient("127.0.0.1:27017")
db = client.get_database("IRsegmentationDB3")

# Documents dataset (one record per document).
docsDataset = db.get_collection("dataset")

# Information Content dataset (one record per word).
icDataset = db.get_collection("unsIC")

# Relatedness scores: for each document, the score of each sentence
# combination that has been computed.
relDataset = db.get_collection("relatedness")

In [None]:
class UnsupervisedClassification:
    """Unsupervised text segmentation driven by a sentence-relatedness graph.

    Expected call order: ``import_dataset`` -> ``sentence_tokenizer`` ->
    ``compute_relatedness`` -> ``filtering`` -> ``set_cliques`` ->
    ``initial_segments`` -> ``merging_adj`` -> ``merging_small_segments``.

    The class reads and writes the module-level MongoDB collections
    ``docsDataset``, ``icDataset`` and ``relDataset`` and uses the module-level
    ``punctuations`` list.
    """

    def __init__(self, n, t):
        """Create an empty classifier.

        Args:
            n: minimum number of sentences a segment must have to be left
                alone by ``merging_small_segments``.
            t: threshold mode. ``1`` selects ``avg - std``; any other value
                selects ``avg``. ``filtering`` overwrites this attribute with
                the numeric threshold it computes.
        """
        self.docs = None        # document texts
        self.annot = None       # per-document annotations
        self.wordDict = None    # words that have an Information Content record
        self.S = []             # S[d] = list of sentences of document d
        self.DOC = None         # index of the document being processed
        self.g = nx.Graph()     # nodes = sentence indices, edges = related pairs
        self.cliques = []       # maximal cliques of ``g`` (each one sorted)
        self.SG = []            # current segmentation: list of lists of indices
        self.w2v_keys = None    # Word2Vec vocabulary
        self.w2v_model = None   # loaded Word2Vec model
        self.n = n
        self.t = t

        print("\n\nUNSUPERVISED MODULE - INITIALIZATION")
        print("N: ", self.n)

    # ------------------------------------------------------------------ #
    # Small private helpers
    # ------------------------------------------------------------------ #
    def _sentence_key(self, idx):
        """Return the ``"<DOC>_<idx>"`` identifier used in ``relDataset``."""
        return str(self.DOC) + "_" + str(idx)

    @staticmethod
    def _tokens(sentence):
        """Lower-cased, stripped word tokens of *sentence* without punctuation."""
        return [
            tok.lower().strip()
            for tok in nltk.word_tokenize(sentence)
            if tok not in punctuations
        ]

    # ------------------------------------------------------------------ #
    # Loading and tokenizing
    # ------------------------------------------------------------------ #
    def import_dataset(self):
        """Load documents, annotations, the IC vocabulary and the Word2Vec model."""
        # Materialise the cursor first: a pymongo cursor is consumed by the
        # first iteration, so iterating it a second time for the annotations
        # would silently produce an empty list.
        data = list(docsDataset.find())
        # NOTE(review): "Â¶" looks like a mis-decoded pilcrow; kept as-is
        # because it may match how the stored texts are encoded - confirm.
        self.docs = [d['text'].replace("Â¶", " ") for d in data]
        self.annot = [d['annotations'] for d in data]

        # Kept as sets: they are only used for membership tests, which are
        # performed once per word (pair) in ``relatedness``.
        self.wordDict = {x['word'] for x in icDataset.find()}

        # Pre-trained Word2Vec model
        self.w2v_model = Word2Vec.load("./w2v_hyb3.model")
        self.w2v_keys = set(self.w2v_model.wv.key_to_index)

    def sentence_tokenizer(self):
        """Split every document into sentences and absorb very short ones.

        A sentence of at most 3 word tokens (other than the first sentence of
        a document) is appended to the sentence that precedes it.
        """
        self.S = [sent_tokenize(d, language='english') for d in self.docs]

        # Every document is processed, including the last one, and an empty
        # document list is handled without raising.
        for sentences in self.S:
            j = 1
            while j < len(sentences):
                if len(nltk.word_tokenize(sentences[j])) <= 3:
                    sentences[j - 1] = sentences[j - 1] + " " + sentences[j]
                    # Do not advance: the element now at ``j`` is unchecked.
                    del sentences[j]
                else:
                    j += 1

        print("Documents tokenized")

    # ------------------------------------------------------------------ #
    # Relatedness
    # ------------------------------------------------------------------ #
    def compute_relatedness(self, doc):
        """Compute and store the relatedness of adjacent sentences of *doc*.

        Sets ``self.DOC``. Nothing is computed when ``relDataset`` already
        holds at least one record for the document. Otherwise one record
        ``{"doc", "s1", "rel": [{"s2", "value"}, ...]}`` is inserted per
        sentence, containing its previous and next neighbour (when a score
        could be computed for the pair).
        """
        self.DOC = doc
        sentences = self.S[self.DOC]

        print("DOC: ", self.DOC)
        print("LENGTH SENTENCES ", len(sentences))

        if relDataset.find_one({'doc': self.DOC}):
            print("Document "+str(self.DOC)+" already in DB. No relatedness computation needed.")
            return

        tokens = [self._tokens(s) for s in sentences]
        # Scores of the pairs (i, i + 1) computed so far. The score is reused
        # for the mirrored pair (i + 1, i) instead of being read back from
        # the DB.
        forward = {}

        for i in tqdm(range(len(sentences))):
            rel_val = []

            # Only adjacent sentences are compared; previous neighbour first.
            for j in (i - 1, i + 1):
                if j < 0 or j >= len(sentences):
                    continue

                if j < i:
                    if (j, i) in forward:
                        rel_val.append(
                            {"s2": self._sentence_key(j), "value": forward[(j, i)]}
                        )
                    continue

                if not tokens[i] or not tokens[j]:
                    continue

                score = self.relatedness(tokens[i], tokens[j])
                forward[(i, j)] = score
                rel_val.append({"s2": self._sentence_key(j), "value": score})

            relDataset.insert_one(
                {"doc": self.DOC, "s1": self._sentence_key(i), "rel": rel_val}
            )

    def relatedness(self, st1, st2):
        """Return the relatedness score of two tokenized sentences.

        For every word pair whose two words are both in the Word2Vec
        vocabulary and in the IC vocabulary, the cosine similarity of their
        vectors weighted by the smaller IC is accumulated. The sum is
        normalised by each sentence length and the two values are averaged.

        Args:
            st1: tokens of the first sentence.
            st2: tokens of the second sentence.

        Returns:
            The score, ``0.0`` when a sentence is empty or no pair qualifies.
        """
        if not st1 or not st2:
            return 0.0

        ic_cache = {}

        def ic_of(word):
            # One DB lookup per distinct word instead of one per word pair.
            if word not in ic_cache:
                ic_cache[word] = icDataset.find_one({"word": word})['ic']
            return ic_cache[word]

        usable_st2 = [
            w for w in st2 if w in self.wordDict and w in self.w2v_keys
        ]

        sr = 0
        for ws1 in st1:
            # A word without an IC record contributes nothing, so it is
            # skipped before the DB is queried for its IC.
            if ws1 not in self.w2v_keys or ws1 not in self.wordDict:
                continue

            ic_ws1 = ic_of(ws1)
            vec1 = np.asarray(self.w2v_model.wv[ws1], dtype=float).reshape(1, -1)

            for ws2 in usable_st2:
                vec2 = np.asarray(self.w2v_model.wv[ws2], dtype=float).reshape(1, -1)
                cs = cosine_similarity(vec1, vec2)[0][0]
                sr += cs * min(ic_ws1, ic_of(ws2))

        # Normalise by the sentence lengths directly; this is defined even
        # when no word of ``st1`` is in the vocabulary.
        fm = sr / len(st1)
        sm = sr / len(st2)
        return (fm + sm) / 2

    def rel_ths_avg_std(self):
        """Return ``(avg, std)`` of the non-zero scores of the current document.

        Both values are rounded to 2 decimals; ``(0.0, 0.0)`` is returned when
        there is no non-zero score.
        """
        scores = [
            float(item['value'])
            for rel in relDataset.find({"doc": self.DOC})
            for item in rel['rel']
            if item['value'] != 0.0
        ]
        if not scores:
            return 0.0, 0.0

        std = round(np.std(scores), 2)
        avg = round(sum(scores) / len(scores), 2)
        return avg, std

    # ------------------------------------------------------------------ #
    # Graph construction
    # ------------------------------------------------------------------ #
    def filtering(self):
        """Build the relatedness graph, keeping only pairs above the threshold.

        One node is added per sentence of the current document. ``self.t`` is
        replaced by the numeric threshold (``avg - std`` when it was ``1``,
        ``avg`` otherwise), so a second call no longer sees the original mode.
        """
        self.g.add_nodes_from(range(len(self.S[self.DOC])))

        t_avg, t_std = self.rel_ths_avg_std()

        if self.t == 1:
            print("T = AVG - STD")
            self.t = round((t_avg - t_std), 2)
        else:
            print("T = AVG")
            self.t = round(t_avg, 2)

        for rel in relDataset.find({"doc": self.DOC}):
            first = int(rel['s1'].split("_")[1])
            for item in rel['rel']:
                val = item['value']
                # Zero scores are ignored, as in the threshold statistics.
                if val == 0.0:
                    continue
                if val > self.t:
                    second = int(item['s2'].split("_")[1])
                    self.g.add_edge(first, second, rel=val)

    def set_cliques(self):
        """Append every maximal clique of the graph, as a sorted list."""
        self.cliques.extend(sorted(clique) for clique in nx.find_cliques(self.g))

    # ------------------------------------------------------------------ #
    # Segments
    # ------------------------------------------------------------------ #
    def find_nested(self, s):
        """Return ``True`` when sentence *s* already belongs to a segment."""
        return self.find_nested_pos(s) != -1

    def find_nested_pos(self, s):
        """Return the index in ``self.SG`` of the segment holding *s*, or -1.

        An entry of ``self.SG`` that is not a list is compared to *s* directly.
        """
        for pos, sg in enumerate(self.SG):
            if isinstance(sg, list):
                if s in sg:
                    return pos
            elif sg == s:
                return pos
        return -1

    def initial_segments(self):
        """Seed ``self.SG`` from consecutive sentences that share a clique.

        For each pair ``(n, n + 1)`` of a clique: when neither sentence is in
        a segment yet, each one starts its own segment; when exactly one is,
        the other joins that segment. ``self.SG`` is sorted afterwards.
        """
        print("Computing Initial Segments ... ")
        for clique in self.cliques:
            for n1, n2 in combinations(clique, 2):
                if n2 - n1 != 1:
                    continue

                pos1 = self.find_nested_pos(n1)
                pos2 = self.find_nested_pos(n2)

                if pos1 == -1 and pos2 == -1:
                    self.SG.append([n1])
                    self.SG.append([n2])
                elif pos2 == -1:
                    self.SG[pos1].append(n2)
                elif pos1 == -1:
                    self.SG[pos2].append(n1)

        self.sorting_SG()

    def sorting_SG(self):
        """Sort every segment and order the segments by their first sentence."""
        for pos, sg in enumerate(self.SG):
            if not isinstance(sg, list):
                self.SG[pos] = [sg]
            self.SG[pos].sort()
        self.SG = sorted(self.SG, key=lambda x: int(x[0]))

    def merging_adj(self):
        """Merge neighbouring segments that are linked by a clique.

        Two neighbours are linked when some clique contains a pair
        ``(n1, n2)`` with ``n1`` in the left and ``n2`` in the right segment.
        """
        # Merging Adjacent Segments
        print("Merging Adjacent Segments ... ")

        i = 0
        while i < len(self.SG) - 1:
            left, right = self.SG[i], self.SG[i + 1]
            linked = any(
                n1 in left and n2 in right
                for clique in self.cliques
                for n1, n2 in combinations(clique, 2)
            )
            if linked:
                self.SG[i] = left + right
                del self.SG[i + 1]
                # Stay on ``i``: the merged segment must also be compared with
                # its new right neighbour, otherwise the outcome depends on
                # the order in which the cliques were enumerated.
            else:
                i += 1

    def get_relatedness_from_DB(self, s1):
        """Return ``{s2 key: value}`` for sentence *s1* of the current document.

        Missing keys default to ``0``; a sentence without a stored record
        yields an empty mapping.
        """
        res = relDataset.find_one({"doc": self.DOC, "s1": self._sentence_key(s1)})

        rr = defaultdict(int)
        if res is not None:
            for r in res['rel']:
                rr[r['s2']] = r['value']

        return rr

    def compute_sgr(self, sg1, sg2):
        """Return the average stored relatedness between two segments.

        The stored values of all pairs ``(n1, n2)`` with ``n1`` in *sg1* and
        ``n2`` in *sg2* are summed (pairs without a stored value count as 0)
        and divided by ``len(sg1) * len(sg2)``.
        """
        if not sg1 or not sg2:
            return 0.0

        total = 0.0
        for n1 in sg1:
            rel_by_key = self.get_relatedness_from_DB(n1)
            for n2 in sg2:
                # Stored keys have the form "<DOC>_<idx>" and the values are
                # floats, so look up by that key and do not truncate.
                total += float(rel_by_key.get(self._sentence_key(n2), 0))

        return total / (len(sg1) * len(sg2))

    def merging_small_segments(self):
        """Merge every segment shorter than ``self.n`` into a neighbour.

        The first segment is merged with the next one, the last segment with
        the previous one, and any other segment with the neighbour that has
        the higher ``compute_sgr`` value (the next one on a tie).
        """
        print("Merging Smal Segments ... ")

        i = 0
        while i < len(self.SG) and len(self.SG) > 1:
            if len(self.SG[i]) >= self.n:
                i += 1
                continue

            last = len(self.SG) - 1
            if i == 0:
                merge_with_previous = False
            elif i == last:
                # The final segment only has a left neighbour.
                merge_with_previous = True
            else:
                merge_with_previous = (
                    self.compute_sgr(self.SG[i - 1], self.SG[i])
                    > self.compute_sgr(self.SG[i], self.SG[i + 1])
                )

            if merge_with_previous:
                self.SG[i - 1] = self.SG[i - 1] + self.SG[i]
                del self.SG[i]
            else:
                self.SG[i] = self.SG[i] + self.SG[i + 1]
                del self.SG[i + 1]
            # ``i`` is not advanced: the segment now at ``i`` is unchecked.

        self.sorting_SG()

    # ------------------------------------------------------------------ #
    # Accessors and persistence
    # ------------------------------------------------------------------ #
    def get_SG(self):
        """Return the current segmentation."""
        return self.SG

    def get_S(self, DOC):
        """Return the sentences of document *DOC*."""
        return self.S[DOC]

    def save(self):
        """Insert ``{"doc", "t", "SG"}`` for the current document into ``resDataset``."""
        # NOTE(review): ``resDataset`` is not defined in the visible part of
        # this file; it must exist at module level before this is called.
        item = {"doc": self.DOC, "t": self.t, "SG": self.SG}
        resDataset.insert_one(item)

In [None]:
def Unsupervised(DOC, n, t):
    """Run the whole unsupervised segmentation pipeline on one document.

    Args:
        DOC: index of the document to segment.
        n: minimum size of a segment.
        t: threshold mode handed to ``UnsupervisedClassification``
            (0 -> avg, 1 -> avg - std); the classifier replaces it with the
            numeric threshold at run time.

    Returns:
        A tuple ``(segments, sentences)``: the final segmentation and the
        sentences of the document.
    """
    classifier = UnsupervisedClassification(n, t)

    # Load the data and split every document into sentences.
    classifier.import_dataset()
    classifier.sentence_tokenizer()

    # Relatedness scores for the requested document.
    classifier.compute_relatedness(DOC)

    # Keep only the graph edges above the threshold, then extract the cliques.
    classifier.filtering()
    classifier.set_cliques()

    # Initial segments.
    classifier.initial_segments()
    classifier.sorting_SG()
    print(classifier.get_SG())

    # First merge adjacent segments (if related), then the segments that are
    # too small with respect to n; the segmentation is shown after each step.
    for merge_step in (classifier.merging_adj, classifier.merging_small_segments):
        merge_step()
        print(classifier.get_SG())

    # Persisting the resulting segmentation (classifier.save()) is disabled.

    return classifier.get_SG(), classifier.get_S(DOC)

In [None]:
#Unsupervised(0, 7, 1) 