In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as cos_sim
import sklearn
import inspect
%run Summary_Processes/Generic_Summarizer.ipynb
from math import log
import numpy as np
import seaborn
from collections import defaultdict
class Embeddings_process :
    """Sentence summarizer front end built on per-word embedding vectors.

    The constructor loads a word list and a matching embedding matrix (one
    row per word), optionally rescales the rows by an IDF factor, and
    prepares a linear map (``transit``) derived from eigen-vectors and
    eigen-values stored on disk.  ``represent`` turns sentences into vectors
    and ``summarize`` builds a position-weighted cosine-similarity matrix
    that is handed to ``generic_summarizer``.

    Attributes set by the constructor:
        a, b, weighted, exponentiation, method, lsanbcompfun, diag, bias,
        factor: the corresponding constructor arguments, stored verbatim.
        __name__: printable identifier that encodes the configuration.
        transit: matrix applied to sentence vectors, or ``None`` when
            ``direct_file`` was used (no eigen data is read in that case).
        dim: number of rows of the (optionally transposed) eigen-vector
            matrix, or ``None`` when ``direct_file`` was used.
        idf: column vector ``log(nb_doc / count)`` aligned with the word
            list, or ``None`` when ``direct_file`` was used.
        maxidf: largest IDF value; scales vectors of unknown words.  Falls
            back to ``1.0`` when ``direct_file`` was used.
        embed: mapping word -> embedding row.
        voc: set of known words.
        D: length of one embedding row.
    """

    def __init__(self, path_to_embeddings, path_to_words, counts, nb_docs, eigen_folder, direct_file = None,
                 factor = -0.25, transpose1 = False, transpose2 = False, a = 1.0, b = 0.1, weighted = True,
                 exponentiation = 0, method = "tr", lsanbcompfun = None, tag = None, diag = "none", bias = 0) :
        """Load the vocabulary, the embeddings and the projection data.

        Every file name is appended to ``eigen_folder`` by plain string
        concatenation, so ``eigen_folder`` must already end with a path
        separator (or be a deliberate file-name prefix).

        Args:
            path_to_embeddings: ``.npy`` file holding the raw embedding
                matrix (one row per word, same order as the word list).
            path_to_words: text file with one word per line.
            counts: text file with ``word,count`` lines.
            nb_docs: text file whose first line is the number of documents.
            eigen_folder: prefix prepended to every file name; the files
                ``eigen-vectors.csv`` and ``eigen-values.csv`` are also read
                from it.
            direct_file: optional ``.npy`` file used as the final embedding
                matrix.  When given, the eigen files, ``counts``, ``nb_docs``
                and ``path_to_embeddings`` are not read.
            factor: exponent applied to the eigen-values before they are
                placed on the diagonal that multiplies the eigen-vectors.
            transpose1: transpose the eigen-vector matrix before use.
            transpose2: transpose the resulting ``transit`` matrix.
            a: base of the position factor for sentence pairs (i, j), j > i.
            b: base of the position factor for sentence pairs (i, j), j < i.
            weighted, lsanbcompfun, diag, bias: forwarded unchanged to
                ``generic_summarizer`` by ``summarize``.
            exponentiation: number of ``expm1`` + rescale passes applied to
                the similarity matrix in ``summarize``.
            method: ``"tr"`` is labelled "TextRank" in ``__name__``, any
                other value "LSA"; forwarded to ``generic_summarizer``.
            tag: optional suffix appended to ``__name__``.
        """
        self.a = a
        self.b = b
        self.weighted = weighted
        self.exponentiation = exponentiation
        self.method = method
        self.lsanbcompfun = lsanbcompfun
        self.diag = diag
        self.bias = bias
        self.factor = factor

        methodstr = "TextRank" if method == "tr" else "LSA"
        name_parts = [
            str(factor), str(transpose1), str(transpose2), str(a), str(b),
            "weighted" if weighted else "unweighted",
            "exp" + str(exponentiation),
            self._lsa_function_label(lsanbcompfun),
            diag,
            str(bias),
        ]
        self.__name__ = (methodstr + "_Embeddings(" + ",".join(name_parts) + ")"
                         + (("-" + tag) if tag is not None else ""))

        # Blank lines are kept so that row i of the matrix still maps to word i.
        wordlist = self._read_lines(eigen_folder+path_to_words)
        print("Words done")

        if direct_file :
            emb = np.load(eigen_folder+direct_file)
            # No eigen/IDF data is read in this mode.  Define the attributes
            # anyway so represent()/summarize() do not fail with
            # AttributeError.
            # NOTE(review): assumes a direct file needs no projection and that
            # a unit scale suits unknown words - confirm with callers.
            self.transit = None
            self.dim = None
            self.idf = None
            self.maxidf = 1.0
        else :
            matrix = np.genfromtxt(eigen_folder+"eigen-vectors.csv", delimiter = ",")
            values = np.genfromtxt(eigen_folder+"eigen-values.csv", delimiter = ",")
            values = np.diag(values**(self.factor))
            if transpose1:
                matrix = matrix.T
            self.transit = np.matmul(values, matrix)
            if transpose2:
                self.transit = self.transit.T
            self.dim = matrix.shape[0]
            print("Transit done")

            with open(eigen_folder+nb_docs) as file :
                nb_doc = int(file.readline())
            print("NB done")

            ct = {}
            for line in self._read_lines(eigen_folder+counts) :
                if not line :
                    continue  # tolerate blank lines (e.g. at end of file)
                data_line = line.split(",")
                # One-element list => the IDF array becomes a column vector
                # that broadcasts over the embedding columns below.
                ct[data_line[0]] = [int(data_line[1])]
            ct_order = [ct[word] for word in wordlist]
            print("Counts done")

            emb = np.load(eigen_folder+path_to_embeddings)
            print("Embeddings loaded")
            self.idf = np.log(nb_doc/np.array(ct_order))
            emb *= self.idf
            self.maxidf = np.max(self.idf)
            print("Embeddings transformed")

        self.embed = {word: emb[i] for i, word in enumerate(wordlist)}
        print("Embeddings done")

        self.voc = set(self.embed)
        self.D = len(emb[0])
        # TF-IDF of sentence(1,V) * embeddings(V,D) * eigen(D, ?)

    @staticmethod
    def _read_lines(path) :
        """Return the lines of *path* without their trailing newline."""
        with open(path) as file :
            # rstrip instead of line[:-1]: a last line without a newline must
            # not lose its final character.
            return [line.rstrip("\n") for line in file]

    @staticmethod
    def _lsa_function_label(lsanbcompfun) :
        """Return the source text passed as ``lsanbcompfun=...``, or ``"-"``."""
        import re  # local import: `re` is not in the file's visible imports

        if not inspect.isroutine(lsanbcompfun):
            return "-"
        try:
            source = inspect.getsource(lsanbcompfun).replace("\n","")
        except (OSError, TypeError):
            # Builtins or interactively defined callables have no source.
            return "-"
        if "lsanbcompfun" not in source:
            return "-"
        return re.sub(r'^.*lsanbcompfun *= *([^,\)]+)[,\)]+.*$',r'\1',source)

    def preprocess(self, corpus, docs_bias=None):
        """Do nothing; this summarizer needs no preprocessing step."""
        pass

    def summarize(self,corpus, docs_bias=None) :
        """Build the sentence-similarity matrix and run ``generic_summarizer``.

        Args:
            corpus: sequence of sentences (whitespace-tokenizable strings).
            docs_bias: accepted but not used by this method.

        Returns:
            Whatever ``generic_summarizer`` returns for the weighted matrix.
        """
        X = self.represent(corpus)
        idx = np.arange(X.shape[0])
        # dist[i, j] = j - i : signed distance between sentence positions.
        dist = idx[np.newaxis, :] - idx[:, np.newaxis]
        ahead = np.maximum(dist, 0)
        behind = np.maximum(-dist, 0)
        # a**(j-i) for j > i, b**(i-j) for j < i, and 1 on the diagonal (the
        # "- 1" cancels the x**0 term contributed by the other side).
        assymfactor = np.power(self.a, ahead) + np.power(self.b, behind) - 1

        sim_base = cos_sim(X,X)
        for _ in range(self.exponentiation):
            sim_base = np.expm1(sim_base)
            peak = np.max(sim_base)
            if peak != 0:
                # Skipping the rescale when the peak is 0 avoids a NaN/inf
                # matrix (e.g. when every sentence vector is zero).
                sim_base = sim_base / peak
        matrix = np.multiply(assymfactor,sim_base)

        return generic_summarizer(self.method, matrix, corpus, self.weighted, self.lsanbcompfun, diag =  self.diag, bias = self.bias)

    def represent(self,corpus, apply_ortho = True) :
        """Return one vector per sentence of *corpus* as a 2-D array.

        A sentence vector is the sum over its distinct words of
        ``vector(word) * count(word) ** 0.75``.  Known words use their stored
        embedding.  An unknown word uses a unit-norm pseudo-random vector
        scaled by ``maxidf``; the vector depends only on the word's spelling,
        so it is the same in every call and every run.

        Args:
            corpus: iterable of sentences; each is split on whitespace.
            apply_ortho: when true and a ``transit`` matrix is available,
                multiply every sentence vector by it.

        Returns:
            ``numpy.ndarray`` with one row per sentence.
        """
        import zlib  # local import: stable hash for unknown-word seeds

        X = []
        for sen in corpus :
            v = np.zeros(self.D)
            wc = defaultdict(int)
            for word in sen.split() :
                wc[word]+=1.0
            for word, count in wc.items() :
                weight = count**0.75
                if word in self.voc :
                    v += self.embed[word] * weight
                else:
                    # crc32 rather than hash(): str hashes are salted per
                    # process, which made results differ between runs.  A
                    # private RandomState leaves the global RNG untouched.
                    seed = zlib.crc32(word.encode("utf-8", "surrogatepass"))
                    # Length self.D so the vector can be added to `v`.
                    tmpv = np.random.RandomState(seed).randn(self.D)
                    tmpv = tmpv / np.linalg.norm(tmpv)
                    v += self.maxidf * tmpv * weight
            if apply_ortho and self.transit is not None:
                v = np.matmul(self.transit,v)
            X.append(v)
        X = np.array(X)

        return(X)