In [2]:
import numpy as np
import pandas as pd
import gc
import warnings
from collections import defaultdict, Counter
import docx
import copy
from docx import Document
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [4]:
replace_list = [',', '(', ')', '\"', '.', '?', ':', ';', '“', '”']
class DocHITS():
    def norm(self, matrix):
        return matrix / np.linalg.norm(matrix)
    
    def cal_L(self, sentences, docs, belong):
        columns = set()
        for i in range(len(sentences)):
            for rep in replace_list:
                sentences[i] = sentences[i].replace(rep, ' ')
            sentences[i] = sentences[i].lower()
            sentences[i] = sentences[i].split()
            columns |= set(sentences[i])
        
        doc_length = [0] * len(docs)
        for i, s in enumerate(sentences):
            doc_length[belong[i]] += len(s)
        
        tf_sen = np.zeros((len(sentences),len(columns)))
        tf_s = pd.DataFrame(tf_sen, columns=list(columns))
        tf_doc = np.zeros((len(docs),len(columns)))
        tf_d = pd.DataFrame(tf_doc, columns=list(columns))
        
        isf_np1 = np.ones((1, len(columns)))
        isf_np2 = np.ones((1, len(columns)))
        isf_sen = pd.DataFrame(isf_np1, columns=list(columns))
        isf_doc = pd.DataFrame(isf_np2, columns=list(columns))
        
        for i,s in enumerate(sentences):
            dic = Counter(s)
            for k,v in dic.items():
                tf_s.iloc[i][k] += v / len(s)
                tf_d.iloc[belong[i]][k] += v / doc_length[belong[i]]
                isf_sen.iloc[0][k] += 1
                isf_doc.iloc[0][k] += 1
                
        for c in columns:
            isf_sen.iloc[0][c] = np.log(len(sentences) / (isf_sen.iloc[0][c] + 1))
            isf_doc.iloc[0][c] = np.log(len(docs) / (isf_doc.iloc[0][c] + 1))
            
        for i in range(len(sentences)):
            tf_s.iloc[i] = tf_s.iloc[i].mul(isf_sen.iloc[0])
        for i in range(len(docs)):
            tf_d.iloc[i] = tf_d.iloc[i].mul(isf_doc.iloc[0])
            
        tf_sen = np.array(tf_s)
        tf_doc = np.array(tf_d)
        similar =np.dot(tf_sen, tf_doc.transpose())
        inner_sen = np.sum(np.multiply(tf_sen, tf_sen), axis=1, keepdims=True)
        inner_doc = np.sum(np.multiply(tf_doc, tf_doc), axis=1, keepdims=True)
        inner = np.sqrt(np.dot(inner_sen, inner_doc.transpose()))
        self.L = similar / inner
            
    def run(self, shape_A, shape_H, delta):
        epoch = 1
        last_A = A = np.ones((shape_A, 1))
        last_H = H = np.ones((shape_H, 1))
        while True:
            A = self.norm(np.dot(self.L, last_H))
            H = self.norm(np.dot(self.L.transpose(), last_A))
            loss_A = np.sum(last_A - A)
            loss_H = np.sum(last_H - H)
            if max(loss_A, loss_H) < delta:
                return A.transpose(), H.transpose()
            print("epoch : {}, \n    loss_A = {}\n    loss_H = {}".format(epoch, loss_A, loss_H))
            print('='*20)
            epoch += 1
            last_A = A
            last_H = H
        
    
if __name__ == '__main__':
    path = '/Users/willer/Desktop/data.docx'
    document = Document(path)
    table = document.tables
    table = table[0]
    sentences = []
    docs = [""]*5
    belong = {}
    for i,line in enumerate(table.rows[1:]):
        docs[int((line.cells[1].text)[1])-1] += line.cells[-1].text + " "
        belong[i] = int((line.cells[1].text)[1])-1
        sentences.append(line.cells[-1].text)
        
    doc_model = DocHITS()
    doc_model.cal_L(sentences, docs, belong)
    vec_A, vec_H = doc_model.run(len(sentences), len(docs), 0.0001)
    
    print("Authoritier Vector : \n{}".format(vec_A))
    print("Hub Vector : \n{}".format(vec_H))

    

epoch : 1, 
    loss_A = 7.871425985689779
    loss_H = 2.9145164810236213
epoch : 2, 
    loss_A = 0.007199111899645255
    loss_H = -0.11883170218737157
epoch : 3, 
    loss_A = -0.05118293143918745
    loss_H = 0.33746858097169685
epoch : 4, 
    loss_A = 0.2943989381362246
    loss_H = -0.2126701282555522
epoch : 5, 
    loss_A = -0.19842859905944235
    loss_H = 0.5061958861500001
epoch : 6, 
    loss_A = 0.520863015722979
    loss_H = -0.2659037639357232
epoch : 7, 
    loss_A = -0.2679565909646284
    loss_H = 0.5461394333274807
epoch : 8, 
    loss_A = 0.5639813849721385
    loss_H = -0.24478964403735898
epoch : 9, 
    loss_A = -0.24531494490660258
    loss_H = 0.4617772146857329
epoch : 10, 
    loss_A = 0.4696489219942167
    loss_H = -0.18576796257186806
epoch : 11, 
    loss_A = -0.18537798455271493
    loss_H = 0.33639640229155243
epoch : 12, 
    loss_A = 0.34006574004626333
    loss_H = -0.12754571812436727
epoch : 13, 
    loss_A = -0.12771607186427353
    loss_H = 0.2

In [1]:
class LexRank():
    
    def cal_lexrank(self, sentences, threshold):
        columns = set()
        for i in range(len(sentences)):
            for rep in replace_list:
                sentences[i] = sentences[i].replace(rep, ' ')
            sentences[i] = sentences[i].lower()
            sentences[i] = sentences[i].split()
            columns |= set(sentences[i])

        tf_np = np.zeros((len(sentences),len(columns)))
        tf = pd.DataFrame(tf_np, columns=list(columns))
        isf_np = np.ones((1, len(columns)))
        isf = pd.DataFrame(isf_np, columns=list(columns))


        for i,s in enumerate(sentences):
            dic = Counter(s)
            for k,v in dic.items():
                tf .iloc[i][k] += v / len(s)
                isf.iloc[0][k] += 1
                    
        isf = np.log(len(sentences) / isf)
        for i in range(len(sentences)):
            tf.iloc[i] = tf.iloc[i].mul(isf.iloc[0])
        
        tf_isf_matrix = np.array(tf)
        similar_matrix = np.dot(tf_isf_matrix, tf_isf_matrix.transpose())
        inner_matrix = np.sum(np.multiply(tf_isf_matrix, tf_isf_matrix), axis=1, keepdims=True)
        inner_matrix = np.sqrt(np.dot(inner_matrix, inner_matrix.transpose()))
        similar_matrix /= inner_matrix
        
        degree = np.zeros(len(sentences))
        for i in range(len(sentences)):
            for j in range(len(sentences)):
                if similar_matrix[i][j] > threshold:
                    similar_matrix[i][j] = 1
                    degree[i] += 1
                else:
                    similar_matrix[i][j] = 0
           
        self.similar = copy.deepcopy(similar_matrix)
        for i in range(len(sentences)):
            self.similar[i] /= np.sum(self.similar[i])
        U = np.ones((len(sentences), len(sentences)))
        U /= len(sentences)
        self.similar = d * U + (1 - d) * self.similar

        for i in range(len(sentences)):
            for j in range(len(sentences)):
                similar_matrix[i][j] /= degree[i]
                
        return self.power_method(similar_matrix, len(sentences), 0.0001)
        
    def power_method(self, matrix, N, epsilon):
        last_p = p = np.ones(N) / N
        loss = float('inf')
        epoch = 1
        while True:
            p = np.dot(self.similar.transpose(), p)
            loss = np.sum(np.abs(p - last_p))
            if loss < epsilon:
                return p
            print("epoch {}\n    loss:{}".format(epoch, loss))
            print("="*20)
            last_p = p
            epoch += 1
            
if __name__ == '__main__':
    path = '/Users/willer/Desktop/data.docx'
    document = Document(path)
    table = document.tables
    table = table[0]
    sentences = []
    for line in table.rows[1:]:
        sentences.append(line.cells[-1].text)
        
    lex_model = LexRank()
    Lex_mat = lex_model.cal_lexrank(sentences, 0.2)
    print(Lex_mat)

NameError: name 'Document' is not defined