## Multi-document Summarization using Tensor Decomposition

### 1.Introduction
This notebook is a recurrance of the method of test labelling mentioned in the paper [http://www.scielo.org.mx/pdf/cys/v18n3/v18n3a12.pdf]

There is mainly following steps mentioned in this notebook：
- data extraction and pre-processing
- tensor construction
- tensor decomposition

### 2. data extraction and pre-processing

##### 2.1 Data extraction

In [1]:
topics = ['computer', 'sport', 'politics', 'entertainment', 'science']

In [1]:
os.mkdir('data')
for t in topics:
    os.makedirs(os.path.join('data', t))

NameError: name 'os' is not defined

In [2]:
import wikipediaapi as wikiapi
import wikipedia as wiki
import numpy as np
import nltk
import os
import time

In [3]:
wiki_api = wikiapi.Wikipedia('en')

In [4]:
topic_dict = {}
for t in topics:
    print(t)
    tag = True
    while tag:
        try:
            time.sleep(1)
            topic_dict[t] = wiki.search(t)
            tag = False
        except:
            continue

entertainment
science


In [5]:
for tk, tv in topic_dict.items():
    for pageName in tv:
        print(pageName)
        tag = True
        cnt = 0
        while tag:
            try:
                pageTmp = wiki.page(pageName)
                pageContent = pageTmp.content
                with open(os.path.join(os.path.join('data', tk), pageName + '.txt'), 'w') as w:
                    w.write(pageContent)
                tag = False
            except:
                cnt += 1
                if cnt < 10:
                    continue
                else:
                    break

Entertainment
Entertainment!
YG Entertainment
JYP Entertainment
SM Entertainment
WWE
Yuehua Entertainment
Coridel Entertainment
Sony Entertainment
InXile Entertainment
Science
Science (disambiguation)




  lis = BeautifulSoup(html).find_all('li')


Natural science
Science fiction
Social science
Forensic science
Political science
Rocket science
Branches of science
Computer science


##### 2.2 Data Preprocessing

In this part, we will preprocess our data:
- split the sentence
- remove the stopwords
- word stemming

##### 2.2.1 Split the sentence

In [3]:
def sentenceSplit(inputContent):
    res = nltk.sent_tokenize(inputContent)
    res_filter = [i for i in res if len(i) > 10]
    return res_filter

In [4]:
text_dict = {}
for tp in topics:
    fileNames = os.listdir(os.path.join('data/', tp))
    for fileName in fileNames:
        with open(os.path.join(os.path.join('data', tp), fileName)) as f:
            text_dict[fileName] = sentenceSplit(f.read())

##### 2.2.2 Remove the stop words and word stemming

In [5]:
# get the stopwords
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from collections import defaultdict
from nltk.stem.porter import PorterStemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [6]:
def sentenceRemoveStopwords(inputSentence):
    """
        input: the string of input
        output: list of stemmed tokens
    """
    # lowercase
    inputLower = inputSentence.lower()
    # get the word and remove the punctuation
    regTokenizer = RegexpTokenizer(r'\w+')
    tokens_filter = [t for t in regTokenizer.tokenize(inputLower) if t not in stop_words]
    # stem the word
    tokens_stemmer = [stemmer.stem(t) for t in tokens_filter]
    return tokens_stemmer

In [7]:
wordTokens = defaultdict(list)
for fileName, text in text_dict.items():
    for sen in text:
        wordTokens[fileName].append(sentenceRemoveStopwords(sen))

### 3 Topic Generation

In [8]:
from gensim import corpora
import gensim

In [9]:
# get the complete textual data set
text_complete = [[word for sent in sents for word in sent] for sents in wordTokens.values()]

In [10]:
dictionary = corpora.Dictionary(text_complete)

In [11]:
bag_of_word = [dictionary.doc2bow(text) for text in text_complete]

In [12]:
NUM_TOPIC = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus=bag_of_word, num_topics=NUM_TOPIC, id2word=dictionary, passes=15)

In [13]:
topic_select = ldamodel.print_topics(num_words=5)
for tp in topic_select:
    print(tp)

(0, '0.022*"entertain" + 0.016*"polit" + 0.013*"parti" + 0.008*"form" + 0.007*"social"')
(1, '0.013*"wwe" + 0.013*"graphic" + 0.011*"entertain" + 0.009*"marvel" + 0.008*"compani"')
(2, '0.028*"comput" + 0.024*"scienc" + 0.010*"use" + 0.007*"program" + 0.007*"studi"')
(3, '0.031*"sport" + 0.009*"psycholog" + 0.008*"athlet" + 0.007*"sm" + 0.006*"entertain"')
(4, '0.019*"network" + 0.010*"secur" + 0.009*"use" + 0.008*"system" + 0.008*"comput"')


In [14]:
sent_complete = [sent for sents in wordTokens.values() for sent in sents]

In [15]:
dictionary_sent = corpora.Dictionary(sent_complete)

In [16]:
bag_of_word_sent = [dictionary_sent.doc2bow(text) for text in sent_complete]

In [18]:
NUM_TOPIC = 5
ldamodel_sent = gensim.models.ldamodel.LdaModel(corpus=bag_of_word_sent, num_topics=NUM_TOPIC, id2word=dictionary_sent, passes=15)

In [19]:
topic_select_sent = ldamodel_sent.print_topics(num_words=5)
topic_select_sent

[(0,
  '0.043*"scienc" + 0.019*"comput" + 0.015*"social" + 0.012*"natur" + 0.011*"studi"'),
 (1,
  '0.015*"network" + 0.010*"use" + 0.010*"sport" + 0.009*"parti" + 0.008*"psycholog"'),
 (2,
  '0.019*"fiction" + 0.008*"form" + 0.008*"knowledg" + 0.007*"audienc" + 0.007*"film"'),
 (3,
  '0.028*"entertain" + 0.010*"wwe" + 0.010*"sport" + 0.007*"game" + 0.006*"year"'),
 (4,
  '0.010*"group" + 0.010*"entertain" + 0.010*"marvel" + 0.008*"new" + 0.006*"polit"')]

### 4.Tensor Construction

##### 4.1 build up the topic-term mapping

In [20]:
num_of_terms = len(dictionary.id2token.keys())
num_of_terms

10493

In [21]:
num_of_topics = len(topic_select_sent)
num_of_topics

5

In [22]:
num_of_doc = len(text_complete)
num_of_doc

44

In [23]:
# get the map between topics and terms
topic_term_mapping = np.zeros((num_of_terms, num_of_topics))

In [24]:
dictionary_sent

<gensim.corpora.dictionary.Dictionary at 0x1a148b46d8>

In [25]:
thresh_tp = 0.7
for sent in sent_complete:
    word_inds = dictionary_sent.doc2bow(sent)
    topic_prob = ldamodel_sent.get_document_topics(word_inds)
    for tp_prob in topic_prob:
        if tp_prob[1] > thresh_tp:
            for word_ind in word_inds:
                topic_term_mapping[word_ind[0], tp_prob[0]] = 1

In [26]:
# the total number of entry in topic_term_mapping
num_of_terms * num_of_topics

52465

In [27]:
# the valid entry of topic_term_mapping
np.sum(topic_term_mapping)

10488.0

##### 4.2 build up tf-idf mapping

In [28]:
from gensim.models import TfidfModel

In [29]:
tfidfModel = TfidfModel(dictionary=dictionary, corpus=bag_of_word)

##### 4.3 passage index to sentence index

In [30]:
passage_ind_to_sent_ind = lambda x: dictionary_sent.token2id[dictionary.id2token[x]]

In [31]:
sent_ind_to_passage_ind = lambda x: dictionary.token2id[dictionary_sent.id2token[x]]

##### 4.4 tensor construction

In [32]:
def fillTFIDF(idxList):
    res = np.zeros((num_of_terms))
    for idx, val in idxList:
        res[idx] = val
    return res

In [33]:
tensor_res = np.zeros((num_of_doc, num_of_topics, num_of_terms))

In [34]:
topic_term_mapping_transfer = np.zeros((num_of_terms, num_of_topics))
for idx_term in range(num_of_terms):
    transfered_ind = sent_ind_to_passage_ind(idx_term)
    for idx_topic in range(num_of_topics):
        if topic_term_mapping[transfered_ind, idx_topic]:
            topic_term_mapping_transfer[idx_term, idx_topic] = 1

In [35]:
len(text_complete)

44

In [36]:
for idx_text in range(len(text_complete)):
    tfidfValue = tfidfModel[dictionary.doc2bow(text_complete[idx_text])]
    tfidfList = fillTFIDF(tfidfValue)
    for idx_topic in range(len(topic_select_sent)):
        tensor_res[idx_text, idx_topic, :] = topic_term_mapping_transfer[:, idx_topic] * tfidfList

### 5 Tensor Decomposition

In [37]:
from tensorly.decomposition import tucker

Using numpy backend.


In [38]:
np.shape(tensor_res)

(44, 5, 10493)

In [39]:
np.sum(tensor_res > 0)

53979

In [None]:
import pickle
with open('tensor.pickle', 'wb') as f:
    pickle.dump(tensor_res, f)

In [None]:
from scipy.io import savemat
savemat('tensor_0_7.mat', {'t_new': tensor_res})

In [4]:
from scipy.io import loadmat
tensor_res = loadmat('tensor_0_7.mat')['t_new']
np.shape(tensor_res)

(44, 5, 10493)

In [47]:
import pickle
from tensorly.decomposition import tucker
with open('tensor.pickle', 'rb') as f:
    tensor_res = pickle.loads(f.read())

In [49]:
np.shape(tensor_res)

(5, 44, 10493)

In [51]:
topic_terms = np.sum(tensor_res, axis=2)

In [52]:
np.shape(topic_terms)

(5, 44)

In [55]:
np.argmax(topic_terms, axis=0)

array([2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 0, 4, 0, 4, 3, 4, 0, 1, 1, 0, 1, 1,
       0, 2, 2, 2, 1, 3, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 1, 3, 2, 2])

In [7]:
decompose_result = tucker(tensor_res, ranks=[44,1,1])

In [8]:
np.shape(decompose_result[1][0])

(44, 44)

In [59]:
np.shape(decompose_result[1][1])

(44, 1)

In [60]:
np.shape(decompose_result[1][2])

(10493, 1)

In [83]:
topic_res = np.reshape(decompose_result[1][0], (5,))
doc_res = np.reshape(decompose_result[1][1], (44,))
term_res = np.reshape(decompose_result[1][2], (10493,))

In [78]:
topic_doc_mapping = np.outer(doc_res, topic_res)

In [79]:
np.shape(doc_res)

(44,)

In [80]:
np.shape(topic_doc_mapping)

(44, 5)

In [38]:
np.sum(tensor_res!=0)

83284

In [7]:
import tensorly
tensorly.default_backend
from tensorly import decomposition

In [11]:
import numpy as np
t = np.random.rand(10,15,10000)
np.shape(t)

(10, 15, 10000)

In [14]:
a, b = decomposition.parafac(tensor=t, rank=8, verbose=2, return_errors=1)

reconstruction error=0.4859553154914885, variation=4.0963404719818186e-05.
reconstruction error=0.48591808501579004, variation=3.7230475698479815e-05.
reconstruction error=0.4858842164590394, variation=3.3868556750638046e-05.
reconstruction error=0.4858533366930498, variation=3.087976598958875e-05.
reconstruction error=0.4858250902126459, variation=2.8246480403892882e-05.
reconstruction error=0.4857991543438083, variation=2.5935868837589915e-05.
reconstruction error=0.4857752463957338, variation=2.3907948074552987e-05.
reconstruction error=0.48575312488309075, variation=2.21215126430252e-05.
reconstruction error=0.4857325870411273, variation=2.053784196343056e-05.
reconstruction error=0.48571346434495094, variation=1.9122696176376852e-05.
reconstruction error=0.4856956172173956, variation=1.7847127555314923e-05.
reconstruction error=0.48567892968675985, variation=1.6687530635772774e-05.
reconstruction error=0.4856633044468172, variation=1.5625239942640423e-05.
reconstruction error=0.48

In [131]:
b

[0.8269160706104022,
 0.7956067438993916,
 0.7927929380366638,
 0.7921968471463594,
 0.7919448381652281,
 0.7918226727587331,
 0.7917586136990278,
 0.7917221958472505,
 0.7916998464486079,
 0.7916852265779694,
 0.7916751775942977,
 0.7916680073596144,
 0.7916627442290567,
 0.7916587954607478,
 0.7916557808247259,
 0.7916534462306697,
 0.791651616164578,
 0.7916501660989234,
 0.7916490057522255,
 0.7916480685259318,
 0.7916473046053221,
 0.7916466763139541,
 0.7916461549006026,
 0.7916457182648812,
 0.7916453493154372,
 0.7916450347653766,
 0.7916447642367661,
 0.7916445295878676,
 0.7916443244035017,
 0.7916441436064691,
 0.7916439831597597,
 0.7916438398374203,
 0.7916437110476827,
 0.7916435946960716,
 0.7916434890792152,
 0.7916433928023098,
 0.7916433047148477,
 0.7916432238604821,
 0.7916431494378388,
 0.7916430807698296,
 0.7916430172795732,
 0.7916429584714505,
 0.7916429039161658,
 0.7916428532389274,
 0.7916428061100651,
 0.7916427622375553,
 0.7916427213610397,
 0.79164268324

In [132]:
a[0]

array([[-1.35033892e+00, -2.76424136e-03, -6.72076213e-03,
         1.21484372e-02, -3.72876181e-03],
       [-8.33736161e-01, -9.56467681e-03,  5.80754431e-04,
         1.06436844e-02,  1.69393424e-01],
       [-8.26719391e-01, -8.55019883e-03, -1.49648024e-02,
         1.49716073e-02,  1.46198983e-01],
       [-1.40566561e+00, -1.14267190e-03, -6.03530572e-03,
        -1.33571272e-04,  1.46502609e-02],
       [-4.44692191e-01, -4.82951944e-03, -1.30539812e-02,
         1.36493010e-02, -4.53775982e-03],
       [-1.19452963e+00, -7.48483564e-04, -6.35827992e-03,
         8.27069745e-03,  2.06363773e-03],
       [-1.86055978e+00, -1.34677592e-03,  3.85696681e-03,
         5.70947877e-02,  1.76395618e-02],
       [-5.43386115e-01, -1.18432419e-02, -2.26282954e-02,
         1.98919801e-02, -7.00115946e-03],
       [-1.70727079e+00, -5.98447440e-05, -5.86137995e-03,
         1.26653390e-02,  3.56147924e-03],
       [-1.62506377e+00,  7.06226709e-04, -3.13360445e-04,
         1.78696843e-02

In [133]:
np.argmax(a[0], axis=1)

array([3, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3])

In [123]:
np.argmax(a[0], axis=1)

array([1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 4, 0, 4, 0, 0, 0, 0, 0, 4, 4, 3, 3,
       3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0])

In [86]:
np.sum(tensor_res > 0) + np.sum(tensor_res == 0)

2308460