## Multi-document Summarization using Tensor Decomposition

### 1.Introduction
This notebook is a recurrance of the method of test labelling mentioned in the paper [http://www.scielo.org.mx/pdf/cys/v18n3/v18n3a12.pdf]

There is mainly following steps mentioned in this notebook：
- data extraction and pre-processing
- tensor construction
- tensor decomposition

### 2. data extraction and pre-processing

##### 2.1 Data extraction

In [7]:
topics = ['computer', 'sport', 'politics', 'entertainment', 'science']

In [None]:
os.mkdir('data')
for t in topics:
    os.makedirs(os.path.join('data', t))

In [10]:
import wikipediaapi as wikiapi
import wikipedia as wiki
import numpy as np
import nltk
import os
import time

In [3]:
wiki_api = wikiapi.Wikipedia('en')

In [4]:
topic_dict = {}
for t in topics:
    print(t)
    tag = True
    while tag:
        try:
            time.sleep(1)
            topic_dict[t] = wiki.search(t)
            tag = False
        except:
            continue

entertainment
science


In [5]:
for tk, tv in topic_dict.items():
    for pageName in tv:
        print(pageName)
        tag = True
        cnt = 0
        while tag:
            try:
                pageTmp = wiki.page(pageName)
                pageContent = pageTmp.content
                with open(os.path.join(os.path.join('data', tk), pageName + '.txt'), 'w') as w:
                    w.write(pageContent)
                tag = False
            except:
                cnt += 1
                if cnt < 10:
                    continue
                else:
                    break

Entertainment
Entertainment!
YG Entertainment
JYP Entertainment
SM Entertainment
WWE
Yuehua Entertainment
Coridel Entertainment
Sony Entertainment
InXile Entertainment
Science
Science (disambiguation)




  lis = BeautifulSoup(html).find_all('li')


Natural science
Science fiction
Social science
Forensic science
Political science
Rocket science
Branches of science
Computer science


##### 2.2 Data Preprocessing

In this part, we will preprocess our data:
- split the sentence
- remove the stopwords
- word stemming

##### 2.2.1 Split the sentence

In [11]:
def sentenceSplit(inputContent):
    res = nltk.sent_tokenize(inputContent)
    res_filter = [i for i in res if len(i) > 10]
    return res_filter

In [12]:
text_dict = {}
for tp in topics:
    fileNames = os.listdir(os.path.join('data/', tp))
    for fileName in fileNames:
        with open(os.path.join(os.path.join('data', tp), fileName)) as f:
            text_dict[fileName] = sentenceSplit(f.read())

##### 2.2.2 Remove the stop words and word stemming

In [13]:
# get the stopwords
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from collections import defaultdict
from nltk.stem.porter import PorterStemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [14]:
def sentenceRemoveStopwords(inputSentence):
    """
        input: the string of input
        outputL list of stemmed tokens
    """
    # lowercase
    inputLower = inputSentence.lower()
    # get the word and remove the punctuation
    regTokenizer = RegexpTokenizer(r'\w+')
    tokens_filter = [t for t in regTokenizer.tokenize(inputLower) if t not in stop_words]
    # stem the word
    tokens_stemmer = [stemmer.stem(t) for t in tokens_filter]
    return tokens_stemmer

In [15]:
wordTokens = defaultdict(list)
for fileName, text in text_dict.items():
    for sen in text:
        wordTokens[fileName].append(sentenceRemoveStopwords(sen))

### 3 Topic Generation

In [16]:
from gensim import corpora
import gensim

In [17]:
# get the complete textual data set
text_complete = [[word for sent in sents for word in sent] for sents in wordTokens.values()]

In [18]:
dictionary = corpora.Dictionary(text_complete)

In [19]:
bag_of_word = [dictionary.doc2bow(text) for text in text_complete]

In [20]:
NUM_TOPIC = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus=bag_of_word, num_topics=NUM_TOPIC, id2word=dictionary, passes=15)

In [21]:
topic_select = ldamodel.print_topics(num_words=5)
for tp in topic_select:
    print(tp)

(0, '0.021*"entertain" + 0.016*"sm" + 0.010*"yg" + 0.010*"compani" + 0.007*"music"')
(1, '0.018*"fiction" + 0.016*"marvel" + 0.016*"scienc" + 0.010*"jyp" + 0.009*"entertain"')
(2, '0.035*"comput" + 0.013*"use" + 0.011*"network" + 0.009*"system" + 0.008*"program"')
(3, '0.028*"scienc" + 0.019*"polit" + 0.010*"social" + 0.010*"parti" + 0.009*"studi"')
(4, '0.018*"sport" + 0.016*"entertain" + 0.008*"perform" + 0.007*"use" + 0.006*"wwe"')


In [22]:
sent_complete = [sent for sents in wordTokens.values() for sent in sents]

In [23]:
dictionary_sent = corpora.Dictionary(sent_complete)

In [24]:
bag_of_word_sent = [dictionary_sent.doc2bow(text) for text in sent_complete]

In [25]:
NUM_TOPIC = 5
ldamodel_sent = gensim.models.ldamodel.LdaModel(corpus=bag_of_word_sent, num_topics=NUM_TOPIC, id2word=dictionary_sent, passes=15)

In [26]:
topic_select_sent = ldamodel_sent.print_topics(num_words=5)
topic_select_sent

[(0,
  '0.018*"sport" + 0.014*"parti" + 0.010*"state" + 0.007*"play" + 0.007*"world"'),
 (1,
  '0.053*"scienc" + 0.018*"social" + 0.015*"natur" + 0.015*"polit" + 0.012*"studi"'),
 (2,
  '0.020*"comput" + 0.018*"use" + 0.012*"system" + 0.009*"network" + 0.006*"inform"'),
 (3,
  '0.015*"fiction" + 0.015*"entertain" + 0.011*"comput" + 0.009*"first" + 0.009*"new"'),
 (4,
  '0.016*"wwe" + 0.009*"sport" + 0.009*"perform" + 0.007*"one" + 0.006*"event"')]

### 4.Tensor Construction

##### 4.1 build up the topic-term mapping

In [27]:
num_of_terms = len(dictionary.id2token.keys())
num_of_terms

10493

In [28]:
num_of_topics = len(topic_select_sent)
num_of_topics

5

In [29]:
num_of_doc = len(text_complete)
num_of_doc

44

In [30]:
# get the map between topics and terms
topic_term_mapping = np.zeros((num_of_terms, num_of_topics))

In [31]:
thresh_tp = 0.5
for sent in sent_complete:
    word_inds = dictionary_sent.doc2bow(sent)
    topic_prob = ldamodel_sent.get_document_topics(word_inds)
    for tp_prob in topic_prob:
        if tp_prob[1] > thresh_tp:
            for word_ind in word_inds:
                topic_term_mapping[word_ind[0], tp_prob[0]] = 1

In [32]:
# the total number of entry in topic_term_mapping
num_of_terms * num_of_topics

52465

In [33]:
# the valid entry of topic_term_mapping
np.sum(topic_term_mapping)

16921.0

##### 4.2 build up tf-idf mapping

In [34]:
from gensim.models import TfidfModel

In [35]:
tfidfModel = TfidfModel(dictionary=dictionary, corpus=bag_of_word)

##### 4.3 passage index to sentence index

In [36]:
passage_ind_to_sent_ind = lambda x: dictionary_sent.token2id[dictionary.id2token[x]]

In [37]:
sent_ind_to_passage_ind = lambda x: dictionary.token2id[dictionary_sent.id2token[x]]

##### 4.4 tensor construction

In [38]:
def fillTFIDF(idxList):
    res = np.zeros((num_of_terms))
    for idx, val in idxList:
        res[idx] = val
    return res

In [39]:
tensor_res = np.zeros((num_of_topics, num_of_doc, num_of_terms))

In [40]:
topic_term_mapping_transfer = np.zeros((num_of_terms, num_of_topics))
for idx_term in range(num_of_terms):
    transfered_ind = sent_ind_to_passage_ind(idx_term)
    for idx_topic in range(num_of_topics):
        if topic_term_mapping[transfered_ind, idx_topic]:
            topic_term_mapping_transfer[idx_term, idx_topic] = 1

In [41]:
for idx_text in range(len(text_complete)):
    tfidfValue = tfidfModel[dictionary.doc2bow(text_complete[idx_text])]
    tfidfList = fillTFIDF(tfidfValue)
    for idx_topic in range(len(topic_select_sent)):
        tensor_res[idx_topic, idx_text, :] = topic_term_mapping_transfer[:, idx_topic] * tfidfList

### 5 Tensor Decomposition

In [42]:
from tensorly.decomposition import tucker

In [43]:
np.shape(tensor_res)

(5, 44, 10493)

In [44]:
np.sum(tensor_res > 0)

81655

In [46]:
import pickle
with open('tensor.pickle', 'wb') as f:
    pickle.dump(tensor_res, f)

In [47]:
import pickle
from tensorly.decomposition import tucker
with open('tensor.pickle', 'rb') as f:
    tensor_res = pickle.loads(f.read())

In [49]:
np.shape(tensor_res)

(5, 44, 10493)

In [51]:
topic_terms = np.sum(tensor_res, axis=2)

In [52]:
np.shape(topic_terms)

(5, 44)

In [55]:
np.argmax(topic_terms, axis=0)

array([2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 0, 4, 0, 4, 3, 4, 0, 1, 1, 0, 1, 1,
       0, 2, 2, 2, 1, 3, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 1, 3, 2, 2])

In [None]:
decompose_result = tucker(tensor_res, ranks=[44,1,1])

In [None]:
np.shape(decompose_result[1][0])

In [59]:
np.shape(decompose_result[1][1])

(44, 1)

In [60]:
np.shape(decompose_result[1][2])

(10493, 1)

In [83]:
topic_res = np.reshape(decompose_result[1][0], (5,))
doc_res = np.reshape(decompose_result[1][1], (44,))
term_res = np.reshape(decompose_result[1][2], (10493,))

In [78]:
topic_doc_mapping = np.outer(doc_res, topic_res)

In [79]:
np.shape(doc_res)

(44,)

In [80]:
np.shape(topic_doc_mapping)

(44, 5)

In [38]:
np.sum(tensor_res!=0)

83284