In [1]:
import ast
import operator
import re

from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Module-level Porter stemmer instance; cleanData() reads this global name.
stemmer = PorterStemmer()

def cleanData(sentence):
    """Normalise a sentence by stemming each whitespace-separated word.

    The stemmer operates on a single word, so the sentence is split first
    and every token is stemmed on its own. Passing the whole sentence to
    ``stemmer.stem`` would only rewrite the suffix of the final word.

    Parameters
    ----------
    sentence : str
        Raw sentence text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; ``""`` when the input
        contains no tokens.
    """
    return " ".join(stemmer.stem(word) for word in sentence.split())

In [3]:
def getVectorSpace(cleanSet):
    """Collect the distinct whitespace-separated words of a set of sentences.

    Parameters
    ----------
    cleanSet : iterable of str
        Sentences to scan.

    Returns
    -------
    list of str
        Every distinct word, in order of first appearance.
    """
    vocab = {}
    for data in cleanSet:
        for word in data.split():
            # Key by the word itself (not the enclosing sentence) so the
            # result is a word-level vocabulary.
            vocab[word] = 0
    return list(vocab)

In [4]:
def calculateSimilarity(sentence, doc):
    """Cosine similarity between a sentence and a list of sentences.

    The sentences in ``doc`` are joined into one text. Both that text and
    ``sentence`` are turned into word-count vectors over the union of their
    whitespace-separated words, and the cosine of the two vectors is returned.

    Parameters
    ----------
    sentence : str
        The sentence to compare.
    doc : list of str
        Sentences forming the reference text.

    Returns
    -------
    int or float
        ``0`` when ``doc`` is empty or neither text contains a word,
        otherwise the ``[0][0]`` entry of ``cosine_similarity``.
    """
    if not doc:
        return 0

    docInOneSentence = ' '.join(doc)

    # Build the vocabulary from *words*. Iterating the string directly would
    # yield characters, leaving sentence-only words out of the vocabulary and
    # understating the sentence vector's norm.
    vocab = dict.fromkeys(sentence.split())
    vocab.update(dict.fromkeys(docInOneSentence.split()))
    if not vocab:
        # CountVectorizer rejects an empty vocabulary; nothing to compare.
        return 0

    # The vocabulary is fixed up front, so nothing needs fitting: one
    # transform call yields both count vectors (row 0 = doc, row 1 = sentence).
    cv = CountVectorizer(vocabulary=list(vocab))
    vectors = cv.transform([docInOneSentence, sentence])
    return cosine_similarity(vectors[0], vectors[1])[0][0]

In [5]:
def concat(x):
    """Merge a list of text chunks and re-split the result into lines.

    All elements of ``x`` except the last two are joined with single spaces
    (NOTE(review): the reason the final two entries are dropped is not visible
    here -- confirm against the dataset). The joined text is split on newline
    characters, pieces that are exactly one space are discarded, and the
    remaining pieces are stripped of surrounding whitespace.

    Parameters
    ----------
    x : list of str
        Text chunks.

    Returns
    -------
    list of str
        Stripped lines; may contain empty strings, because only pieces equal
        to ``' '`` are filtered out and the filter runs before stripping.
    """
    merged = ' '.join(x[:-2])
    return [piece.strip() for piece in merged.split('\n') if piece != ' ']

In [6]:
def get_sentences(texts, sentences, clean, originalSentenceOf):
    """Split texts into sentences and record raw and cleaned forms.

    Each line is split on ``'.'``. For every non-blank fragment, the raw
    fragment is appended to ``sentences``, its ``cleanData`` form is appended
    to ``clean``, and ``originalSentenceOf`` maps the cleaned form back to the
    raw fragment (a later fragment with the same cleaned form overwrites an
    earlier one).

    Parameters
    ----------
    texts : iterable of str
        Lines of text to split.
    sentences : list
        Output list, extended in place with raw fragments.
    clean : list
        Output list, extended in place with cleaned fragments.
    originalSentenceOf : dict
        Output mapping, updated in place: cleaned fragment -> raw fragment.

    Returns
    -------
    set of str
        The distinct entries of ``clean``.
    """
    for line in texts:
        for part in line.split('.'):
            if not part.strip():
                # split('.') leaves an empty tail after every final period
                # (and between consecutive periods); these are not sentences
                # and must not count towards the summary size.
                continue
            cl = cleanData(part)
            sentences.append(part)
            clean.append(cl)
            originalSentenceOf[cl] = part
    return set(clean)

In [7]:
from termcolor import colored
def get_mmr(doc, alpha=0.8, summary_percent=20):
    """Select summary sentences from ``doc`` by greedy MMR-style scoring.

    Every distinct cleaned sentence gets a relevance score: its similarity to
    all the other distinct cleaned sentences taken together. Sentences are
    then picked one at a time, each time taking the candidate maximising
    ``alpha * relevance - (1 - alpha) * similarity_to_current_summary``.

    Parameters
    ----------
    doc : iterable of str
        Lines of text; split into sentences by ``get_sentences``.
    alpha : float, optional
        Weight of relevance against redundancy. Defaults to 0.8.
    summary_percent : int or float, optional
        Summary size as a percentage of the number of sentences (fractions
        round up). Defaults to 20.

    Returns
    -------
    list of str
        The original (uncleaned) text of the selected sentences, stripped,
        in selection order.
    """
    sentences = []
    clean = []
    originalSentenceOf = {}
    cleanSet = get_sentences(doc, sentences, clean, originalSentenceOf)

    # Relevance of each distinct sentence against the rest of the document.
    # Repeated sentences share one score, so they are computed only once.
    scores = {}
    for data in clean:
        if data in scores:
            continue
        scores[data] = calculateSimilarity(data, list(cleanSet - {data}))

    # Counting down from a possibly fractional budget selects
    # ceil(budget) sentences.
    remaining = summary_percent * len(sentences) / 100
    summarySet = []
    while remaining > 0:
        mmr = {
            sentence: alpha * relevance
            - (1 - alpha) * calculateSimilarity(sentence, summarySet)
            for sentence, relevance in scores.items()
            if sentence not in summarySet
        }
        if not mmr:
            # Every distinct sentence has already been selected.
            break
        summarySet.append(max(mmr.items(), key=operator.itemgetter(1))[0])
        remaining -= 1

    return [originalSentenceOf[sentence].strip() for sentence in summarySet]

In [8]:
import os
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

num_clusters = 3  # not referenced anywhere in this cell

dir_path = 'multi_news/'
files = ['sample_train.csv', 'sample_validation.csv', 'sample_test.csv']

# For each split: parse the stored document lists, flatten them into lines,
# summarise with MMR, and write the result next to the notebook as
# "<name>_mmr.csv".
for file in files:
    df = pd.read_csv(os.path.join(dir_path, file))

    # Parse the cell text as a Python literal instead of eval(), which would
    # execute arbitrary code found in the CSV.
    # NOTE(review): assumes each 'documents' cell is the repr of a plain
    # list of strings -- confirm against the data.
    df['documents'] = df['documents'].progress_apply(ast.literal_eval)
    df['concat_doc'] = df['documents'].progress_apply(concat)
    df['mmr'] = df['concat_doc'].progress_apply(get_mmr)

    # Drop the intermediate and input-only columns before saving.
    df = df.drop(columns=['documents', 'concat_doc', 'num_documents'])
    stem, _ext = os.path.splitext(file)
    df.to_csv(os.path.join('.', stem + '_mmr.csv'), index=False)

# print(df.head())

100%|██████████| 500/500 [00:00<00:00, 2420.15it/s]
100%|██████████| 500/500 [00:00<00:00, 10705.11it/s]
100%|██████████| 10/10 [02:03<00:00, 12.33s/it]
100%|██████████| 250/250 [00:00<00:00, 3037.86it/s]
100%|██████████| 250/250 [00:00<00:00, 21459.51it/s]
0it [00:00, ?it/s]
100%|██████████| 250/250 [00:00<00:00, 3710.07it/s]
100%|██████████| 250/250 [00:00<00:00, 18316.70it/s]
0it [00:00, ?it/s]


In [9]:

#print str(time.time() - start)
	
# print ('\nSummary:\n')
# for sentence in summarySet:
# 	print (originalSentenceOf [sentence].lstrip(' '))
# print()

# print '============================================================='
# print '\nOriginal Passages:\n'
# from termcolor import colored

# for sentence in clean:
# 	if sentence in summarySet:
# 		print colored(originalSentenceOf[sentence].lstrip(' '), 'red')
# 	else:
# 		print originalSentenceOf[sentence].lstrip(' ')
	