In [2]:
import re
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import operator

In [3]:
# Create a single Porter stemmer up front; cleanData() reuses this module-level instance.
stemmer = PorterStemmer()

def cleanData(sentence):
	"""Normalise a sentence for similarity comparison.

	The sentence is split on whitespace, every word is passed through the
	module-level ``stemmer`` and the stemmed words are re-joined with single
	spaces.

	Args:
		sentence: Raw sentence text.

	Returns:
		The stemmed words joined by single spaces; an empty string when the
		sentence contains no words.
	"""
	# Stem word by word: handing the whole sentence to stem() treats it as
	# one long "word", so at most its tail would be altered.
	return " ".join(stemmer.stem(word) for word in sentence.split())

In [4]:
def getVectorSpace(cleanSet):
	"""Collect the vocabulary of a collection of cleaned sentences.

	Args:
		cleanSet: Iterable of whitespace-separated sentence strings.

	Returns:
		A list of the unique words, in order of first appearance while
		iterating ``cleanSet``.
	"""
	vocab = {}
	for data in cleanSet:
		for word in data.split():
			# Key by the individual word, not by the whole sentence.
			vocab[word] = 0
	return list(vocab)

In [5]:
def calculateSimilarity(sentence, doc):
	"""Cosine similarity between one sentence and a group of sentences.

	All strings in ``doc`` are concatenated into a single text; that text and
	``sentence`` are turned into term-count vectors over their combined
	vocabulary and compared with cosine similarity.

	Args:
		sentence: Whitespace-separated sentence string.
		doc: List of sentence strings to compare against.

	Returns:
		The cosine similarity of the two count vectors, or 0 when ``doc`` is
		empty or neither side contains any word.
	"""
	if not doc:
		return 0

	# The vocabulary must be built from words; iterating the string directly
	# would yield single characters and leave the sentence's own words out.
	vocab = dict.fromkeys(sentence.split(), 0)
	for text in doc:
		for word in text.split():
			vocab[word] = 0

	# CountVectorizer rejects an empty vocabulary; with no words on either
	# side there is nothing to compare.
	if not vocab:
		return 0

	docInOneSentence = ' '.join(doc)

	# The vocabulary is fixed, so one call can vectorise both texts:
	# row 0 is the concatenated document, row 1 is the sentence.
	cv = CountVectorizer(vocabulary=list(vocab))
	vectors = cv.fit_transform([docInOneSentence, sentence])
	return cosine_similarity(vectors[0:1], vectors[1:2])[0][0]

In [6]:
def concat(x):
    # print(len(x), len(x[:-2]))
    x = ' '.join(x)
    x = x.split('\n')
    x = list(filter(lambda s: not s == ' ', x))
    x = list(map(lambda s: s.strip(), x))
    return x

In [7]:
def get_sentences(texts, sentences, clean, originalSentenceOf):
    """Split texts into sentences and record raw and cleaned forms.

    Each text is split on '.'; every non-blank fragment is cleaned with
    cleanData() and recorded in the three output containers, which are
    mutated in place.

    Args:
        texts: Iterable of text strings.
        sentences: List that receives each raw sentence fragment.
        clean: List that receives the cleaned form of each fragment.
        originalSentenceOf: Dict mapping a cleaned sentence to its raw
            fragment (a later duplicate overwrites an earlier one).

    Returns:
        The set of distinct cleaned sentences in ``clean``.
    """
    for line in texts:
        for part in line.split('.'):
            # Splitting on '.' leaves empty or whitespace-only fragments
            # (e.g. after a trailing period); they are not sentences and
            # must not count towards the summary size or be selected.
            if not part.strip():
                continue
            cl = cleanData(part)
            sentences.append(part)
            clean.append(cl)
            originalSentenceOf[cl] = part
    return set(clean)

In [8]:
import signal

# Define the handler function to raise an exception
def handler(signum, frame):
    """Signal handler that aborts the interrupted code by raising.

    Args:
        signum: Number of the delivered signal (unused).
        frame: Stack frame that was interrupted (unused).

    Raises:
        TimeoutError: Always. It is a subclass of Exception, so callers
            that catch Exception still intercept it.
    """
    raise TimeoutError("Function execution took too long")

# Install `handler` for SIGALRM so that an alarm scheduled with signal.alarm()
# (see get_mmr) raises an exception inside whatever code is running at that moment.
signal.signal(signal.SIGALRM, handler)

<Handlers.SIG_DFL: 0>

In [9]:
# from termcolor import colored
def get_mmr(doc, alpha):
	"""Build an extractive summary with Maximal Marginal Relevance.

	Every sentence is scored by its similarity to the remaining sentences.
	Sentences are then picked greedily, each time maximising
	``alpha * score - (1 - alpha) * similarity_to_already_selected``, until
	about 20% of the sentence count has been selected.

	The work is bounded by a 60-second SIGALRM alarm.

	Args:
		doc: Iterable of text strings making up the document.
		alpha: Weight of relevance versus redundancy; 1 ignores redundancy.

	Returns:
		List of the selected original sentences (stripped), in selection
		order. An empty list is returned when anything fails, including
		the timeout.
	"""
	try:
		# set an alarm for 60 seconds
		signal.alarm(60)
		sentences = []
		clean = []
		originalSentenceOf = {}

		cleanSet = get_sentences(doc, sentences, clean, originalSentenceOf)

		# Relevance of each distinct sentence against all other sentences.
		# dict.fromkeys de-duplicates while keeping first-appearance order,
		# so repeated sentences are not rescored.
		scores = {}
		for data in dict.fromkeys(clean):
			temp_doc = cleanSet - set([data])
			scores[data] = calculateSimilarity(data, list(temp_doc))

		# Summary budget: 20% of the sentence count. n is a float and the
		# loop runs while it is positive, so a fractional budget rounds up.
		n = 20 * len(sentences) / 100
		summarySet = []
		while n > 0:
			mmr = {}
			for sentence in scores.keys():
				if sentence not in summarySet:
					mmr[sentence] = alpha * scores[sentence] - (1-alpha) * calculateSimilarity(sentence, summarySet)
			if mmr == {}:
				break
			selected = max(mmr.items(), key=operator.itemgetter(1))[0]
			summarySet.append(selected)
			n -= 1

		return [originalSentenceOf[sentence].strip() for sentence in summarySet]
	except Exception:
		# Deliberate best effort: a failure or timeout yields an empty summary.
		return []
	finally:
		# Always cancel the pending alarm; otherwise it fires later in
		# unrelated code that has no handler for the raised exception.
		signal.alarm(0)

In [10]:
import wandb

# Look up version v0 of the 'multi_x_science' dataset artifact through the W&B API
# and download its 'test.csv' entry; `path_to_file` holds the returned local path.
api = wandb.Api()
artifact = api.artifact('ire-shshsh/mdes/multi_x_science:v0', type='dataset')
path_to_file = artifact.get_path('test.csv').download()

In [11]:
# Display the local path of the downloaded test.csv.
path_to_file

'./artifacts/multi_x_science:v0/test.csv'

In [12]:
# import pandas as pd
# df = pd.read_csv(path_to_file)
# df['documents'] = df['documents'].apply(lambda x: eval(x))
# df['concat_doc'] = df['documents'].apply(lambda x: concat(x))
# df.head()

In [None]:
import os
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

# dir_path = 'multi_news/'
# dir_path = './'
# files = ['sample_train.csv', 'sample_validation.csv', 'sample_test.csv']
# files = ['test.csv']

run = wandb.init(entity='ire-shshsh', project='mmr', job_type='mmr')

# Input and intermediate columns that are not written to the output CSVs.
dropped_columns = ['abstracts', 'concat_doc', 'cite_N']

try:
    # Reading and parsing the dataset does not depend on alpha, so do it once.
    source_df = pd.read_csv(path_to_file)
    # SECURITY NOTE: eval() executes whatever expression the CSV cell holds.
    # This is only acceptable for a trusted file; if the column contains plain
    # Python literals, ast.literal_eval is the safe replacement.
    source_df['abstracts'] = source_df['abstracts'].progress_apply(lambda x: eval(x))
    source_df['concat_doc'] = source_df['abstracts'].progress_apply(lambda x: concat(x))

    for alpha in [0.2, 0.5, 0.8]:
        # Work on a fresh copy so each alpha starts from the same parsed data.
        df = source_df.copy()
        df['mmr'] = ''
        output_path = f'test_{alpha}.csv'

        # Write the header to the file
        df.iloc[0:0].drop(dropped_columns, axis=1).to_csv(output_path, index=False)

        for i, _ in tqdm(df.iterrows(), total=len(df)):
            df.at[i, 'mmr'] = get_mmr(df.at[i, 'concat_doc'], alpha)
            # get_mmr returns [] on failure or timeout; such rows are left
            # out of the output file.
            if df.at[i, 'mmr'] == []:
                continue

            # `i` is an index label from iterrows(), so select with .loc.
            row = df.loc[i].drop(dropped_columns)

            # Save the current row to the file
            row.to_frame().T.to_csv(output_path, mode='a', header=False, index=False)

        # NOTE(review): the artifact is named 'multi_news_test_*' although the
        # data comes from the 'multi_x_science' artifact — confirm the name.
        artifact = wandb.Artifact(name=f'multi_news_test_{alpha}', type='dataset')
        artifact.add_file(output_path)
        run.log_artifact(artifact)
finally:
    # Close the W&B run even when processing fails part-way.
    wandb.finish()

In [16]:

#print str(time.time() - start)
	
# print ('\nSummary:\n')
# for sentence in summarySet:
# 	print (originalSentenceOf [sentence].lstrip(' '))
# print()

# print '============================================================='
# print '\nOriginal Passages:\n'
# from termcolor import colored

# for sentence in clean:
# 	if sentence in summarySet:
# 		print colored(originalSentenceOf[sentence].lstrip(' '), 'red')
# 	else:
# 		print originalSentenceOf[sentence].lstrip(' ')
	