In [13]:
# Import the PorterStemmer from nltk
from nltk.stem import PorterStemmer

# Create an instance of PorterStemmer
stemmer = PorterStemmer()

# Define a function to clean and stem the data
def cleanData(sentence):
    """Return ``sentence`` with every whitespace-separated word stemmed.

    Args:
        sentence (str): Raw sentence text.

    Returns:
        str: The stemmed words joined by single spaces. An empty or
        whitespace-only input yields ``''``.
    """
    # PorterStemmer.stem() expects ONE word. Passing the whole sentence makes
    # the stemmer treat it as a single token, so the individual words are
    # never stemmed. Stem word by word instead.
    return ' '.join(stemmer.stem(word) for word in sentence.split())

In [15]:
# Import the necessary libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Define a function to calculate the cosine similarity between a sentence and a document
def calculateSimilarity(sentence, doc):
    """Cosine similarity between ``sentence`` and the concatenation of ``doc``.

    Both texts are turned into term-count vectors over a shared vocabulary
    made of the whitespace-separated words of ``sentence`` and of every entry
    of ``doc``.

    Args:
        sentence (str): The sentence to score.
        doc (list[str]): Sentences forming the reference document.

    Returns:
        ``0`` when ``doc`` is empty or when neither text contains any word;
        otherwise the cosine similarity of the two count vectors.
    """
    # Nothing to compare against.
    if not doc:
        return 0

    # A dict is used as an insertion-ordered set of words.
    # NOTE: the sentence must be split into words. Iterating the string
    # directly walks over its characters, which leaves the sentence's own
    # words out of the vocabulary and inflates the similarity.
    vocab = dict.fromkeys(sentence.split())
    for t in doc:
        vocab.update(dict.fromkeys(t.split()))

    # CountVectorizer rejects an empty vocabulary; two texts without any
    # word share nothing, so report zero similarity instead of raising.
    if not vocab:
        return 0

    docInOneSentence = ' '.join(doc)

    # With a fixed vocabulary no fitting is needed, so both texts are
    # vectorized in a single transform call.
    # NOTE(review): CountVectorizer applies its own tokenizer to the texts, so
    # vocabulary entries it cannot produce (e.g. words with attached
    # punctuation) simply count as zero.
    cv = CountVectorizer(vocabulary=list(vocab))
    vectors = cv.transform([docInOneSentence, sentence])

    return cosine_similarity(vectors[0:1], vectors[1:2])[0][0]

In [None]:
# Define a function to concatenate and clean a list of strings
def concat(x):
    """Merge a list of strings and return its trimmed lines.

    The strings are joined with single spaces, the result is cut at every
    newline, lines that consist of exactly one space are discarded and the
    surviving lines are stripped of surrounding whitespace.

    Args:
        x (list[str]): Text fragments to merge.

    Returns:
        list[str]: The stripped lines, in their original order.
    """
    merged = ' '.join(x)
    return [line.strip() for line in merged.split('\n') if line != ' ']

In [None]:
# Define a function to get sentences from a text, clean them, and map them to their original form
def get_sentences(texts, sentences, clean, originalSentenceOf):
    """Split ``texts`` into sentences and record raw and cleaned forms.

    The text is split at every period. Each non-blank fragment is cleaned
    with ``cleanData`` and recorded in the three output containers, which are
    mutated in place.

    Args:
        texts (str): The document text.
        sentences (list): Receives the raw fragments, in order.
        clean (list): Receives the cleaned fragments, in the same order.
        originalSentenceOf (dict): Receives ``cleaned -> raw`` entries; when
            two fragments clean to the same string the later one wins.

    Returns:
        set: The distinct entries of ``clean``.
    """
    for part in texts.split('.'):
        # Splitting on '.' leaves empty / whitespace-only fragments (always
        # one after a trailing period). They are not sentences and must not
        # be counted or selected for a summary.
        if not part.strip():
            continue

        cl = cleanData(part)
        sentences.append(part)
        clean.append(cl)
        originalSentenceOf[cl] = part

    # A set drops duplicate cleaned sentences.
    return set(clean)

In [7]:
# import signal

# def handler(signum, frame):
#     raise Exception("Function execution took too long")

# signal.signal(signal.SIGALRM, handler)

<Handlers.SIG_DFL: 0>

In [18]:
# Import the signal module
import signal

# Define a handler function that raises an exception when called
def handler(signum, frame):
    """Signal handler that aborts the running code with a timeout error.

    Args:
        signum: Number of the delivered signal (unused).
        frame: Stack frame that was interrupted (unused).

    Raises:
        TimeoutError: Always. It is a subclass of ``Exception``, so code that
            catches ``Exception`` keeps working.
    """
    raise TimeoutError("Function execution took too long")

# Install `handler` for SIGALRM, so an alarm scheduled with signal.alarm()
# interrupts the running code by raising from `handler`.
# signal.signal() returns the previously installed handler, which is what the
# cell prints as its output.
# NOTE: SIGALRM is only available on Unix platforms.
signal.signal(signal.SIGALRM, handler)

<Handlers.SIG_DFL: 0>

In [20]:
# Import the icecream module for debugging
from icecream import ic
import operator

# Define a function to get the Maximal Marginal Relevance (MMR)
def get_mmr(doc, alpha):
    """Build an extractive summary of ``doc`` by Maximal Marginal Relevance.

    Every sentence is scored by its similarity to the remaining sentences of
    the document. Sentences are then picked greedily, each time taking the one
    that maximises
    ``alpha * score - (1 - alpha) * similarity(sentence, summary so far)``.

    Args:
        doc (str): Text of the document to summarise.
        alpha (float): Weight of relevance against redundancy.

    Returns:
        list[str]: The selected sentences in selection order, stripped of
        surrounding whitespace. An empty list is returned when anything goes
        wrong, including the 60-second time limit being hit; callers treat
        ``[]`` as "skip this document".
    """
    # signal.alarm does not exist on every platform; without it the function
    # simply runs with no time limit instead of failing on every call.
    can_time_out = hasattr(signal, 'alarm')
    try:
        # Abort (through the SIGALRM handler) if this takes over 60 seconds.
        if can_time_out:
            signal.alarm(60)

        # Raw sentences, cleaned sentences and the cleaned -> raw mapping.
        sentences = []
        clean = []
        originalSentenceOf = {}
        cleanSet = get_sentences(doc, sentences, clean, originalSentenceOf)

        # Relevance of each sentence: similarity to the rest of the document.
        scores = {}
        for data in clean:
            temp_doc = cleanSet - {data}
            scores[data] = calculateSimilarity(data, list(temp_doc))

        # Summary budget: 40% of the sentence count. The value is a float and
        # is counted down by 1 per pick, so it is effectively rounded up.
        n = 20 * len(sentences) / 50

        summarySet = []
        while n > 0:
            # MMR of every sentence that is not in the summary yet.
            mmr = {
                sentence: alpha * scores[sentence]
                - (1 - alpha) * calculateSimilarity(sentence, summarySet)
                for sentence in scores
                if sentence not in summarySet
            }

            # Every sentence has been used already.
            if not mmr:
                break

            selected = max(mmr.items(), key=operator.itemgetter(1))[0]
            summarySet.append(selected)
            n -= 1

        # Map the cleaned sentences back to their original wording.
        return [originalSentenceOf[sentence].strip() for sentence in summarySet]
    except Exception:
        # Deliberate best effort: a failure or a timeout yields no summary.
        return []
    finally:
        # Always cancel the pending alarm. Otherwise it fires after this
        # function has returned and raises in unrelated code.
        if can_time_out:
            signal.alarm(0)

In [None]:
# Import the wandb module for experiment tracking
# import wandb

# Create an API object to interact with the Weights & Biases service
# api = wandb.Api()

# Get the artifact named 'ire-shshsh/mdes/multi_x_science:v0' of type 'dataset'
# artifact = api.artifact('ire-shshsh/mdes/multi_x_science:v0', type='dataset')

# Download the file 'test.csv' from the artifact and get the path to the downloaded file
# path_to_file = artifact.get_path('test.csv').download()

In [10]:
# path_to_file = './our_dataset - Sheet1.csv'
# Location of the input CSV (a relative path); it is loaded with pd.read_csv
# in a later cell. The bare expression below echoes the value as cell output.
path_to_file = '../our_dataset.csv'
path_to_file

'../our_dataset.csv'

In [11]:
# import pandas as pd
# df = pd.read_csv(path_to_file)
# df['documents'] = df['documents'].apply(lambda x: eval(x))
# df['concat_doc'] = df['documents'].apply(lambda x: concat(x))
# df.head()

In [12]:
import pandas as pd
from icecream import ic

# Load the dataset, merge the three document columns into one
# space-separated 'concat_doc' column and drop the individual columns.
df = pd.read_csv(path_to_file)
df = df.assign(
    concat_doc=df['doc1'] + ' ' + df['doc2'] + ' ' + df['doc3']
).drop(columns=['doc1', 'doc2', 'doc3'])
df

Unnamed: 0,name,concat_doc
0,IND vs AUS prophecy,The Cricket World Cup semi-final witnessed a c...
1,PM take on Israel-Hamas Conflict,Prime Minister Narendra Modi on Friday condemn...


In [None]:
# Import necessary libraries
import os
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

# Initialize a Weights & Biases run
# run = wandb.init(entity='ire-shshsh', project='mmr', job_type='mmr')

# Loop over different alpha values
for alpha in [0.2, 0.5, 0.8]:
    # Load the data from the CSV file
    # df = pd.read_csv(path_to_file)
    
    # Convert the 'abstracts' column from string to list
    # df['abstracts'] = df['abstracts'].progress_apply(lambda x: eval(x))
    
    # Concatenate the abstracts in each row
    # df['concat_doc'] = df['abstracts'].progress_apply(lambda x: concat(x))
    
    # Concatenate the documents in each row
    # df['concat_doc'] = df['doc1'] + df['doc2'] + df['doc3']

    # Reset the 'mmr' column for this alpha value
    df['mmr'] = ''

    # One output file per alpha value
    out_path = f'test_{alpha}.csv'

    # Columns that are actually written for every row. The header has to be
    # built from the same list, otherwise header and rows do not line up.
    out_columns = [col for col in df.columns if col not in ('concat_doc', 'name')]

    # Write only the header (zero data rows) to start a fresh file
    df.iloc[0:0][out_columns].to_csv(out_path, index=False)

    # Summarise every document and append its row as soon as it is ready
    for i in tqdm(df.index):
        # Calculate the MMR summary for the concatenated document of this row
        df.at[i, 'mmr'] = get_mmr(df.at[i, 'concat_doc'], alpha)
        
        # An empty list means get_mmr failed or timed out: skip this row
        if df.at[i, 'mmr'] == []:
            continue

        # `i` is an index label, so the row is selected with .loc
        row = df.loc[i, out_columns]

        # Append the current row to the file
        row.to_frame().T.to_csv(out_path, mode='a', header=False, index=False)
    
    # Drop the 'concat_doc' and 'name' columns from the DataFrame
    # df.drop(['concat_doc', 'name'], axis=1, inplace=True)
    
    # Save the DataFrame to a CSV file
    # df.to_csv(f'test_{alpha}.csv', index=False)
    
    # Create a Weights & Biases artifact for the CSV file
    # artifact = wandb.Artifact(name=f'multi_news_test_{alpha}', type='dataset')
    
    # Add the CSV file to the artifact
    # artifact.add_file(f'test_{alpha}.csv')
    
    # Log the artifact to the Weights & Biases run
    # run.log_artifact(artifact)

# Finish the Weights & Biases run
# wandb.finish()

In [None]:
# Uncomment to print the time taken by the process
# print str(time.time() - start)

# Uncomment to print the summary
# print ('\nSummary:\n')
# for sentence in summarySet:
# 	print (originalSentenceOf [sentence].lstrip(' '))
# print()

# Print a separator
# print '============================================================='
# print '\nOriginal Passages:\n'

# Import the termcolor module for colored output
# from termcolor import colored

# For each sentence in the cleaned data
# for sentence in clean:
# 	# If the sentence is in the summary set, print it in red
# 	if sentence in summarySet:
# 		print colored(originalSentenceOf[sentence].lstrip(' '), 'red')
# 	# Otherwise, print it in the default color
# 	else:
# 		print originalSentenceOf[sentence].lstrip(' ')