In [18]:
import pandas as pd

# Execute this cell to generate Bugfree_fulltrain.csv from the base fulltrain.csv. The cleaned
# file is used either as direct input, or to generate the summarized articles for the
# transformer-based classifiers.

# Read with header=None like the other loads in this cell. The default header inference would
# consume the first article as column names and let it bypass the label check below.
df = pd.read_csv('fulltrain.csv', header=None, names=['label', 'text'])

# Keep only rows whose label parses as a number: anything else is coerced to NaN and dropped.
df = df.assign(label=pd.to_numeric(df['label'], errors='coerce')).dropna(subset=['label'])
# Coercing to NaN can turn the column into floats; store the labels as integers again.
df = df.astype({'label': int})

# Write without a header row so the file matches the header=None read right below.
df.to_csv('Bugfree_fulltrain.csv', index=False, header=False)

train_data = pd.read_csv('Bugfree_fulltrain.csv', header=None, names=['label', 'text'])
test_data = pd.read_csv('balancedtest.csv', header=None, names=['label', 'text'])

X_train = train_data['text']
y_train = train_data['label']
# Shift all labels down by one, so that the range is now [0, 3] for classification.
# NOTE(review): `-=` works in place on the Series selected from train_data / test_data; depending
# on the pandas copy-on-write setting this may also shift the 'label' column of those frames.
y_train -= 1
X_test = test_data['text']
y_test = test_data['label']
y_test -= 1

In [19]:
import gensim.downloader as api

# Pretrained Google News word2vec embeddings, used to turn sentences into vectors.
W2V_MODEL_NAME = "word2vec-google-news-300"
w2v_model = api.load(W2V_MODEL_NAME)

In [22]:
import re
import spacy
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# spaCy pipeline; get_sentences() below uses it to split an article into sentences.
nlp = spacy.load("en_core_web_md")

def get_word_embeddings(sentences):
    """Embed each sentence as the mean of its words' pretrained word vectors.

    Args:
        sentences: iterable of strings; each one is split on whitespace into words.

    Returns:
        A list with one 1-D numpy vector per input sentence, in input order.
        Words that are not in ``w2v_model`` are skipped; a sentence without any
        known word is represented by a zero vector.
    """
    # Take the dimensionality from the model instead of hard-coding 300, so the
    # zero-vector fallback always has the same length as the real embeddings.
    dim = w2v_model.vector_size
    results = []
    for sentence in sentences:
        word_vecs = [w2v_model[word] for word in sentence.split() if word in w2v_model]
        if word_vecs:
            results.append(np.array(word_vecs).mean(axis=0))
        else:
            results.append(np.zeros(dim))

    return results

def preprocess(sentences):
    """Normalize the whitespace of every sentence.

    Each run of whitespace (spaces, tabs, newlines) is collapsed into a single
    space, then leading and trailing whitespace is removed.

    Args:
        sentences: iterable of strings.

    Returns:
        A new list with the cleaned sentences, in input order.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    return [re.sub(r"\s+", " ", s).strip() for s in sentences]

def get_sentences(article):
    """Split an article into sentences with the spaCy pipeline ``nlp``.

    Returns a list of sentence strings, each with leading and trailing
    whitespace stripped, in document order.
    """
    parsed = nlp(article)
    stripped = []
    for span in parsed.sents:
        stripped.append(str(span).strip())
    return stripped

def top_5_summarize(article, k=5):
    """Build an extractive summary from the k most "central" sentences.

    The article is split into sentences, each sentence is embedded, and every
    sentence is scored by the sum of its row in the pairwise cosine-similarity
    matrix. The k highest-scoring sentences are returned in their original order.

    Args:
        article: the article text.
        k: number of sentences to keep (default 5).

    Returns:
        The article unchanged if it has fewer than k sentences; otherwise the
        selected sentences joined by single spaces.

    Raises:
        ValueError: if k is smaller than 1.
    """
    if k < 1:
        # A slice of [-0:] would silently select every sentence.
        raise ValueError("k must be at least 1")

    sentences = get_sentences(article)
    if len(sentences) < k:
        return article

    # Embed the whitespace-normalized sentences.
    embeddings = get_word_embeddings(preprocess(sentences))

    # Pairwise cosine similarities; row x holds sim(x, a), sim(x, b), sim(x, c), ...
    similarities = cosine_similarity(embeddings)
    summed_similarities = np.sum(similarities, axis=1)

    # Indices of the k highest-scoring sentences, re-sorted so the summary
    # keeps the sentence order of the article.
    top_k_indices = np.argsort(summed_similarities)[-k:]
    sorted_top_k = np.sort(top_k_indices)

    # Use the non-preprocessed sentences for the summary. They were stripped, so
    # join with a space; joining with "" would fuse the last word of one
    # sentence with the first word of the next.
    return " ".join(sentences[i] for i in sorted_top_k)

In [23]:
# Generate the summarized datasets: top_5_summarize is called once per article,
# first for the training texts and then for the test texts.

X_train_summ = X_train.apply(top_5_summarize)

X_test_summ = X_test.apply(top_5_summarize)

In [24]:
# Pair every summarized article with its class label and write both splits to CSV.
# The `keys` argument names the two resulting columns directly.
merged_train_summ = pd.concat([X_train_summ, y_train], axis=1, keys=['text', 'label'])
merged_test_summ = pd.concat([X_test_summ, y_test], axis=1, keys=['text', 'label'])

merged_train_summ.to_csv('merged_summarized_fulltrain.csv', index=False)
merged_test_summ.to_csv('merged_summarized_test.csv', index=False)