In [2]:
import networkx as nx
import nltk
nltk.download('stopwords')
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

import pandas as pd
import tqdm

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\peeyu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:

def textrank(document, top_n=5, threshold=0.2):
    """Build an extractive summary of ``document`` with a TextRank-style ranking.

    The text is split into sentences, each sentence is turned into a TF-IDF
    vector (English stop words removed), and an undirected graph is built in
    which two sentences are linked when their cosine similarity exceeds
    ``threshold``. PageRank scores on that graph decide which sentences are kept.

    Args:
        document: The text to summarise.
        top_n: Maximum number of sentences in the summary (non-negative).
            Defaults to 5.
        threshold: Cosine similarity a sentence pair must exceed to be linked.
            Defaults to 0.2.

    Returns:
        The selected sentences joined by single spaces, highest score first
        (equal scores fall back to comparing the sentence text, because
        ``(score, sentence)`` tuples are sorted). Sentences that received no
        edge are not ranked and never appear. An empty string is returned when
        the document is empty or yields no usable vocabulary.
    """
    # Nothing to rank: return early instead of letting TfidfVectorizer raise
    # on an empty corpus.
    if not document or not document.strip():
        return ''

    sentences = sent_tokenize(document)
    if not sentences:
        return ''

    vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
    try:
        X = vectorizer.fit_transform(sentences)
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no sentence
        # contains a token that survives stop-word removal.
        return ''

    # One pairwise computation for all sentences instead of one
    # cosine_similarity call per (i, j) pair.
    similarity = cosine_similarity(X)
    rows, cols = (similarity > threshold).nonzero()

    # Only edges are added, so a sentence becomes a node only if it is similar
    # to some sentence. The diagonal is kept on purpose: a sentence with a
    # non-zero vector links to itself, exactly as the pairwise loop did.
    graph = nx.Graph()
    graph.add_edges_from(zip(rows.tolist(), cols.tolist()))

    ranks = nx.pagerank(graph)

    # Sentences that never entered the graph have no score and are skipped.
    ranked_sentences = sorted(
        ((ranks[i], s) for i, s in enumerate(sentences) if i in ranks),
        reverse=True,
    )

    # Slicing already copes with documents shorter than top_n.
    return ' '.join(s for _, s in ranked_sentences[:top_n])



In [4]:
from datasets import load_dataset

# Load the "fulltext" configuration of CORD-19. The config is passed by keyword
# because the third positional parameter of load_dataset is `data_dir`, not a
# column selector; the former stray "abstract" argument was therefore being
# interpreted as a data directory. The 'abstract' column is already part of
# this configuration.
raw_datasets = load_dataset("allenai/cord19", name="fulltext", trust_remote_code=True)

In [5]:
import numpy as np
def select_datapints(dataset):
    """Flag whether a record has both a non-empty full text and abstract.

    Despite the parameter name, the argument is indexed like a single record:
    its 'fulltext' and 'abstract' entries are measured with ``len``. The
    result is stored under the 'select' key of the same mapping, which is
    then returned.
    """
    if len(dataset['fulltext']) == 0:
        # The abstract is deliberately not inspected when the full text is empty.
        keep = False
    else:
        keep = len(dataset['abstract']) != 0
    dataset['select'] = keep
    return dataset

# Add the boolean 'select' column to every training record ...
train_split = raw_datasets['train'].map(select_datapints)
raw_datasets['train'] = train_split

# ... then keep only the rows where the flag is set.
select_flags = np.array(train_split['select'])
indices_required = np.where(select_flags == True)[0]
raw_datasets['train'] = train_split.select(indices_required)

# Display the filtered DatasetDict.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['cord_uid', 'sha', 'source_x', 'title', 'doi', 'abstract', 'publish_time', 'authors', 'journal', 'url', 'fulltext', 'select'],
        num_rows: 105097
    })
})

In [6]:
# Drop the metadata columns that the summarisation experiment does not use;
# 'abstract', 'fulltext' and the 'select' flag are left in place.
raw_datasets = raw_datasets.remove_columns(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'publish_time', 'authors', 'journal', 'url'])

import gc
gc.collect()

from datasets import DatasetDict

# Fixed seed: without it both splits are reshuffled on every run, so the
# evaluation subset (and any score computed on it) would not be reproducible.
SPLIT_SEED = 42

# 80% train / 20% held out.
train_testvalid = raw_datasets['train'].train_test_split(test_size=0.2, seed=SPLIT_SEED)

# Further split the test+validation into test and validation sets evenly
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)

gc.collect()

# Only the 'test' half of the held-out data is kept under this name from here on.
test_valid = test_valid['test']

In [7]:
import rouge

def get_rouge_scores(document):
    """Summarise a record's full text and score the summary against its abstract.

    Args:
        document: A mapping with 'fulltext' and 'abstract' text entries.

    Returns:
        The same mapping, with one extra entry per ROUGE metric ('rouge-1',
        'rouge-2', 'rouge-l') holding the F-score of the generated summary
        against the abstract. When no summary can be produced, the three
        scores are set to 0.0.
    """
    # Imports and the summariser are kept local to the function — presumably so
    # that it stays self-contained when run in worker processes
    # (it is mapped with num_proc); confirm before hoisting them.
    import rouge
    from nltk.tokenize import sent_tokenize
    import networkx as nx
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    from nltk.corpus import stopwords

    def textrank(document):
        """Return up to five top-ranked sentences of ``document``, or '' if none."""
        # Nothing to rank: avoid TfidfVectorizer raising on an empty corpus.
        if not document or not document.strip():
            return ''

        sentences = sent_tokenize(document)
        if not sentences:
            return ''

        vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
        try:
            X = vectorizer.fit_transform(sentences)
        except ValueError:
            # "empty vocabulary": no token survived stop-word removal.
            return ''

        # One pairwise computation instead of one call per sentence pair.
        similarity = cosine_similarity(X)
        rows, cols = (similarity > 0.2).nonzero()

        # Only edges are added, so a sentence becomes a node only when it is
        # similar to some sentence (itself included, via the diagonal).
        graph = nx.Graph()
        graph.add_edges_from(zip(rows.tolist(), cols.tolist()))

        ranks = nx.pagerank(graph)

        # Sentences that never entered the graph have no score and are skipped.
        ranked_sentences = sorted(
            ((ranks[i], s) for i, s in enumerate(sentences) if i in ranks),
            reverse=True,
        )

        # Slicing already copes with documents shorter than five sentences.
        return ' '.join(s for _, s in ranked_sentences[:5])

    generated_text = textrank(document['fulltext'])

    if not generated_text.strip():
        # An empty hypothesis cannot be scored; record zeros so every record
        # still carries the same three score columns instead of aborting the
        # whole run on one unusable document.
        for metric in ('rouge-1', 'rouge-2', 'rouge-l'):
            document[metric] = 0.0
        return document

    sentence_rouge_score = rouge.Rouge().get_scores(generated_text,
                                                    document['abstract'])[0]
    # Keep only the F-score of each metric.
    for rog in sentence_rouge_score:
        document[rog] = sentence_rouge_score[rog]['f']
    return document



In [None]:
# Score every held-out record using 16 worker processes.
test = test_valid.map(get_rouge_scores, num_proc=16)

# Mean F-score per ROUGE metric, displayed as a (rouge-1, rouge-2, rouge-l) tuple.
tuple(np.mean(test[metric]) for metric in ('rouge-1', 'rouge-2', 'rouge-l'))
