In [1]:
from allennlp.predictors.predictor import Predictor
# Archive of the pretrained SpanBERT-large coreference model.
model_url = "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz"

# Build the predictor from that archive; it is reused by every later cell.
predictor = Predictor.from_path(archive_path=model_url)

error loading _jsonnet (this is expected on Windows), treating C:\Users\srina\AppData\Local\Temp\tmpvz58h3ko\config.json as plain json
Some weights of BertModel were not initialized from the model checkpoint at SpanBERT/spanbert-large-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
# Standard library
import re

# Third-party
import numpy as np
import pandas as pd
import rouge
from nltk import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer

# Shared scorer configured for unigram overlap only (reported as "rouge-1").
ROUGE = rouge.Rouge(metrics=['rouge-n'], max_n=1)

In [3]:
# Load the title/abstract pairs; rows are terminated by '\n' only.
df = pd.read_csv(
    filepath_or_buffer='datasets/SummarizationCSTitleAbstract03.csv',
    lineterminator='\n',
)

In [4]:
# Drop the 'Unnamed: 0' column (presumably an index saved with the CSV —
# confirm). errors='ignore' keeps this cell re-runnable: without it a second
# execution raises KeyError because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Restrict the experiment to the first 100 rows.
df = df.head(100)
df.head()

Unnamed: 0,title,abstract
0,A Microbiological Survey and Characterization,"In our study, two dairy compost heaps and one ..."
1,eb 2 00 6 Non-Commutative Formal Groups in Pos...,We describe geometric non-commutative formal g...
2,An Alternating Mesh Quality Metric Scheme for ...,In the numerical solution of partial different...
3,Researching Distance Learning Experiences Usin...,Qualitative case study is hardly a research te...
4,Un Motor de Transformación de Modelos con Sopo...,Resumen. En la actualidad están apareciendo un...


In [5]:
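# NOTE: Disabled Stanford CoreNLP coreference pipeline, kept for reference only.
# summarize_text uses the AllenNLP predictor for anaphora resolution instead.
# import json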
# from pycorenlp import StanfordCoreNLP

# nlp = StanfordCoreNLP('http://localhost:9000')


# def resolve(corenlp_output):
#     """ Transfer the word form of the antecedent to its associated pronominal anaphor(s) """
#     for coref in corenlp_output['corefs']:
#         mentions = corenlp_output['corefs'][coref]
#         antecedent = mentions[0]  # the antecedent is the first mention in the coreference chain
#         for j in range(1, len(mentions)):
#             mention = mentions[j]
#             if mention['type'] == 'PRONOMINAL':
#                 # get the attributes of the target mention in the corresponding sentence
#                 target_sentence = mention['sentNum']
#                 target_token = mention['startIndex'] - 1
#                 # transfer the antecedent's word form to the appropriate token in the sentence
#                 corenlp_output['sentences'][target_sentence - 1]['tokens'][target_token]['word'] = antecedent['text']


# def get_resolved(corenlp_output):
#     """Return the "resolved" output sentence"""
#     possessives = ['hers', 'his', 'their', 'theirs']
#     output_sentence = ""  # Empty string to accumulate the output sentence
    
#     for sentence in corenlp_output['sentences']:
#         for token in sentence['tokens']:
#             output_word = token['word']
#             if token['lemma'] in possessives or token['pos'] == 'PRP$':
#                 output_word += "'s"
#             output_word += token['after']
#             output_sentence += output_word  # Append the output_word to the output_sentence
    
#     return output_sentence

In [8]:
# Sentence boundary: whitespace that follows '.', '?' or '!', except after
# abbreviation-like contexts such as "e.g." or "Dr.". Compiled once instead of
# being re-parsed for every row.
_SENTENCE_BOUNDARY = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s')


def _split_sentences(text):
    """Split ``text`` into sentences, dropping fragments that contain no letters."""
    return [
        fragment
        for fragment in _SENTENCE_BOUNDARY.split(text)
        if any(char.isalpha() for char in fragment)
    ]


def _best_ngram(sentences, n, vocabulary, row_scores):
    """Return the run of ``n`` consecutive words with the highest tf-idf sum.

    Words are looked up lower-cased in ``vocabulary`` (term -> column index);
    a word that is not in the vocabulary contributes 0. Ties keep the earliest
    candidate. Returns ``None`` when no sentence has at least ``n`` words.
    """
    best_text = None
    best_score = float('-inf')
    for sentence in sentences:
        # split() (not split(' ')) so repeated whitespace never yields empty
        # "words" that would silently occupy n-gram slots.
        words = sentence.split()
        word_scores = []
        for word in words:
            column = vocabulary.get(word.lower())
            # An unknown word must score 0. The previous default of -1 was
            # used as an array index and picked up the LAST feature's score.
            word_scores.append(0.0 if column is None else row_scores[column])
        for start in range(len(words) - n + 1):
            candidate = sum(word_scores[start:start + n])
            if candidate > best_score:
                best_score = candidate
                best_text = ' '.join(words[start:start + n])
    return best_text


def summarize_text(dataframe):
    """Generate an extractive title for every abstract and score it with ROUGE-1.

    For each row the abstract is coreference-resolved with the module-level
    ``predictor``, split into sentences, and the n-gram (n = number of words in
    the row's title) with the highest summed tf-idf weight is taken as the
    summary. The summary is then compared against the real title using the
    module-level ``ROUGE`` scorer.

    Note that tf-idf weights are fitted on the original abstracts, while the
    candidate n-grams come from the coreference-resolved text.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'title' and 'abstract' columns. Missing abstracts are
        replaced by a placeholder sentence. Titles are assumed to be
        non-missing strings (NOTE(review): confirm against the dataset).
        The frame is modified in place; any index is accepted because rows
        are processed by position.

    Returns
    -------
    pandas.DataFrame
        The same frame with three added columns: 'resolved abstract',
        'summary' and 'rouge-1 score' (a ``[precision, recall, f1]`` list).
    """
    # Nothing to vectorise or score: still return the expected columns.
    if len(dataframe) == 0:
        for column in ('resolved abstract', 'summary', 'rouge-1 score'):
            dataframe[column] = []
        return dataframe

    # Filling in NaN values
    dataframe['abstract'] = dataframe['abstract'].fillna('This abstract does not exist')

    # tf-idf weights; row k of the matrix belongs to the k-th row of the frame.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(dataframe['abstract'])
    vocabulary = vectorizer.vocabulary_

    resolved_abstracts = []
    summaries = []
    scores = []
    rows = zip(dataframe['abstract'].tolist(), dataframe['title'].tolist())
    # Iterate by position so the frame's rows stay aligned with the tf-idf
    # matrix even when the index is not 0..n-1 (e.g. after filtering).
    for position, (abstract, title) in enumerate(rows):
        # coref_resolved runs the model itself, so a separate predict() call
        # would only double the inference cost.
        resolved = predictor.coref_resolved(abstract)
        resolved_abstracts.append(resolved)

        row_scores = tfidf_matrix[position].toarray().flatten()
        title_length = len(title.split())

        summary = _best_ngram(
            _split_sentences(resolved), title_length, vocabulary, row_scores
        )
        if summary is None:
            # No sentence is long enough for a title-sized n-gram: fall back
            # to the leading words instead of reusing the previous row's
            # summary (or raising NameError on the first row).
            summary = ' '.join(resolved.split()[:title_length])
        summaries.append(summary)

        # Rouge-1 score calculation
        rouge_1 = ROUGE.get_scores(summary, title)["rouge-1"]
        scores.append([rouge_1["p"], rouge_1["r"], rouge_1["f"]])

    dataframe['resolved abstract'] = resolved_abstracts
    dataframe['summary'] = summaries
    dataframe['rouge-1 score'] = scores

    return dataframe

In [9]:
# Run coreference resolution, title extraction and ROUGE-1 scoring on every row.
df = summarize_text(dataframe=df)

  num_effective_segments = (seq_lengths + self._max_length - 1) // self._max_length


In [10]:
# Display the frame, now including the columns added by summarize_text.
df

Unnamed: 0,title,abstract,resolved abstract,summary,rouge-1 score
0,A Microbiological Survey and Characterization,"In our study, two dairy compost heaps and one ...","In our study, two dairy compost heaps and one ...",and VRE in compost were,"[0.2, 0.2, 0.20000000000000004]"
1,eb 2 00 6 Non-Commutative Formal Groups in Pos...,We describe geometric non-commutative formal g...,We describe geometric non-commutative formal g...,formal groups in terms of a geometric commutat...,"[0.4, 0.36363636363636365, 0.380952380952381]"
2,An Alternating Mesh Quality Metric Scheme for ...,In the numerical solution of partial different...,In the numerical solution of partial different...,inefficient mesh quality metric with a more ef...,"[0.5454545454545454, 0.5454545454545454, 0.545..."
3,Researching Distance Learning Experiences Usin...,Qualitative case study is hardly a research te...,Qualitative case study is hardly a research te...,is typical of most qualitative research to emp...,"[0.3125, 0.35714285714285715, 0.3333333333333333]"
4,Un Motor de Transformación de Modelos con Sopo...,Resumen. En la actualidad están apareciendo un...,Resumen. En la actualidad están apareciendo un...,la arquitectura de un motor de transformación ...,"[0.6428571428571429, 0.6428571428571429, 0.642..."
...,...,...,...,...,...
95,ExoMol molecular line lists-XXVI : spectra of ...,Line lists for the sulphur-containing molecule...,Line lists for the sulphur-containing molecule...,NS calculated line list includes around 2.8 mi...,"[0.36363636363636365, 0.4, 0.380952380952381]"
96,Multiround Private Information Retrieval: Capa...,Private information retrieval (PIR) is the pro...,Private information retrieval (PIR) is the pro...,The capacity of PIR has recently been characte...,"[0.125, 0.125, 0.125]"
97,Efficient Integral Equation Algorithms and The...,Efficient Integral Equation Algorithms and The...,Efficient Integral Equation Algorithms and The...,frequency domain surface integral equation pro...,"[0.3, 0.3, 0.3]"
98,Are the Determinants of Markup Size Industry-S...,The aim of this paper is to identify factors t...,The aim of this paper is to identify factors t...,affect the pricing policy in Slovenian manufac...,"[0.5384615384615384, 0.5, 0.5185185185185186]"


In [11]:
# Original abstract of the row labelled 98.
print(df.loc[98, 'abstract'])

The aim of this paper is to identify factors that affect the pricing policy in Slovenian manufacturing firms in terms of the markup size and, most of all, to explicitly account for the possibility of differences in pricing procedures among manufacturing industries. Accordingly, the analysis of the dynamic panel is carried out on an industry-by-industry basis, allowing the coefficients on the markup determinants to vary across industries. We find that the oligopoly theory of markup determination for the most part holds for the manufacturing sector as a whole, although large variability in markup determinants exists across industries within the Slovenian manufacturing. Our main conclusion is that each industry should be investigated separately in detail in order to assess the precise role of markup factors in the markup-determination process.


In [12]:
# Extracted n-gram summary for the row labelled 98.
print(df.loc[98, 'summary'])

affect the pricing policy in Slovenian manufacturing firms in terms of the markup


In [13]:
# Reference title the summary of row 98 is scored against.
print(df.loc[98, 'title'])

Are the Determinants of Markup Size Industry-Specific? The Case of Slovenian Manufacturing Firms


In [14]:
# Average precision, recall and F1 over all per-row ROUGE-1 triples.
rouge_triples = df['rouge-1 score']
total = len(rouge_triples)

p_sum = r_sum = f1_sum = 0
for precision, recall, f_measure in rouge_triples:
    p_sum += precision
    r_sum += recall
    f1_sum += f_measure

avg = [running / total for running in (p_sum, r_sum, f1_sum)]
print(avg)

[0.26607181165927296, 0.2665639314432894, 0.26567089086657686]


In [15]:
# Save the frame with its added columns to CSV, leaving out the index.
df.to_csv(path_or_buf='combined.csv', index=False)