In [34]:
from transformers import pipeline
from rouge import Rouge
import re
rouge = Rouge()

In [2]:
from datasets import load_dataset

In [16]:

def split_into_chunks(text, max_length):
    """
    Splits a string into chunks of text with complete sentences, where each chunk
    has a maximum length of `max_length` characters.
    """
    sentences = re.findall(r'[^\n.!?]+[.!?]', text)  # Split into sentences
    chunks = []
    current_chunk = ''
    
    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_length:
            # If adding the sentence doesn't exceed max_length, add to current chunk
            current_chunk += sentence
        else:
            # If adding the sentence exceeds max_length, start a new chunk
            chunks.append(current_chunk.strip())
            current_chunk = sentence
    
    # Add the last chunk if it's not empty
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks



def get_chunks(input_text):
    max_length = 1025
    chunks = split_into_chunks(input_text, max_length)
    
    summary_temps=[]
    
    for i in chunks:
        summary_temps.append(summarizer(i,max_length=16))
        
    summary_temps_ = [i[0]['summary_text'] for i in summary_temps]
        
    return '. '.join(summary_temps_)

In [29]:
def get_rogue_scores(generated_summary, reference_summary):
    scores = rouge.get_scores(generated_summary, reference_summary)

    # Extract relevant ROUGE scores
    rouge_1 = scores[0]['rouge-1']['f']*100
    rouge_2 = scores[0]['rouge-2']['f']*100
    rouge_l = scores[0]['rouge-l']['f']*100

    return [rouge_1, rouge_2, rouge_l]

In [13]:
data_files = {"test": "1000_test.json"}

dataset = load_dataset("PrathameshPawar/summary_2k", data_files=data_files)


Found cached dataset json (/Users/prathameshpawar/.cache/huggingface/datasets/PrathameshPawar___json/PrathameshPawar--summary_2k-c9ec564ecb7c9e74/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


  0%|          | 0/1 [00:00<?, ?it/s]

In [44]:
test_dataset = dataset['test'].shuffle(seed=2)

In [45]:
test_dataset = test_dataset[:10]

In [46]:
test_dataset

{'topic': ['Drake Passage',
  'Translational regulation',
  'Troubridge Point',
  'University of Madras',
  'Instrumental chemistry',
  'List of mammals displaying homosexual behavior',
  'Tripartite symbiosis',
  'International Union of Crystallography',
  'Animal sexual behaviour',
  'Homosexual behavior in animals'],
 'summary': ['The Drake Passage (referred to as Mar de Hoces ["Hoces Sea"] in Spanish-speaking countries) is the body of water between South America\'s Cape Horn, Chile, Argentina and the South Shetland Islands of Antarctica. It connects the southwestern part of the Atlantic Ocean (Scotia Sea) with the southeastern part of the Pacific Ocean and extends into the Southern Ocean. The passage is named after the 16th-century English explorer and privateer Sir Francis Drake.\nThe Drake Passage is considered one of the most treacherous voyages for ships to make. Currents at its latitude meet no resistance from any landmass, and waves top 40 feet (12 m), hence its reputation as

In [47]:
models  = ['facebook/bart-base','PrathameshPawar/bart_raw','PrathameshPawar/bart_traditional','PrathameshPawar/bart_custom','PrathameshPawar/bart_combined']
    

In [48]:
['PrathameshPawar/pegasus_raw','PrathameshPawar/pegasus_traditional','PrathameshPawar/pegasus_custom','PrathameshPawar/pegasus_combined']

['PrathameshPawar/pegasus_raw',
 'PrathameshPawar/pegasus_traditional',
 'PrathameshPawar/pegasus_custom',
 'PrathameshPawar/pegasus_combined']

In [49]:
test_dataset.keys()

dict_keys(['topic', 'summary', 'content', 'content_traditional', 'custom_approach', 'combined_approach'])

In [24]:
summarizer = pipeline("summarization", model="PrathameshPawar/bart_combined",)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/558M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/344 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [53]:
results = {}
reference_summaries = test_dataset['summary']
topics = test_dataset['topic']

for model in models[3:]:
    mod = model.split('/')[-1]
    results[mod] = {}
    
    if 'raw' in model:
        tests = test_dataset['content']
        summarizer = pipeline("summarization", model=model,)
        
        for n,test in enumerate(tests):
            summary_temp = get_chunks(tests[n])
            scores = get_rogue_scores(summary_temp,reference_summaries[n])
            results[mod][topics[n]] = scores
            
            print("raw_"+str(n)+"is done")
            
        print('raw for model'+mod+ 'is done')
            
    elif 'traditional' in model:
        
        tests = test_dataset['content_traditional']
        summarizer = pipeline("summarization", model=model,)
        
        print(tests[0])
        for n,test in enumerate(tests):
            summary_temp = get_chunks(tests[n])
            print(summary_temp)
            scores = get_rogue_scores(summary_temp,reference_summaries[n])
            results[mod][topics[n]] = scores
            print("traditional_"+str(n)+"is done")
        print('traditional for model'+mod+ 'is done')
            
    elif 'custom' in model:
        tests = test_dataset['custom_approach']
        summarizer = pipeline("summarization", model=model,)
        
        for n,test in enumerate(tests):
            try:
                summary_temp = get_chunks(tests[n])
                scores = get_rogue_scores(summary_temp,reference_summaries[n])
                results[mod][topics[n]] = scores
            except:
                print(tests[n])
            print("custom_"+str(n)+"is done")
        print('custom for model'+mod+ 'is done')
            
    elif 'combined' in model:
        tests = test_dataset['combined_approach']
        summarizer = pipeline("summarization", model=model,)
        
        for n,test in enumerate(tests):
            summary_temp = get_chunks(tests[n])
            scores = get_rogue_scores(summary_temp,reference_summaries[n])
            results[mod][topics[n]] = scores
            print("combined_"+str(n)+"is done")
        print('combined for model'+mod+ 'is done')
    else:
        pass
    
    print(results[mod])
    print(mod)
    
    



custom_0is done
custom_1is done
custom_2is done
custom_3is done
custom_4is done

custom_5is done
custom_6is done
custom_7is done
custom_8is done
custom_9is done
custom for modelbart_customis done
{'Drake Passage': [21.19205254646727, 11.162790268642526, 19.86754923520899], 'Translational regulation': [13.861385797862964, 1.6260159460639116, 11.881187778060983], 'Troubridge Point': [40.90909055785124, 19.999999704800004, 36.3636360123967], 'University of Madras': [18.09523782385488, 11.39240476760536, 18.09523782385488], 'Instrumental chemistry': [20.689654806183118, 3.1249996762695647, 17.241378944114157], 'Tripartite symbiosis': [6.89655139595721, 0.0, 6.89655139595721], 'International Union of Crystallography': [36.363635865123975, 17.647058325259533, 36.363635865123975], 'Animal sexual behaviour': [19.047618636085925, 3.342617982480011, 18.25396784243513], 'Homosexual behavior in animals': [15.189872973277089, 4.142011410577404, 15.189872973277089]}
bart_custom


ValueError: Hypothesis is empty.

In [51]:
test_dataset['custom_approach']

['It has also been shown that present-day distribution of dissolved inorganic carbon can be obtained only with an open Drake Passage.In short, not only the Drake Passage must be open to allow the Antarctic Circumpolar Current to flow around Antarctica, but also the current topography is the only one that allows enough transport from the Southern Ocean to sustain a North Atlantic Deep Water cell, thus allowing a sufficiently strong thermohaline circulation. The boundary between the Atlantic and Pacific Oceans is sometimes taken to be a line drawn from Cape Horn to Snow Island (130 kilometres (81 mi) north of mainland Antarctica), though the International Hydrographic Organization defines it as the meridian that passes through Cape Horn—67° 16′ W. Both lines lie within the Drake Passage. The passage hosts whales, dolphins and seabirds including giant petrels, other petrels, albatrosses and penguins.The presence of the Drake Passage allows the three main ocean basins (Atlantic, Pacific an