In [1]:
!pip install -q -U transformers accelerate bitsandbytes

In [2]:
import torch
import pandas as pd
# from datasets import Dataset

import warnings
warnings.filterwarnings("ignore")

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
df = pd.read_csv('/content/gdrive/MyDrive/CodingData/Euclid_Assignment/chapter_wise_clean.xlsx', index_col = 'Unnamed: 0')
df.head()

Unnamed: 0,book,chapter,text,chapter_num
0,Harry Potter and the Sorcerer's Stone,1,"THE BOY WHO LIVED Mr. and Mrs. Dursley, of num...",CHAPTER ONE
1,Harry Potter and the Sorcerer's Stone,2,THE VANISHING GLASS Nearly ten years had passe...,CHAPTER TWO
2,Harry Potter and the Sorcerer's Stone,3,THE LETTERS FROM NO ONE The escape of the Braz...,CHAPTER THREE
3,Harry Potter and the Sorcerer's Stone,4,THE KEEPER OF THE KEYS BOOM. They knocked agai...,CHAPTER FOUR
4,Harry Potter and the Sorcerer's Stone,5,DIAGON ALLEY Harry woke early the next morning...,CHAPTER FIVE


Loading the model

In [6]:
model_name = 'mistralai/Mistral-7B-v0.1'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
        num_beams=4,
        no_repeat_ngram_size=2,
    )

pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer = tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]



In [10]:
input_text = "Summarize: "+ df.text.iloc[0]
print(input_text)

Summarize: THE BOY WHO LIVED Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense. Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere. The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn't think they could bear it if anyone found out about the Potters. Mrs. Potter was Mrs. Dursley's sister, but they

In [11]:
input_ids = tokenizer(input_text, return_tensors="pt").input_ids
input_ids.shape

torch.Size([1, 6673])

In [12]:
sequence = pipeline(
                    input_text,
                    max_length=1024,
                    do_sample=True,
                    top_k=10,
                    num_return_sequences=1,
                    eos_token_id=tokenizer.eos_token_id,
                )
output = sequence[0]['generated_text']
print(output)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


OutOfMemoryError: CUDA out of memory. Tried to allocate 210.00 MiB. GPU 0 has a total capacty of 14.75 GiB of which 95.06 MiB is free. Process 30723 has 14.65 GiB memory in use. Of the allocated memory 13.99 GiB is allocated by PyTorch, and 554.94 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [13]:
import gc
gc.collect()

271

In [None]:
class SummarizerMistral:
    def __init__(self, tokenizer, pipeline, chunking_token_limit):
        self.chunking_token_limit = chunking_token_limit

        self.tokenizer = tokenizer
        self.pipeline = pipeline

    def count_tokens(self, text):
      input_ids = self.tokenizer(text, return_tensors="pt").input_ids
      return input_ids.shape[1]

    def chunk_text(self, text, token_limit):
        chunk_list = []
        input_ids = self.tokenizer(text, return_tensors="pt").input_ids[0]
        chunks_length = len(input_ids)//token_limit+1
        print('Input Length: ', len(input_ids))
        print('Chunks: ', chunks_length)

        for i in range(chunks_length):
            starting_index = i*token_limit
            chunk_text = self.tokenizer.decode(input_ids[starting_index:starting_index + token_limit], skip_special_tokens=True)
            chunk_list.append(chunk_text.strip().strip('\n').strip())
        return chunk_list

    def chunked_prompt(self, text_chunk):
      prompt = '''Summarize the below chunk of text, taken from a Harry Potter book, in strictly one sentence. \nKeep only the most relevant information, in the context of the whole book. \nText chunk: "<text>" \nSummarized output:'''
      return prompt.replace("<text>", text_chunk)

    def generate_model_output(self, input_text):
      sequence = self.pipeline(
                    input_text,
                    max_length=2048,
                    do_sample=True,
                    top_k=10,
                    num_return_sequences=1,
                    eos_token_id=self.tokenizer.eos_token_id,
                )
      output = sequence[0]['generated_text'].replace(input_text, '')
      return output

    def get_chapter_summary(self, chapter_text, book):
      if self.count_tokens(chapter_text)<250:
        return chapter_text

      chapter_chunk_list = self.chunk_text(text = chapter_text, token_limit = self.chunking_token_limit)
      # prompted_chunks = [self.chunked_prompt(book = book, text_chunk = chunk) for chunk in chapter_chunk_list]
      summarized_chunks = [self.generate_model_output(self.chunked_prompt(text_chunk = chunk)) for chunk in chapter_chunk_list]
      summarized_merged_text = '\n'.join(summarized_chunks)
      return self.get_chapter_summary(chapter_text = summarized_merged_text, book = book)

    def summarize_chapters(self, df, mode = None):
      # token_limit = 1400, if total context window = 2048
      # token_limit = 3400, if total context window = 4096
      file_save_location = '/content/gdrive/MyDrive/CodingData/Euclid_Assignment/Mchapter_reduced_summary.xlsx'

      df['chapter_summary'] = ''
      count=0
      for idx, row in df.iterrows():
        # if row.chapter_summary is None:
        chapter_text = row.text
        book = row.book
        chapter_summary = self.get_chapter_summary(chapter_text, book)
        df['chapter_summary'].iloc[idx] = chapter_summary
        print(chapter_summary)

        df.to_excel(file_save_location)
        count+=1
        print('idx: ',idx)

      df.to_csv(file_save_location)
      return df

    # here onwards book summary from chapter summary
    def prompting_for_book_summary(self, text):
      prompt = """Summarize the below chunk of chapter summaries taken from Harry Potter book in strictly below 3 lines, keeping only the most relevant information and theme, to get the essence of the whole book. \nChapter wise text chunk: "<text>" \nSummarized output:"""
      return prompt.replace('<text>', text)

    def summarize_book(self, chapter_text_list):
      input_length = 0
      chap_summary_chunk = ''
      output_sub_summary = ''
      final_book_prompt = "Summarize the below summary of Harry Potter book capturing the primary setting, themes, context, characters development, challenges faced, significant events, revelation, conflicts and resolution in utmost detail in strictly 6 lines. \nText: '<text>' \nSummarized output:"

      total_chapters = len(chapter_text_list)
      print('Total chapters', len(chapter_text_list))

      for chap_summary in chapter_text_list:
        input_length_chunk = self.count_tokens(chap_summary)

        if input_length<1200:
          chap_summary_chunk+='\n' + chap_summary
          input_length+=input_length_chunk
          print(input_length)
          continue
        else:
          input_prompt = self.prompting_for_book_summary(chap_summary_chunk)
          print('input_length_chunk: ', input_length)
          # print('input_prompt: ', input_prompt)
          output_sub_summary = self.generate_model_output(input_prompt)
          print('output_sub_summary: ', output_sub_summary)
          rolling_summary = output_sub_summary + '\n' + chap_summary
          chap_summary_chunk = rolling_summary
          print('rolling_summary: ', rolling_summary)
          input_length = self.count_tokens(rolling_summary)

      if output_sub_summary == '' or rolling_summary!=chap_summary_chunk:
        final_summary = self.generate_model_output(final_book_prompt.replace('<text>', chap_summary_chunk))
        print(final_summary)

      return final_summary

    def book_summary(self, df) -> pd.DataFrame:
      books = []
      book_summary_list = []

      for book, df_book in df.groupby('book'):

        books.append(book)
        chapter_summary_list = df_book.chapter_summary.to_list()
        print('Total chapters', len(chapter_summary_list))
        book_summary = self.summarize_book(chapter_summary_list)
        book_summary_list.append(book_summary)

      data = {'book':books,
              'summary': book_summary_list}
      book_summary_df = pd.DataFrame(data)

      return book_summary_df

    def collective_summary(self, book_summary_df):
      final_prompt = "Summarize the below summary of Harry Potter books capturing the primary setting, themes, context, characters development, challenges faced, significant events, revelation, conflicts and resolution in utmost detail in strictly 10 lines. \nText: '<text>' \nSummarized output:"
      # final_prompt = "Summarize the below text in detail in strictly 10 lines. \nText: '<text>' \nSummarized output:"
      text = ''
      for idx, row in book_summary_df.iterrows():
        book = row.book
        summary = row.summary
        # text+= 'Book: "' + book + '" \n' + summary + '\n'
        text+= '\n' + summary + '\n'

      final_input = final_prompt.replace('<text>', text)
      print(final_input)
      final_summary = self.generate_model_output(final_input)

      return final_summary

obj = SummarizerMistral(tokenizer, pipeline, 1400)

In [None]:
chap_df = obj.summarize_chapters(df)

In [None]:
book_summary = obj.book_summary(chap_df)

In [None]:
print(obj.collective_summary(book_summary))