### imports

In [1]:
%reload_ext autoreload
%autoreload 2

import sys
import os

In [2]:
# Absolute path of the parent of the current working directory; the project
# package is imported from there further down.
SRC_SUBDIR = os.path.abspath('../')
if sys.path.count(SRC_SUBDIR) == 0:
    # Insert at index 1 so the existing first entry of sys.path stays first.
    print(f'Adding source directory to the sys.path: {SRC_SUBDIR!r}')
    sys.path.insert(1, SRC_SUBDIR)

Adding source directory to the sys.path: 'd:\\Work\\Job_Hunt\\Vihave.ai\\meeting-transcript-summarization'


In [3]:
from meeting_transcript_summarization.src.summarize import read_transcript

### read file

In [4]:
transcript = read_transcript(r'../data/sample_transcript.txt')

In [5]:
display(transcript)

'Date: June 14, 2024\nTime: 10:00 AM - 11:00 AM\nParticipants:\n • Alice (Product Manager)\n • Bob (Software Engineer)\n • Charlie (QA Lead)\n • David (Customer Support)\n • Emma (UI/UX Designer)\n\n10:00 AM - 10:05 AM: Introductions and Agenda Setting\nAlice: Hey everyone, thanks for jumping on this call at such short notice. So, we\'ve got a pretty pressing issue with the login functionality after the last update. I want us to go over what\'s happening, why it\'s happening, and how we can fix it quickly. Let’s start with a quick hello for anyone who’s new.\nDavid: Hi all, I’m David from Customer Support. Nice to meet everyone!\nCharlie: Hey, Charlie here, QA Lead.\nEmma: Hi, I\'m Emma, the UI/UX Designer.\nBob: And I’m Bob, the Software Engineer.\n10:05 AM - 10:15 AM: Issue Description\nDavid: So, here\'s the deal. We\'ve been getting a lot of complaints from users who can\'t log in. They keep seeing an "Invalid credentials" error even though they\'re sure their details are right. Th

### download and unzip tar.gz

In [6]:
from urllib import request

In [33]:
def download(url, target):
    """Download the file at `url` and save it locally as `target`.

    Nothing is downloaded when `target` already exists. The response body is
    streamed to a temporary ``<target>.part`` file and renamed to `target`
    only after the transfer has finished, so an interrupted download is never
    mistaken for a complete file by the "already exists" check on a later run.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    target : str
        Path of the local file to create. Missing parent directories are
        created; a bare file name (no directory part) is saved in the current
        working directory.

    Returns
    -------
    None

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` (connection failure, HTTP error, ...).
    OSError
        If the target directory or file cannot be created.
    """
    import shutil  # local import: only needed for the streamed copy below

    # Skip the download if the file already exists.
    if os.path.exists(target):
        print(f'The file already exists: {target!r}')
        return

    with request.urlopen(request.Request(url), timeout=15.0) as response:
        if response.status != 200:
            # Same outcome as before (nothing is saved), but no longer silent.
            print(f'Unexpected response status {response.status!r}; '
                  f'nothing was saved: {url!r}')
            return

        # Create the target directory if needed. dirname() is '' for a bare
        # file name, and os.makedirs('') raises, so only call it when there
        # is a directory part.
        target_dir = os.path.dirname(target)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        # Stream in chunks instead of reading the whole body into memory.
        partial = f'{target}.part'
        try:
            with open(partial, 'wb') as f:
                shutil.copyfileobj(response, f)
            os.replace(partial, target)
        except BaseException:
            # Do not leave a truncated file behind (also on KeyboardInterrupt).
            if os.path.exists(partial):
                os.remove(partial)
            raise


# URL = r'https://dl.fbaipublicfiles.com/fairseq/models/bart.large.cnn.tar.gz'
# TARGET = r'../models/bart.large.cnn.tar.gz'
# download(URL, TARGET)

In [8]:
import tarfile

In [34]:
def unzip_file(tar_file, target_dir):
    """Extract the gzip-compressed tar archive `tar_file` into `target_dir`.

    Extraction is skipped when an entry named after the archive (its file
    name without the ``.tar.gz`` / ``.tgz`` suffix) already exists inside
    `target_dir`. This is a heuristic: it assumes the archive unpacks into a
    top-level entry with that name.

    Parameters
    ----------
    tar_file : str
        Path of the ``.tar.gz`` archive.
    target_dir : str
        Directory to extract into; created if missing. An empty string means
        the current working directory.

    Returns
    -------
    None

    Raises
    ------
    tarfile.TarError, OSError
        If the archive cannot be opened or extracted.
    """
    # os.path.dirname('archive.tar.gz') is '', which os.makedirs rejects.
    target_dir = target_dir or '.'

    # Derive the expected extracted name from the real suffix instead of
    # chopping a fixed number of characters.
    archive_name = os.path.basename(tar_file)
    for suffix in ('.tar.gz', '.tgz'):
        if archive_name.lower().endswith(suffix):
            stem = archive_name[:-len(suffix)]
            break
    else:
        stem = os.path.splitext(archive_name)[0]

    # Look where the archive is extracted to, not next to the archive itself.
    extracted_path = os.path.join(target_dir, stem)
    if stem and os.path.exists(extracted_path):
        print(f'The file is already extracted: {extracted_path!r}')
        return

    # Create the target directory if it does not exist yet.
    os.makedirs(target_dir, exist_ok=True)

    print(f'Extracting the file: {tar_file!r}')
    with tarfile.open(tar_file, 'r:gz') as f:
        if hasattr(tarfile, 'data_filter'):
            # The archive comes from the network: refuse members that would
            # escape target_dir. Extraction filters are only available on
            # Python versions that define tarfile.data_filter.
            f.extractall(target_dir, filter='data')
        else:
            f.extractall(target_dir)


# tar_file = TARGET
# target_dir = os.path.dirname(tar_file)
# unzip_file(tar_file, target_dir)

Extracting the file: '../models/bart.large.cnn.tar.gz'


### pipeline

In [10]:
%pip install numpy==1.26.4

Note: you may need to restart the kernel to use updated packages.


In [11]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118


In [16]:
%pip install transformers




Save the desired model from the Hugging Face Hub to a local folder.

In [20]:
!git clone https://huggingface.co/facebook/bart-large-cnn

Cloning into 'bart-large-cnn'...
Updating files:  38% (5/13)
Updating files:  46% (6/13)
Updating files:  53% (7/13)
Updating files:  61% (8/13)
Updating files:  69% (9/13)
Updating files:  76% (10/13)
Updating files:  84% (11/13)
Updating files:  92% (12/13)
Updating files: 100% (13/13)
Updating files: 100% (13/13), done.
Filtering content:  40% (2/5)
Filtering content:  40% (2/5), 3.02 GiB | 265.26 MiB/s
Filtering content:  60% (3/5), 3.02 GiB | 265.26 MiB/s
Filtering content:  60% (3/5), 4.54 GiB | 273.33 MiB/s
Filtering content:  80% (4/5), 4.54 GiB | 273.33 MiB/s
Filtering content:  80% (4/5), 6.05 GiB | 33.01 MiB/s 
Filtering content: 100% (5/5), 6.05 GiB | 33.01 MiB/s
Filtering content: 100% (5/5), 7.95 GiB | 52.88 MiB/s
Filtering content: 100% (5/5), 7.95 GiB | 7.12 MiB/s, done.


In [29]:
MODEL_LOCAL_DIR = r'./bart-large-cnn'

In [21]:
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM

`from_pretrained` supports either:

- models hosted on the Hugging Face Hub, or
- models saved with the Hugging Face library.

So a model can be loaded from a local folder only if that folder meets one of these conditions (for example, a clone of the model's Hub repository).

In [31]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_LOCAL_DIR)

In [32]:
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_LOCAL_DIR)

In [27]:
# NOTE(review): this cell's execution count (27) is lower than the cells that
# create `tokenizer` and `model` (31, 32) — re-run top to bottom to confirm
# the output below.
# Tokenize the transcript and keep only the token ids (return_tensors="pt").
inputs = tokenizer(transcript, return_tensors="pt").input_ids
# Generate at most 100 new tokens with sampling disabled.
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)
# Decode the first generated sequence to text, dropping special tokens.
tokenizer.decode(outputs[0], skip_special_tokens=True)

'An issue with login functionality is causing users to see an "Invalid credentials" error. The issue is with the authentication microservice. The solution is to rollback the changes and deploy the previous version. Draft a communication plan for affected users, explaining what happened and how we\'re fixing it.'

In [28]:
class Summarizer:
    """Summarize text with a pretrained sequence-to-sequence model.

    The tokenizer and the model are loaded once in the constructor and reused
    for every call to `summarize`.
    """

    def __init__(self, tokenizer_checkpoint, model_checkpoint):
        """Load the tokenizer and the model.

        Parameters
        ----------
        tokenizer_checkpoint : str
            Checkpoint passed to ``AutoTokenizer.from_pretrained``.
        model_checkpoint : str
            Checkpoint passed to ``AutoModelForSeq2SeqLM.from_pretrained``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

    def summarize(self, text, max_new_tokens=100, truncation=True):
        """Return a summary of `text`.

        Parameters
        ----------
        text : str
            Text to summarize.
        max_new_tokens : int, default 100
            Upper bound on the number of generated tokens.
        truncation : bool, default True
            Forwarded to the tokenizer. With the default, input longer than
            the tokenizer's maximum length is cut instead of being passed to
            the model as-is (NOTE(review): the tail of a long transcript is
            then not summarized — chunk the input if that matters). Pass
            ``False`` for the previous behaviour.

        Returns
        -------
        str
            The first generated sequence, decoded without special tokens.
        """
        inputs = self.tokenizer(
            text, return_tensors="pt", truncation=truncation
        ).input_ids
        # Sampling is disabled so generation does not draw random tokens.
        outputs = self.model.generate(inputs, max_new_tokens=max_new_tokens, do_sample=False)
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)