# Meeting Summariser - `gpt-neo-2.7B`

This is the notebook for meeting summarizer network built solely with one neural network. It was built on NBX-platform machine with following configuration: `8Core/30GB + T4 GPU`.

In [1]:
import re
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import numpy as np
from tqdm import trange

In [2]:
def set_seed(seed: int):
    import random, numpy as np, torch
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

In [3]:
%%time
name = "EleutherAI/gpt-neo-2.7B"
# get the model and tokenizer, 
# EleutherAI/gpt-neo-2.7B = 9.9GB compressed
# gpt2-xl (1.5Bn Params) = 6.7GB compressed
# always cache these models, reduces useless bandwidth.
tokenizer = AutoTokenizer.from_pretrained(name, cache_dir = "../hf-cache/")
model = AutoModelForCausalLM.from_pretrained(name, cache_dir = "../hf-cache/")
print("Model loaded ..., moving to GPU")
device = torch.device("cuda:0") if torch.cuda.is_available() else "CPU"
model = model.to(device)
model = model.eval()

Model loaded ..., moving to GPU
CPU times: user 1min 39s, sys: 14.3 s, total: 1min 53s
Wall time: 3min 16s


In [4]:
tokenizer.vocab_size

50257

In [5]:
%%time
# simple forward pass with the model
model.eval()

prompt = """Correct the sentence in each input and return properly formatted sentence
###
sentence: "everyone in this part of the world thinks i am a fraud but i know who i am"
correct: "Everyone in this part of the world thinks I am a fraud, but I know who I am."
###
sentence: "hey everybody welcome to the all in podcast it was a slow news week so we decided we'd  give you a special episode we're gonna go around the horn with our special picks we're each gonna"
correct: "Hey everybody, welcome to the all in podcast. It was a slow news week so we decided we'd give you a special episode. We're gonna go around the horn with our special picks, we're each gonna"
###
sentence:"""

query = ''' "and while india struggles to shake off the virus the developed world is taking off the masks the countries who've been able to vaccinate a sizeable number of people those living in the united states for instance they can ditch the masks at most public places now if they have taken both the shots"
correct:'''

input_ids = tokenizer(prompt + query, return_tensors = "pt")["input_ids"].to(device)
out = model.generate(
    input_ids,
    max_length = len(input_ids[0]) + 128,
    do_sample = True,
    early_stopping = True,
    temperature = 0.5,
    top_p = 0.9,
    num_return_sequences=1,
    pad_token_id = tokenizer.eos_token_id,

    output_scores = False,
    output_hidden_states = False,
    return_dict_in_generate = True,
)

CPU times: user 11.4 s, sys: 119 ms, total: 11.5 s
Wall time: 14.7 s


In [6]:
for x in tokenizer.batch_decode(out.sequences, skip_special_tokens = True):
    print(x)
    print("-"* 70)

Correct the sentence in each input and return properly formatted sentence
###
sentence: "everyone in this part of the world thinks i am a fraud but i know who i am"
correct: "Everyone in this part of the world thinks I am a fraud, but I know who I am."
###
sentence: "hey everybody welcome to the all in podcast it was a slow news week so we decided we'd  give you a special episode we're gonna go around the horn with our special picks we're each gonna"
correct: "Hey everybody, welcome to the all in podcast. It was a slow news week so we decided we'd give you a special episode. We're gonna go around the horn with our special picks, we're each gonna"
###
sentence: "and while india struggles to shake off the virus the developed world is taking off the masks the countries who've been able to vaccinate a sizeable number of people those living in the united states for instance they can ditch the masks at most public places now if they have taken both the shots"
correct: "and while India strugg

In [173]:
class Response():
    """Class that makes getting generated results chill, simply `print(out)`"""
    def __init__(self, out, t):
        self.t = t
        self.sequences = out.sequences.cpu().tolist()
        self.scores = [x.cpu().numpy() for x in out.scores]  if out.scores != None else None
        self.hidden_states = [
            [y.cpu().numpy() for y in x]
            for x in out.hidden_states
        ] if out.hidden_states != None else None
        self.attentions = [
            [y.cpu().numpy() for y in x]
            for x in out.attentions
        ] if out.attentions != None else None

        self.decoded = self.t.batch_decode(self.sequences, skip_special_tokens = True)

    def __repr__(self):
        str_ = ""
        for x in self.decoded:
            str_ += x + "\n"
            str_ += "-"* 70 + "\n"
        return str_

    def __len__(self):
        return len(self.decoded)

    def __getitem__(self, i):
        return self.decoded[i]
    
    def __iter__(self):
        for x in self.decoded:
            yield x

        

class GPT():
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.eot_id = tokenizer.eos_token_id

    def printout(self, out):
        out = out if isinstance(out, torch.Tensor) else out.sequences
        for x in self.tokenizer.batch_decode(out, skip_special_tokens = True):
            print(x)
            print("-"* 70)

    @torch.no_grad()
    def __call__(
        self,
        prompt: str,
        n: int = 16, # number of tokens
        r: int = 1,  # number of sequences
        do_sample = True,
        temp = 0.9,
        top_p = 0.9,
        top_k = None,
        output_scores = None,
        output_hidden_states = None,
        output_attentions = None,
        stop_sequence = None,
        return_response = True,
        **gen_kwargs
    ):
        t = self.tokenizer
        m = self.model
        
        # tokenize the input prompt and stop token if provided
        input_ids = t(prompt, return_tensors = "pt")["input_ids"].to(device)
        if stop_sequence is not None:
            eos_token_id = t(stop_sequence)["input_ids"][0]
        else:
            eos_token_id = self.eot_id
            
        # generate the items
        out = m.generate(
            input_ids,
            max_length = len(input_ids[0]) + n,
            temperature = temp,
            top_p=top_p,
            top_k=top_k,
            num_return_sequences=r,
            pad_token_id = self.eot_id,
            output_scores = output_scores,
            output_hidden_states = output_hidden_states,
            output_attentions = output_attentions,
            do_sample = do_sample,
            early_stopping = True,
            return_dict_in_generate = True,
            eos_token_id = eos_token_id,
            **gen_kwargs
        )

        # return items or 
        if return_response:
            return Response(out, t)
        else:
            return out


    def classify(
        self,
        prompt,
        labels,
        softmax_temp = 0.9,
        add_unknown = False,
        return_prompt = False,
        **gen_kwargs,
    ):
        # we will use the same format that OpenAI uses for GPT-3
        # read: https://beta.openai.com/docs/guides/classifications
        # We normalize all labels by `label.strip().lower().capitalize()` at the API
        # backend. Thus corresponding output labels are always capitalized.
        unq_options = set([x.strip().lower().capitalize() for x in labels])
        unq_options = sorted(list(unq_options))

        # each label must have a distinct first token, because classification
        # works by looking only one step ahead. Also encode the labels with extra
        # white space prepended.
        label_ids = [tokenizer.encode(" " + x)[0] for x in unq_options]
        out = self(prompt, **gen_kwargs, n = 1, r = 1, output_scores = True, return_response = False)
        logits = out.scores[0][0]
        logits = (logits / softmax_temp)[label_ids].softmax(-1).cpu()
        logits = logits.numpy()

        if add_unknown:
            # fill the Probability for the special "Unknown" token
            scores = {o:i for o,i in zip(unq_options, logits)}
            scores["Unknown"] = 1 - sum(scores.values())
        else:
            scores = {o:i for o,i in zip(unq_options, logits)}
        out = {
            "scores": scores,
            "prompt": query if return_prompt else None
        }
        return out

In [174]:
gpt = GPT(model, tokenizer)

In [9]:
%%time

# simple generation
gpt("GPT2 is a state of the art neural language model, that can", n = 16, r = 3)

CPU times: user 1.35 s, sys: 6.22 ms, total: 1.36 s
Wall time: 1.38 s


GPT2 is a state of the art neural language model, that can predict text from large corpora in a matter of seconds. It has been shown
----------------------------------------------------------------------
GPT2 is a state of the art neural language model, that can be trained to make good quality predictions for many different tasks including text classification and machine
----------------------------------------------------------------------
GPT2 is a state of the art neural language model, that can be used in many applications such as speech recognition, translation, and machine translation.
----------------------------------------------------------------------

In [149]:
%%time
# perform classification with a simple built in function
gpt.classify("""This is a tweet sentiment classifier
Tweet: "I loved the new Batman movie!"
Sentiment: Positive
###
Tweet: "I hate it when my phone battery dies 💢"
Sentiment: Negative
###
Tweet: "My day has been 👍"
Sentiment: Positive
###
Tweet: "This month has been very hard on me"
Sentiment:""", labels = ["Positive", "Negative"], add_unknown=False)

CPU times: user 241 ms, sys: 23.8 ms, total: 265 ms
Wall time: 263 ms


{'scores': {'Negative': 0.7159987, 'Positive': 0.28400132}, 'prompt': None}

In [110]:
# Summarisation pipeline

# - classify the input based on language
# - classify the sentences according to quality of english
# - if not of good grammar, clean up the sentences
# - break into points
# - break into keywords

from captions import *

def get_captions(caption_string):
    captions = []
    for x in caption_string.split("\n\n"):
        _id, _time, _content = x.split("\n")
        _time = _time.split("-->")
        _from = date_parse(_time[0])
        _to = date_parse(_time[1])
        # \xa0 is actually non-breaking space in Latin1 (ISO 8859-1), also chr(160)
        _content = _content.replace(u'\xa0', u' ').strip()
        captions.append({"id": _id, "from": _from, "to": _to, "content": _content})
    return captions


with open("./sample.srt") as f:
    captions = get_captions(f.read())
    
capstr = " ".join([x["content"] for x in captions])
capstr[:1000]

"rain man david hey everybody welcome to the all in podcast  it was a slow news week so we decided we'd give you a special episode we're gonna go around  the horn with our special picks we're each gonna pick three picks everybody we're gonna pick our  favorite recipe our favorite new hobby and our favorite streaming guilty pleasure because there  was no news uh with us today the dictator chamath palihapatiya rainman david sachs with his new  track from young spielberg just ripping across the charts uh young spielberg added again this time  with a a track focused on the rayman himself and the queen of quinoa who everybody says  we should upgrade to the king of quinoa that's so sexist why is that an upgrade the  queen of quinoa i don't know people just felt i was being i don't know how people could say that  anointing him as queen would be derogatory i think these people are not woke and they need to be  canceled jason here you go again making a lot of assumptions about people's pronouns

In [111]:
# now we build flow related functions

def check_english_language(sentence):
    if isinstance(sentence, list):
        return [
            check_english_language(s) for s in sentence
        ]
    prompt = """This classifies whether the input sequence into it's language
###
sentence: "GPT2 is a state of the art neural language model, that can be trained to make good quality predictions for many different tasks including text classification and machine translation."
language: English
###
sentence: "कोरोना पर होगा ड्रोन अटैक: ICMR की योजना- दुर्गम इलाकों में ड्रोन से होगी वैक्सीन की डिलीवरी, तेलंगाना सरकार ने ऐसा प्रोजेक्ट लॉन्च किया"
language: Hindi
###
sentence: "фантастический роман Алексея Николаевича Толстого о путешествии землян на Марс. Текст написан в основном в эмиграции, первое издание вышло в Петрограде в 1923 году и неоднократно перепечатывалось."
language: Russian
###
sentence: {sentence}
language:"""
    p = prompt.format(sentence = sentence)
    out = gpt(p, n = 4, r = 1, stop_sequence="###", temp=0.7, do_sample = False)[0]
    out = out[len(p):].strip().split("\n")[0]
    return out

# checks
c1 = check_english_language([
    'बदलेगी बिहार की सियासत: पशुपति पारस केन्द्र में मंत्री बन सकते हैं; खुद को BJP का हनुमान कहने वाले चिराग को RJD और कांग्रेस ने दिया ऑफर',
    'Толстого, воплощавшая его характерные стилистические и идеологические мотивы, выражавшая творческую свободу и возможность жанрово-стилистических экспериментов. После 1939 года повесть постоянно переиздаётся и превратилась в классику детско-юношеской литературы.',
    '请您说得慢些好吗',
    " ".join(capstr.split()[:54]) #### --> this should be English
])
c1

['Hindi', 'Russian', 'English', 'English']

In [85]:
def is_good_english(sentence):
    good_english_prompt = '''This corrects the input sentence
###
sentence: "everyone in this part of the world thinks i am a fraud but i know who i am"
grammar: Wron
###
sentence: "I loved the new Batman movie! It was really really good"
grammar: Correct
###
sentence: "HI, I AM HERE. SITTING IN THE RAIN, WHILE THE WORLD SLEEPS"
grammar: Wrong
###
sentence: "{sentence}"
grammar:'''
    p = good_english_prompt.format(sentence = sentence)
    out = gpt.classify(
        p,
        labels = ["Correct", "Wrong"],
        add_unknown=False,
        temp = 0.7,
        do_sample = True
    )
    return out
    
sentence = "and while india struggles to shake off the virus the developed world is taking off the masks the countries who've been able to vaccinate a sizeable number of people those living in the united states for instance they can ditch the masks at most public places now if they have taken both the shots"
is_good_english(sentence)

{'scores': {'Correct': 0.8130481, 'Wrong': 0.18695185}, 'prompt': None}

In [149]:
from time import sleep

def format_sentence(sentence):
    if isinstance(sentence, list):
        out = []
        for s in sentence:
            out.append(format_sentence(s))
            sleep(0.5)
        return out

    # try keeping len(sentence.split()) ~ 54
    set_seed(90)
    prompt = """Correct the sentence in each input and return properly formatted sentence
###
sentence: "everyone in this part of the world thinks i am a fraud but i know who i am."
correct: "Everyone in this part of the world thinks I am a fraud, but I know who I am."
###
sentence: "hey everybody welcome to the all in podcast it was a slow news week so we decided we'd  give you a special episode we're gonna go around the horn with our special picks we're each gonna."
correct: "Hey everybody, welcome to the all in podcast. It was a slow news week so we decided we'd give you a special episode. We're gonna go around the horn with our special picks, we're each gonna"
###
sentence:"{sentence}"
correct:"""

    p = prompt.format(sentence = sentence)
    n = len(gpt.tokenizer.tokenize(sentence)) + 10 # margin of error
    g = gpt(
        p,
        n,
        r=1,
        stop_sequence = "\n",
        temp = 1.0,
        top_p = 1.0
    )[0]
    
    # clean up the response
    s = g.split("###")[-2 if g.endswith('###') else -1]
    res = s.split("\n")[2][8:].replace('"', '').strip()
    return res[:-1]
  
# corr = 0
# for _ in range(10):
#     r = format_sentence([
#         "and while india struggles to shake off the virus the developed world is taking off the masks the countries who've been able to vaccinate a sizeable number of people those living in the united states for instance they can ditch the masks at most public places now if they have taken both the shots.",
#         " ".join(capstr.split()[:54]) + ".",
#         " ".join(capstr.split()[150:250]) + "."
#     ])[-1]
#     print("-" * 70)
#     print(r)
#     x = int(r == "As queen would be derogatory I think these people are not woke and they need to be canceled, Jason here you go again making a lot of assumptions about people's pronouns, Yeah they they queen of quinoa they I take no offense I take no offense your insults to me and uh today I'm having the emotion of excitement and I am ready for the conversation, good we got the firmware upgraded all right so I think we might as well start with I don't know if you guys caught this but there's a red subreddit called")
#     print(x)
#     corr += x
# corr / 10

In [176]:
r = format_sentence(" ".join(capstr.split()[250:350]) + ".")
r

"Wall Street bets and what they do on wall street bets is they find angles and an thesis and then they bet on a stock the stock they picked for the past couple of months has been gamestop and boy jason hold on a second um that's that's that's not true so um do you uh i had actually a guy on my team put together two important documents and i'm just going to read them because it's full of so much interesting and then we can talk about where are these from did you o"

In [202]:
def get_keywords(text, r = 4):
    prompt = """Get keywords from each text

###

Text: "Black-on-black ware is a 20th- and 21st-century pottery tradition developed by the Puebloan Native American ceramic artists in Northern New Mexico. Traditional reduction-fired blackware has been made for centuries by pueblo artists. Black-on-black ware of the past century is produced with a smooth surface, with the designs applied through selective burnishing or the application of refractory slip. Another style involves carving or incising designs and selectively polishing the raised areas. For generations several families from Kha'po Owingeh and P'ohwhóge Owingeh pueblos have been making black-on-black ware with the techniques passed down from matriarch potters. Artists from other pueblos have also produced black-on-black ware. Several contemporary artists have created works honoring the pottery of their ancestors."

Keywords: Pueblo, art, pottery, black, black ware

###

Text: "{text}"

keywords:"""

    p = prompt.format(text = text)
    words = set()
    
    # prevents GPU OOM by batching instead of parallel requests
    for _ in range(0, r+1, 2):
        out = gpt(
            p,
            r = 2,
            n = 20,
            stop_sequence="\n",
            temp = 0.9,
            repetitive_penalty = 0.9 # don't repeat the same thing over and over again
        )

        for s in out:
            ws = s.split("###")[2].split("\nkeywords:")[-1].split(",")
            for w in ws:
                w = w.strip().lower()
                # print("--->", w, w in ["pueblo", "art", "pottery", "black", "black ware"])
                if w and len(w.split()) < 4 and w not in ["pueblo", "art", "pottery", "black", "black ware"]:
                    words.add(w)
    return list(words)
        
get_keywords(r)

['stock options',
 'stock prices',
 'hold',
 'stock',
 'stock trading',
 'wall street',
 'thesis',
 'stock news',
 'stock market',
 'bet',
 'stock quotes',
 'stock broker',
 'stock market crash',
 'document',
 'stock trading tips',
 'gambling']

In [222]:
%%time
cap_str = [
    (" ".join(capstr.split()[i:i+100]) + ".")
    for i in range(550, 1050, 100)
]
outs = format_sentence(cap_str)
print("---> Got sentences.")
words = []
for _, o in zip(trange(len(outs)), outs):
    words.append(get_keywords(o))

  0%|          | 0/5 [00:00<?, ?it/s]

---> Got sentences.


100%|██████████| 5/5 [01:02<00:00, 12.52s/it]

CPU times: user 1min 51s, sys: 4.81 s, total: 1min 56s
Wall time: 1min 59s





In [211]:
words

[['reddity',
  'redditt',
  'a red list',
  'the',
  'redditor',
  'culture',
  'community',
  'redditor list',
  'a redditor',
  'reddiquette',
  'redditr',
  'queer',
  'r/quinoa',
  'quinoa',
  'reddit',
  'redditer',
  'reddits'],
 ['street',
  'wall',
  'stock',
  'wall street',
  'hedge fund',
  'stock pick',
  'thesis',
  'stock analyst',
  'bet',
  'obey',
  'angle',
  'stock tip',
  'quote',
  'new york',
  'gamestop'],
 ['dictatorship',
  'corporatist',
  'financial',
  'dictator',
  'black-on-black ware',
  'scumb',
  'wall street',
  'corporatism',
  'goldman sachs',
  'wall street crash',
  'robin hood',
  "ceo's compensation",
  'bats',
  'ceo',
  'black-on-',
  "ceo's"],
 ['podcasts',
  'podcast',
  'episode',
  'all in',
  'all i',
  'special podcast',
  'all in podcast',
  'all ian',
  'in',
  '"all i"']]

In [229]:
def clean_keywords(keywords):
    
    if isinstance(keywords[0], list):
        return [clean_keywords[x] for x in keywords]
    
    
    # make the keywords a sentence for prompt
    k = ", ".join(keywords)
    k = re.sub(r"[^\w\'\s,]", "", k)
        
    prompt = '''This app removed duplicate words from a list of words

###

Sentence: "reddity, redditt, a red list, the, redditor, culture, community, redditor list, a redditor, reddiquette, redditr, queer, r/quinoa, quinoa, reddit, redditer, reddits"

Important words: reddit, culture, community, quinoa, r/quinoa

###

Setence: "podcasts, podcast, episode, all in, all i, special podcast, all in podcast, all ian, in, all i"

Important words: all in podcast

###

Sentence: "street, wall, stock, wall street, hedge fund, stock pick, thesis, stock analyst, bet, obey, angle, stock tip, quote, new york, gamestop"

Important words: wall street, hedge fund, new york, gamestop

###

Sentence: {sentence}

Important words:'''
    
    print("-->>", k)
    
    p = prompt.format(sentence = k)

    out = gpt(
        p,
        n = 32,
        r = 5,
        temp = 1.0,
        top_p = 0.9,
        stop_sequence="\n",
    )
    
    words = set()
    for x in out:
        ws = x.split("###")[-1].split("Important words:")[-1].strip().split(",")
        for w in ws:
            words.add(w.strip())
    return list(words)
    
    
w2 = []
for w in words:
    out = clean_keywords(w)
    print(out)
    w2.append(out)
    


-->> stock options, share, stocks, ban, stock, wall street, block, securities, wall stree, stock market, markets, 2019, many, market, trading, june
['', 'stock options', 'share', 'stocks', 'wall', 'stock', 'ban', 'wall street', 'block', 'securities', 'wall stree', 'stock market', 'markets', '2019', 'many', 'market', 'trading', 'june']
-->> stocks, stock, stock trading, money, texas, forum, stock market, bet, market, investing, trading
['texas', 'forum', 'stock market', 'bet', 'market', 'investing', 'trading']
-->> video, deep value, fund, momentum, screenshot, hedge fund, capital, investment, dislocations, strategy, market, hedge fund capital, hedge funds
['fund', 'momentum', 'hedge fund', 'screenshot', 'capital', 'investment', 'dislocations', 'market', 'hedge fund capital', 'hedge funds']
-->> position, gme yolo, gme yolo update, michael, august, free cash flow, this guy, company, update, bury, value oriented thesis, big short, yolo, michael bury, is, gme, alex michael bury, 2019, buy