1. Get a somewhat parsed full text from the xDD team, preferably with section labels, citation labels, etc.
2. Query-based text summarization (Similar to Petal's AI table)
3. Sort into support and counter arguments based on semantics
4. Return the sorted evidence

In [None]:
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import DensePassageRetriever, Seq2SeqGenerator
from haystack.pipelines import GenerativeQAPipeline

In [None]:
from askem.preprocessing import TextProcessor, convert_files_to_docs

tp = TextProcessor()

In [None]:
# Build the retrieval index: load the files under doc_dir as documents, store
# them in a FAISS document store, and embed them with a DPR retriever.
doc_dir = "data/covid1000/"
# Each file is cleaned with tp.to_paragraphs and split into paragraphs.
docs = convert_files_to_docs(
    dir_path=doc_dir, clean_func=tp.to_paragraphs, split_paragraphs=True
)

# NOTE(review): embedding_dim=128 presumably matches the output size of the
# DPR encoders configured below — confirm if the encoder models are changed.
document_store = FAISSDocumentStore(embedding_dim=128, faiss_index_factory_str="Flat")
document_store.write_documents(docs)
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="vblagoje/dpr-question_encoder-single-lfqa-wiki",
    passage_embedding_model="vblagoje/dpr-ctx_encoder-single-lfqa-wiki",
)
# Documents are written before the retriever exists, so their embeddings are
# computed in this separate step.
document_store.update_embeddings(retriever)


In [None]:
# Long-form QA pipeline: the DPR retriever defined earlier supplies passages
# and the BART LFQA seq2seq generator writes the answer.
generator = Seq2SeqGenerator(model_name_or_path="vblagoje/bart_lfqa")
pipe = GenerativeQAPipeline(generator, retriever)

In [None]:
generator.__dict__

In [None]:
y = pipe.run("Is covid chloroquine treatment effective?")

In [None]:
y.keys()

In [None]:
y["query"]


In [None]:
y["answers"][0].__dict__.keys()


In [None]:
len(y["answers"][0].meta["content"][0])


In [None]:
from datasets import load_dataset

dataset = load_dataset("covid_qa_deepset")


In [None]:
dataset

In [None]:
# pip install -q transformers
# Smoke test of the BLOOMZ 7.1B checkpoint on a one-line translation prompt.
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "bigscience/bloomz-7b1"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

inputs = tokenizer.encode("Translate to English: Je t’aime.", return_tensors="pt")
# No generation arguments are passed, so the library defaults apply.
outputs = model.generate(inputs)
# Only the first returned sequence is decoded and printed.
print(tokenizer.decode(outputs[0]))

In [None]:
pip install triton

### PDF Parser

In [None]:
# from tqdm.autonotebook import tqdm
# import pathlib
# from pdf_parser import parse_pdf
# pdfs = pathlib.Path('data/').glob('*.pdf')

# for i, pdf in tqdm(enumerate(pdfs)):
#     paper = parse_pdf(pdf)
#     paper.save(f'data/parsed/{i}.pkl.gz')

In [None]:
from pdf_parser import Paper
from transformers import AutoTokenizer, AutoModel

# Load parsed PDF
p = Paper.load("data/parsed/0.pkl.gz")
print(p)

In [None]:
# NOTE(review): `text` is not assigned in any cell shown above, so this cell
# fails with NameError unless it is defined elsewhere — presumably it was meant
# to hold the parsed paper's text; confirm and define it before this cell.
len(text)

### Long-T5
- https://huggingface.co/google/long-t5-tglobal-base
- https://huggingface.co/google/long-t5-tglobal-xl
- https://huggingface.co/google/long-t5-tglobal-large


Relevant but not directly useful... also buggy

In [None]:
from transformers import AutoTokenizer, LongT5Model

tokenizer = AutoTokenizer.from_pretrained("google/long-t5-tglobal-base")
model = LongT5Model.from_pretrained("google/long-t5-tglobal-base")

inputs = tokenizer("Based on this context", return_tensors="pt")

# LongT5Model is the bare encoder-decoder stack: a forward pass needs decoder
# inputs in addition to the encoder inputs. Calling model(**inputs) with the
# encoder inputs alone raises a ValueError asking for decoder_input_ids or
# decoder_inputs_embeds, so seed the decoder with one start token per batch row.
decoder_input_ids = inputs.input_ids.new_full(
    (inputs.input_ids.shape[0], 1), model.config.decoder_start_token_id
)
outputs = model(**inputs, decoder_input_ids=decoder_input_ids)

# For this seq2seq model the output's last_hidden_state comes from the decoder
# side (see the LongT5Model documentation).
last_hidden_states = outputs.last_hidden_state


### Run GPT summarization with `compress`

In [None]:
from backend import compress, get_answer

# s = compress(p.chunks)
# short_text = '\n'.join(s)

# with open('tmp/tmp.txt', 'w') as f:
#     f.write(short_text)


### GPT3.5 baseline

In [None]:
# Read the compressed text back from tmp/tmp.txt (the file the commented-out
# `compress` cell above writes to) into `short_text`.

with open("tmp/tmp.txt", "r") as f:
    short_text = f.read()

In [None]:
get_answer(context=short_text, question="What is QSPR modeling workflow?")


### Subset to small test toy example

### Long-T5
https://huggingface.co/google/long-t5-tglobal-base

### mt0

 | Params | Checkpoint |
 | ------ | ---------- |
 | 300M   | mt0-small  |
 | 580M   | mt0-base   |
 | 1.2B   | mt0-large  |
 | 3.7B   | mt0-xl     |
 | 13B    | mt0-xxl    |

### BloomZ

 | Params | Checkpoint  |
 | ------ | ----------- |
 | 560M   | bloomz-560m |
 | 1.1B   | bloomz-1b1  |
 | 1.7B   | bloomz-1b7  |
 | 3B     | bloomz-3b   |
 | 7.1B   | bloomz-7b1  |
 | 176B   | bloomz      |

In [None]:
short_text


In [None]:
# pip install -q transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

# checkpoint = "bigscience/bloomz"
checkpoint = "bigscience/bloomz-560m"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

In [None]:
# Prompt = the question followed directly by the whole compressed text; no
# truncation is requested when encoding.
inputs = tokenizer.encode(
    "based on this article, what is QSPR modeling workflow? " + short_text,
    return_tensors="pt",
)
# Cap the continuation at 100 newly generated tokens.
outputs = model.generate(inputs, max_new_tokens=100)
# Only the first returned sequence is decoded and printed.
print(tokenizer.decode(outputs[0]))

### mt0

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

checkpoint = "bigscience/mt0-base"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(
    checkpoint, torch_dtype="auto", device_map="auto"
)

In [None]:
# The mt0 model is loaded with device_map="auto", so its weights may have been
# placed on an accelerator. Move the prompt tensor to the model's device so
# generate() does not receive inputs and weights on different devices.
inputs = tokenizer.encode(
    "based on this article, what is QSPR modeling workflow? " + short_text,
    return_tensors="pt",
).to(model.device)
# Cap the answer at 1000 newly generated tokens.
outputs = model.generate(inputs, max_new_tokens=1000)
# Only the first returned sequence is decoded and printed.
print(tokenizer.decode(outputs[0]))

# References

### Paragraph segmentation

In [None]:
from transformers import pipeline

# NOTE: this rebinds `pipe`, which an earlier cell assigned to the
# GenerativeQAPipeline; re-run that cell before using the QA pipeline again.
pipe = pipeline("text-classification", model="dennlinger/bert-wiki-paragraphs")
# The input is a plain (non-f) string, so the braces are literal placeholders:
# substitute two real paragraphs joined by " [SEP] ".
pipe("{First paragraph} [SEP] {Second paragraph}")

### Embedding from distilbert

In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased")


def get_sentence_embeddings(sentences, tokenizer, model):
    """Return one mean-pooled embedding per sentence as a NumPy array.

    The sentences are tokenized as a single padded batch and run through
    ``model``; the token vectors of each sentence are then averaged.

    Padding positions are excluded from the average by weighting with the
    tokenizer's attention mask. A plain ``mean(dim=1)`` would also average the
    hidden states of padding tokens, so a sentence's embedding would change
    with the length of the longest sentence in its batch.

    Args:
        sentences: A string or list of strings to embed.
        tokenizer: Callable returning a mapping of model inputs; an
            ``attention_mask`` entry is used for pooling when present.
        model: Callable accepting those inputs as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor of shape
            (batch, tokens, hidden).

    Returns:
        Array of shape (batch, hidden).
    """
    inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)
    outputs = model(**inputs)
    hidden = outputs.last_hidden_state

    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        # No mask available: fall back to the unweighted mean over tokens.
        pooled = hidden.mean(dim=1)
    else:
        # (batch, tokens) -> (batch, tokens, 1) so it broadcasts over hidden.
        weights = attention_mask.unsqueeze(-1).to(hidden.dtype)
        summed = (hidden * weights).sum(dim=1)
        # clamp guards against division by zero for an all-padding row.
        token_counts = weights.sum(dim=1).clamp(min=1)
        pooled = summed / token_counts

    embeddings = pooled.detach().cpu().numpy()
    return embeddings

# Paragraph cleaner

1. Convert text into sentences
2. Use `bert-wiki-paragraphs` to classify whether the current sentence is on the same topic as the last one
3. If same, concatenate
4. If not, start a new paragraph
5. If a paragraph is too long, start a new paragraph

In [None]:
from transformers import pipeline

# Text-classification pipeline used to score whether two text spans belong
# together; note that this rebinds the notebook-level name `pipe`.
pipe = pipeline("text-classification", model="dennlinger/bert-wiki-paragraphs")

# The input is a plain (non-f) string, so the braces are literal placeholders:
# substitute two real spans joined by " [SEP] ".
pipe("{First paragraph} [SEP] {Second paragraph}")