In [1]:
import datasets
import torch
import json
import transformers
import pprint as pp
import numpy as np
from tqdm import tqdm

# Seed every RNG the notebook relies on. Generation below samples with
# do_sample=True, which draws from torch's generator, so seeding NumPy alone
# does not make the sampled text repeatable.
SEED = 2024
np.random.seed(SEED)
torch.manual_seed(SEED)

# Checkpoint to load and the file the rephrased dataset is written to.
model_name = "HuggingFaceH4/zephyr-7b-alpha"
save_path = './processed_qa_scientific_papers-zephyr2.json'

# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch.cuda.empty_cache()
device

'cuda:0'

In [2]:
# Load the weights directly in float16 instead of building the default float32 model
# and converting it with .half() afterwards; this avoids holding a full-precision
# copy of the model in host memory during loading.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.float16
).to(device)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [3]:
# Cap on the number of tokens generated per chunk; passed to model.generate as
# max_new_tokens (it does not limit the prompt length).
MAX_LENGTH = 512

In [4]:
# The 'scientific_papers' dataset is built by a loading script hosted with the dataset.
# `datasets` warns that opting in will become mandatory, so do it explicitly.
# NOTE: trust_remote_code=True executes that script locally; only enable it for
# sources you trust.
ds = datasets.load_dataset(
    'scientific_papers', 'arxiv', split='validation', trust_remote_code=True
)
ds.info

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetInfo(description='\nScientific papers datasets contains two sets of long and structured documents.\nThe datasets are obtained from ArXiv and PubMed OpenAccess repositories.\n\nBoth "arxiv" and "pubmed" have two features:\n  - article: the body of the document, pagragraphs seperated by "/n".\n  - abstract: the abstract of the document, pagragraphs seperated by "/n".\n  - section_names: titles of sections, seperated by "/n".\n\n', citation='\n@article{Cohan_2018,\n   title={A Discourse-Aware Attention Model for Abstractive Summarization of\n            Long Documents},\n   url={http://dx.doi.org/10.18653/v1/n18-2097},\n   DOI={10.18653/v1/n18-2097},\n   journal={Proceedings of the 2018 Conference of the North American Chapter of\n          the Association for Computational Linguistics: Human Language\n          Technologies, Volume 2 (Short Papers)},\n   publisher={Association for Computational Linguistics},\n   author={Cohan, Arman and Dernoncourt, Franck and Kim, Doo Soon and Bu

In [5]:
# Exploratory pass over the first 10 articles: split the article text on newlines
# and join every 7 consecutive lines into one paragraph.
# `paragraphs` is rebound on each iteration, so after the loop it holds the
# paragraphs of the last selected article only.
# (A former `if len(sentences) < 256: pass` check was a no-op and has been removed.)
SENTENCES_PER_PARAGRAPH = 7

for article in tqdm(ds.select(range(10))):
    sentences = article['article'].split("\n")
    paragraphs = [
        " ".join(sentences[i:i + SENTENCES_PER_PARAGRAPH])
        for i in range(0, len(sentences), SENTENCES_PER_PARAGRAPH)
    ]

100%|██████████| 10/10 [00:00<00:00, 9700.06it/s]


In [17]:
# Pretty-print the raw text of the article at index 9 for manual inspection.
pp.pprint(ds[9]['article'])

('coupled - cluster ( cc ) method  @xcite is a powerful and ubiquitous '
 'technique for solving quantum many - body problem .\n'
 'let us briefly recapitulate general features of the cc method , so we can '
 'motivate our further discussion\n'
 '. at the heart of the cc method lies the exponential ansatz for the exact '
 'many - body wavefunction @xmath5 here @xmath6 is the cluster operator '
 'involving amplitudes @xmath7 of @xmath8-fold particle - hole excitations '
 'from the reference slater determinant @xmath9 . the parametrization   is '
 'derived from rigorous re - summation of many - body perturbation theory ( '
 'mbpt ) series . from solving the eigenvalue equation one determines the '
 'cluster amplitudes and the associated energies .\n'
 'while the ansatz  ( [ eq : ccparam ] ) contains an _ infinite _ number of '
 'terms due to expansion of the exponent , the resulting equations for cluster '
 'amplitudes @xmath7 contain a _\n'
 'finite _ number of terms .\n'
 'this simplif

In [15]:
# Peek at the first joined chunk currently held in `paragraphs`.
paragraphs[0]

'coupled - cluster ( cc ) method  @xcite is a powerful and ubiquitous technique for solving quantum many - body problem . let us briefly recapitulate general features of the cc method , so we can motivate our further discussion . at the heart of the cc method lies the exponential ansatz for the exact many - body wavefunction @xmath5 here @xmath6 is the cluster operator involving amplitudes @xmath7 of @xmath8-fold particle - hole excitations from the reference slater determinant @xmath9 . the parametrization   is derived from rigorous re - summation of many - body perturbation theory ( mbpt ) series . from solving the eigenvalue equation one determines the cluster amplitudes and the associated energies .'

In [5]:
# Rephrase the first 10 articles chunk by chunk with the chat model, then dump the
# results (rephrased article + original abstract and section names) to `save_path`.

# Instruction shared by every chunk. It is a plain string: it has no placeholders,
# so no f-prefix is needed.
SYSTEM_PROMPT = "You produce the next paragraph in high quality wikipedia-like English based on the content the user provides. You change any formatting into Wikipedia style. You rephrase, don't summarize. Do not start with an introductory phrase, simply start writing the content, which should simply be a rephrased version of what the user provides."

processed_data = []

for article in tqdm(ds.select(range(10))):
    processed_article = []

    sentences = article['article'].split("\n")
    # Overlapping windows: every 3rd line starts a chunk that runs from one line before
    # it through three lines after it. The start index is clamped at 0 because a
    # negative start (i - 1 == -1 when i == 0) slices from the END of the list, which
    # made the first chunk empty and silently dropped the article's opening lines.
    paragraphs = [" ".join(sentences[max(i - 1, 0):i + 4]) for i in range(0, len(sentences), 3)]

    # Processing each chunk
    for chunk in paragraphs:
        # Nothing to rephrase: an empty user message would only make the model invent text.
        if not chunk.strip():
            continue

        # NOTE(review): an earlier version fetched the previously generated paragraph as
        # "context" for chunks after the first, but never put it into the prompt, so both
        # branches built the same messages. The prompt is kept as it actually was; feed
        # processed_article[-1] in here if continuity context is really wanted.
        prompt = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": chunk},
        ]

        # Generate response from the model
        text = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
        model_inputs = tokenizer([text], return_tensors="pt", padding=True).to(device)
        generated_ids = model.generate(
            model_inputs.input_ids,
            attention_mask=model_inputs.attention_mask,
            max_new_tokens=MAX_LENGTH,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
        # generate() returns prompt + continuation; keep only the newly generated tokens.
        generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        pp.pprint(response)
        processed_article.append(response)

    # Store processed article
    processed_data.append({'article': " ".join(processed_article), 'abstract': article['abstract'], 'section_names': article['section_names']})


# Save results to a file
try:
    with open(save_path, 'w') as f:
        json.dump(processed_data, f)
    print(f"Processed dataset saved to {save_path}")
except IOError as e:
    # Best effort: report the failure instead of raising, so `processed_data` stays
    # available in the kernel for another save attempt.
    print(f"Failed to save the file: {e}")


  0%|          | 0/10 [00:00<?, ?it/s]

('The interest in anchoring phenomena and confined nematic liquid crystal '
 'systems has been primarily driven by their potential application in liquid '
 'crystal display devices. An excellent example is the twisted nematic liquid '
 'crystal cell, which consists of a nematic liquid crystal confined between '
 'two parallel walls with homogeneous planar anchoring but perpendicular easy '
 'directions. As this technology has improved, it has been utilized in a wide '
 'range of display devices, from computer monitors and televisions to mobile '
 'phones and digital signage.')
("The configuration of nematic liquid crystal's director is manipulated by the "
 'application of external electric or magnetic fields. A precise control of '
 'the surface alignment over extensive areas is integral to the functional '
 'operation of these devices. The majority of research in this field has '
 'concentrated on nematic liquid crystals in close proximity with laterally '
 'uniform substrates. Howev

  0%|          | 0/10 [00:54<?, ?it/s]


KeyboardInterrupt: 

In [None]:

# Output location for the rephrased dataset (this rebinds the notebook-level `save_path`).
save_path = 'processed_scientific_papers.json'

# Write the whole result list to disk as a single JSON document.
with open(save_path, 'w') as out_file:
    out_file.write(json.dumps(processed_data))

print(f"Processed dataset saved to {save_path}")
