In [None]:
!pip install transformers datasets evaluate rouge_score



In [None]:
import os

from huggingface_hub import notebook_login, login

# Do not hardcode a Hub access token in the notebook (it ends up in version
# control and shared copies). Read it from the HF_TOKEN environment variable;
# when that is unset, login(token=None) falls back to an interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
from datasets import load_dataset

# Download only the "train" split of the GovReport summarization corpus;
# the train/test partition is created from it in a later cell.
govReport = load_dataset(
    "ccdv/govreport-summarization",
    split="train",
)

In [None]:
# Hold out 20% of the data for evaluation. The seed is fixed so that the
# train/test partition is the same on every run of the notebook.
govReport = govReport.train_test_split(test_size=0.2, seed=42)

In [None]:
# Display the first training record to inspect its fields.
govReport["train"][0]

In [None]:
from transformers import AutoTokenizer

# Hub identifier of the pretrained checkpoint; the tokenizer is loaded from
# this same identifier.
checkpoint = "microsoft/prophetnet-large-uncased"

tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
)

In [None]:
prefix = "summarize: "

def preprocess_data(examples):
    """Tokenize a batch of documents and their reference summaries.

    Args:
        examples: A batch mapping column names to lists. The source document
            is read from the "report" column (the column used by
            ccdv/govreport-summarization); an "article" column is accepted as
            a fallback. Reference summaries are read from "summary".

    Returns:
        The tokenizer output for the prefixed documents, with the token ids
        of the summaries stored under the "labels" key.
    """
    # GovReport stores the document under "report"; keep "article" working
    # for datasets that use that name instead.
    documents = examples["report"] if "report" in examples else examples["article"]
    prepended_documents = [prefix + document for document in documents]

    # Never ask for more tokens than the tokenizer/model supports: a longer
    # sequence than the model's position table would fail at training time.
    # When the tokenizer does not declare a limit, the original 1024 is used.
    input_max_length = min(1024, getattr(tokenizer, "model_max_length", 1024))
    tokenized_inputs = tokenizer(
        prepended_documents, max_length=input_max_length, truncation=True
    )

    # Tokenize the summary labels
    summaries = examples["summary"]
    tokenized_labels = tokenizer(text_target=summaries, max_length=128, truncation=True)

    # Assign the tokenized labels to the "labels" key in the tokenized_inputs dictionary
    tokenized_inputs["labels"] = tokenized_labels["input_ids"]

    return tokenized_inputs

In [None]:
import os

# Tokenize both splits in batches. The mapped function is `preprocess_data`
# (the name defined in this notebook). The worker count is capped at the
# number of available CPU cores instead of always spawning 100 processes.
tokenized_govReport = govReport.map(
    preprocess_data,
    batched=True,
    num_proc=min(100, os.cpu_count() or 1),
)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Dynamically pads inputs and labels per batch. The optional `model` argument
# expects a model *object*; the checkpoint name (a string) was being passed,
# which the collator cannot use, so it is omitted here.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [None]:
import evaluate

# Load the ROUGE and BLEU metric implementations (in that order) from the
# `evaluate` library.
rouge, bleu = (evaluate.load(metric_name) for metric_name in ("rouge", "bleu"))

In [None]:
import numpy as np
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from nltk.translate.meteor_score import meteor_score
from concurrent.futures import ThreadPoolExecutor

def compute_bleu_score(pred_ref_pair):
    """Compute smoothed sentence-level BLEU for one (prediction, reference) pair.

    Args:
        pred_ref_pair: A 2-tuple ``(pred, ref)``. Each element is either a
            string (split on whitespace into word tokens) or an already
            tokenized sequence.

    Returns:
        The BLEU score as returned by ``sentence_bleu``, or 0.0 when the
        prediction has no tokens.
    """
    pred, ref = pred_ref_pair

    # sentence_bleu works on token sequences. Passing raw strings makes it
    # iterate over characters, i.e. it would score character n-grams.
    pred_tokens = pred.split() if isinstance(pred, str) else list(pred)
    ref_tokens = ref.split() if isinstance(ref, str) else list(ref)

    if not pred_tokens:
        return 0.0

    smoothing = SmoothingFunction().method1
    return sentence_bleu([ref_tokens], pred_tokens, smoothing_function=smoothing)

def metrics_evaluation(evaluation_predictions):
    """Compute ROUGE, mean sentence BLEU and mean generated length.

    Args:
        evaluation_predictions: A ``(predictions, labels)`` pair of token-id
            arrays. Positions equal to -100 are treated as padding.

    Returns:
        A dict with the ROUGE scores plus ``"bleu"`` (mean of
        ``compute_bleu_score`` over all pairs) and ``"gen_len"`` (mean number
        of non-padding prediction tokens), each rounded to 4 decimals.
    """
    predicted_outputs, associated_labels = evaluation_predictions
    if isinstance(predicted_outputs, tuple):
        predicted_outputs = predicted_outputs[0]

    # -100 is not a valid token id and cannot be decoded; map it to the pad
    # token in both predictions and labels before decoding.
    pad_id = tokenizer.pad_token_id
    predicted_outputs = np.where(predicted_outputs != -100, predicted_outputs, pad_id)
    processed_labels = np.where(associated_labels != -100, associated_labels, pad_id)

    decoded_predicted_outputs = tokenizer.batch_decode(predicted_outputs, skip_special_tokens=True)
    decoded_processed_labels = tokenizer.batch_decode(processed_labels, skip_special_tokens=True)

    # Use Rouge to compute scores for the decoded predictions and decoded labels
    result = rouge.compute(
        predictions=decoded_predicted_outputs,
        references=decoded_processed_labels,
        use_stemmer=True,
    )

    # Compute BLEU scores (one per prediction/reference pair, then averaged)
    with ThreadPoolExecutor() as executor:
        bleu_scores = list(
            executor.map(
                compute_bleu_score,
                zip(decoded_predicted_outputs, decoded_processed_labels),
            )
        )
    result["bleu"] = np.mean(bleu_scores) if bleu_scores else 0.0

    # Compute average length of generated predictions
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predicted_outputs]
    result["gen_len"] = np.mean(prediction_lens) if prediction_lens else 0.0

    # Plain Python floats keep the result loggable/serializable.
    return {k: round(float(v), 4) for k, v in result.items()}

In [None]:
from transformers import (
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

# Load the pretrained sequence-to-sequence model from the hub identifier
# stored in `checkpoint`.
model = AutoModelForSeq2SeqLM.from_pretrained(
    checkpoint,
)

In [None]:
!apt install git-lfs

In [None]:
# Training configuration: evaluate once per epoch, generate summaries during
# evaluation (needed for ROUGE/BLEU), and push checkpoints to the Hub.
training_args = Seq2SeqTrainingArguments(
    output_dir="Prophetnet_GovReport_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=14,  # Adjusted for 3 GPUs with 26 GB memory each
    per_device_eval_batch_size=14,   # Adjusted for 3 GPUs with 26 GB memory each
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    # NOTE(review): this was annotated "Adjusted for 48 CPU cores" but the
    # value is 80 worker processes per dataloader -- confirm the intended count.
    dataloader_num_workers=80,
    fp16=True,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_govReport["train"],
    eval_dataset=tokenized_govReport["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    # The metrics function defined in this notebook is `metrics_evaluation`;
    # `compute_metrics` was never defined and raised a NameError here.
    compute_metrics=metrics_evaluation,
)

# Train the model
trainer.train()

In [None]:
# Sample news article used as inference input. It carries the same
# "summarize: " prefix that is prepended to the training documents.
# Written as adjacent literals inside parentheses: a double-quoted literal
# cannot span a physical line break, and implicit concatenation yields one
# continuous string with no newline inserted.
text = (
    "summarize: People in North Korea have told the BBC food is so scarce their neighbours have starved to death. Exclusive interviews gathered inside the world's most isolated state suggest the situation is the worst it has been since the 1990s, experts say. The government sealed its borders in 2020, cutting off vital supplies. It has also tightened control over people's lives, our interviewees say. Pyongyang told the BBC it has always prioritised its citizens' interests. The BBC has secretly interviewed three ordinary people in North Korea, with the help of the organisation Daily NK which operates a network of sources in the country. They told us that since the border closure, they are afraid they will either starve to death or be executed for flouting the rules. It is extremely rare to hear from people living in North Korea. The interviews reveal a devastating tragedy is unfolding in the country, said Sokeel Park from Liberty in North Korea (LiNK), which supports North Korean escapees. One woman living in the capital Pyongyang told us she knew a family of three who had starved to death at home. We knocked on their door to give them water, but nobody answered, Ji Yeon said. When the authorities went inside, they found them dead, she said. Ji Yeon's name has been changed to protect her, along with those of the others we interviewed. A construction worker who lives near the Chinese border, whom we have called Chan Ho, told us food supplies were so low that five people in his village had already died from starvation. At first, I was afraid of dying from Covid, but then I began to worry about starving to death, he said. North Korea has never been able to produce enough food for its 26 million people. When it shut its border in January 2020, authorities stopped importing grain from China, as well as the fertilisers and machinery needed to grow food. "
    "Meanwhile, they have fortified the border with fences, while reportedly ordering guards to shoot anyone trying to cross. This has made it nearly impossible for people to smuggle in food to sell at the unofficial markets, where most North Koreans shop. A market trader from the north of the country, whom we have named Myong Suk, told us that almost three quarters of the products in her local market used to come from China, but that it was empty now. She, like others who make their living selling goods smuggled across the border, has seen most of her income disappear. She told us her family has never had so little to eat, and that recently people had been knocking on her door asking for food because they were so hungry. From Pyongyang, Ji Yeon told us she had heard of people who had killed themselves at home or disappeared into the mountains to die, because they could no longer make a living."
)

In [None]:
text ="White-nose syndrome is one of the most lethal wildlife diseases, killing over 5 million North American bats since it was first reported in 2006. The causal agent of the disease is a psychrophilic filamentous fungus, Pseudogymnoascus destructans. The fungus is widely distributed in North America and Europe and has recently been found in some parts of Asia, but interestingly, no mass mortality is observed in European or Asian bats. Here we report a novel double-stranded RNA virus found in North American isolates of the fungus and show that the virus can be used as a tool to study the epidemiology of White-nose syndrome. The virus, termed Pseudogymnoascus destructans partitivirus-pa, contains 2 genomic segments, dsRNA 1 and dsRNA 2 of 1. 76 kbp and 1. 59 kbp respectively, each possessing a single open reading frame, and forms isometric particles approximately 30 nm in diameter, characteristic of the genus Gammapartitivirus in the family Partitiviridae. Phylogenetic analysis revealed that the virus is closely related to Penicillium stoloniferum virus S. We were able to cure P. destructans of the virus by treating fungal cultures with polyethylene glycol. Examination of 62 isolates of P. destructans including 35 from United States, 10 from Canada and 17 from Europe showed virus infection only in North American isolates of the fungus. Bayesian phylogenetic analysis using nucleotide sequences of the viral coat protein geographically clustered North American isolates indicating fungal spread followed by local adaptation of P. destructans in different regions of the United States and Canada. This is the first demonstration that a mycovirus potentially can be used to study fungal disease epidemiology. Pseudogymnoascus destructans (Pd; previously named Geomyces destructans) is an emerging fungal pathogen responsible for a fatal disease, white-nose syndrome (WNS) in hibernating bats in North America [1–3]. 
Experts estimate over 5 millions bats died from WNS in North America since the disease was first noted in New York in 2006 [4–6]. Currently WNS has spread to at least 29 states in the United States (plus three additional states where Pd presence has been confirmed, but not WNS) and five provinces in Canada [4]. The fungus is widely distributed in Europe [6,7] and recently has been reported from the northeast of China and Siberia [8,9], but no mass mortality has been reported in European or Asian bats [6,8]. Pd’s lethal effect on North American bats coupled with its clonal genotype in North American isolates [10,11], its single mating type [12] and the absence of close relatives [13] led many researchers to hypothesize a recent introduction to North America [1,6, 14,15]. Pd is confirmed in seven North American [1,4] and 13 European species of bats [4,9]. The natural history of the genus Pseudogymnoascus and its allies indicate that they are commonly isolated from soils in colder regions of the world [16]. Unlike Pd many of its close relatives are cellulolytic saprobes and non-pathogenic [16,17]. Mycoviruses associated with fungi have drawn interest because of their potential roles in fungal biology and pathogenicity [18]. Mycoviruses are very frequent in fungi and generally maintain a persistent lifestyle [19]. Horizontal transmission is very rare, and is likely restricted to closely related strains, although phylogenetic studies indicate transmission among species has occurred [20]. Transmission has only been documented in a few cases outside the laboratory [21]. Most mycoviruses are cryptic with no known biological effects on their fungal hosts, although there is a lack of research in this area. However, there are significant examples where mycoviruses play important roles in fungal biology and ecology [22]. Here we used mycoviruses of Pd as a tool to study the epidemiology of WNS. 
We investigated mycoviruses in Pd and show that population variation of a Pd-mycovirus can be useful in tracing the spread of WNS. We examined 62 isolates of Pd from North American and European bats for mycoviruses (Table 1). The isolates were cultured from four North American and one European species of bats and were collected from 2008 to 2015. North American isolates included 14 from Pennsylvania, seven from New York, six from West Virginia, three from North Carolina, three from Vermont, one from Ohio, one from Indiana and 10 from New Brunswick, Canada. We screened 16 isolates of Pd from the Czech Republic and one isolate from Slovakia in Europe. Double-stranded RNA (dsRNA) extracted from all North American isolates showed two bands—a larger band close to 1. 8 kb (RNA 1) and a smaller band close to 1. 6 kb (RNA 2) in electrophoretic analysis (Fig 1A). None of the European isolates contained these dsRNAs, although two, CCF-4127 and CCF-4128, had dsRNAs profiles different from that of the North American isolates (Fig 1B). We found no dsRNAs of viral origin in five isolates of Geomyces sp. from Antarctic soil or in six isolates of Pseudogymnoascus sp. from cave soils in Pennsylvania (S1 Table). The dsRNA enrichment method is based on the premise that uninfected plants or fungi normally do not contain detectable amounts of high molecular weight dsRNA, and, when present, dsRNA is an indicator of a viral genome [23]. Sanger sequencing of cDNA clones from RNAs 1 and 2 of the North American isolates of Pd obtained from random primed RT-PCR provided nearly complete genomic sequences; ends were determined by 5' - primer ligated RNA ligase mediated-rapid amplification of cDNA ends (RLM-RACE) [24] providing consensus genomic sequences for RNAs 1 and 2 of 1761 bp and 1590 bp. Northern-blots using cDNA clones from RNA 1 or RNA 2 as probes confirmed the identity of the dsRNA bands (Fig 1C). 
We named this new virus Pseudogymnoascus destructans partitivirus-pa (PdPV-pa; the pa indicates the sequenced isolate is from Pennsylvania). A BLASTx search of GenBank showed closest similarity of RNA 1 of PdPV-pa with RNA 1 of Penicillium stoloniferum virus S (PsV-S), with 76% amino acid (aa) identity. Similarly, RNA 2 of Pd showed closest similarity with the RNA 2 of PsV-S with 67% aa identity. PsV-S is the type species of the genus Gammapartitivirus in the family Partitiviridae [25]. Sequence analysis of RNA 1 of PdPV-pa predicted a single open reading frame (ORF) of 540 aa (60 kDa) that codes for a putative RNA-dependent RNA polymerase (RdRp) (Fig 2A). RNA 2 also contained a single ORF of 470 aa (52 kDa) that codes for a putative coat protein (CP) (Fig 2B). Amino acid level sequence identity of PdPV-pa RdRp and CP with the approved members of genus Gammapartitivirus in the family Partitiviridae ranges from 58% - 76% and 36% - 67% respectively, which are within the species demarcation criteria (RdRp ≤ 90%; CP ≤ 80%) of the genus [42]. Further, the 5' termini of PdPV-pa RNAs 1 and 2 coding strand share a conserved CGCAAAA… sequence, where G is followed by A, U, or C but not G in the next 5 to 6 nucleotide positions, characteristic of the genus Gammmapartitivirus [25] (Fig 2C). Similarly, the 3' terminal 50 nucleotides of RNAs 1 and 2 were adenosine (A) rich in the range (7–24 nt) typical of members of the Gammapartitivirus genus [25] (Fig 2D). PdPV-pa particles were purified from mycelia of Pd and negative-stain transmission electron microscopy showed isometric particles of approximately 30 nm diameter, characteristic of members of the Partitiviridae (Fig 3A). PdPV-pa dsRNAs were also extracted from the purified virus particles to reconfirm their presence as genomic RNAs (Fig 3B). Bayesian trees constructed using aa sequences from the RdRp and CP of PdPV-pa clustered PdPV-pa with other members of genus Gammapartitivirus in the Partitiviridae family (Fig 4A & 4B). 
In both RdRp and CP trees, PdPV-pa appeared as a sister branch to PsV-S with strong posterior probability support of 92% and 100% respectively suggesting PdPV-pa is evolutionary close to PsV-S. The genome structure of PdPV-pa, conserved features in its RNAs explained above, its particle morphology, its RdRp and CP amino acid sequence identity within species demarcation criteria, and phylogenetic analyses all confirmed that PdPV-pa is a novel partitivirus belonging to genus Gammapartitivirus in the family Partitiviridae. We attempted several methods including single spore isolation, hyphal tip culture, protoplast culture, heat therapy and nutritional and chemical stress that involved application of the antiviral drugs cycloheximide or ribavirin, to cure Pd of the PdPV-pa infection. However, only partial success was achieved with high concentrations of cycloheximide (25 μg/ml) and ribavirin (300 μM) treatments after three passages. PdPV-pa remained suppressed in the fungus treated with cycloheximide or ribavirin when grown in media with the drug but once the fungus was transferred to drug-free media the virus reappeared. Finally, our attempt to cure the fungus using polyethylene glycol (PEG) -induced matric potential in minimal nutrition media made PdPV-pa undetectable. PdPV-pa infection in Pd was checked under matric potential gradients starting from -2MPa, -3MPa to -4MPa. We did not observe visible germination of Pd conidia or mycelia growth at -5MPa and -6MPa. PdPV-pa was undetected in PEG treated Pd isolates when evaluated by dsRNA extraction and RT-PCR with RdRp specific primers for PdPV-pa (Fig 5A & 5B). The detection limit of PdPV-pa in Pd was determined to be approximately 380 copies per cell (S1 Appendix). We enriched the viral dsRNA from total nucleic acid extracted from a defined number of Pd conidia followed by measurement of dsRNA concentration, and serial dilutions to determine the end-point of detection. 
Pd isolates where PdPV-pa was undetected after PEG treatment lost the characteristic gray pigmentation of wild type Pd and appeared white (Fig 6A). The virus-free isolate also produced significantly less conidia in comparison to wild type isolate (Fig 6B). Although PEG treatments were successful in obtaining a PdPV-pa free isolate of Pd, PdPV-pa tolerance to many other stresses mentioned above indicate that PdPV-pa is tightly associated with the Pd isolates from North America. Genetic variability of the RdRp and CP regions was analyzed in 45 North American isolates of PdPV-pa by amplification using specific primers followed by sequence analysis (Fig 7A & 7B). Using a 930 bp region of RdRp amplicons after trimming and alignment, we found the average percentage identity ranged from 99. 7 to 99. 9 among the 45 isolates. The high level of conservation in the RdRp is also reflected by a total of only 15 segregating sites, including seven singletons among the isolates examined. For the CP, nucleotide variability was higher: in a 1088 bp of amplicon of the CP, the average percent identity ranged from 96. 8 to 98. 4 and included 127 segregating sites out of which 69 were singletons. The Bayesian tree based on the RdRp nucleotide sequences of 45 North American isolates of PdPV-pa produced a largely unresolved tree with no clusters with significant support. However, the Bayesian tree constructed from the nucleotide sequences of the CP clustered the 45 PdPV-pa isolates into two major clades based on their geographical distribution (Fig 8). One clade was comprised of Canadian isolates; the other clade included isolates from the USA, although the posterior probability of this separation was lower than for other branching in the tree. The USA clade further included well supported clusters of isolates from New York, Pennsylvania, West Virginia, North Carolina, Vermont, Indiana and Ohio. Indiana and Ohio had one isolate each and separated as sister branches. 
The separate topologies of USA and Canadian clusters indicate independent diversification of Pd isolates subsequent to movement to particular regions. Within each major clade there were examples of sub-branching topologies representing isolates based on their local distribution although the pattern was not consistent throughout. The phylogeny of the PdPV-pa isolates showed no structure based on the taxonomy of the bats indicating that Pd is a generalist pathogen that is transmitted readily across bat species. In this study, we isolated and characterized a novel virus, PdPV-pa, from the pathogenic filamentous fungus causing WNS in North American bats. Based on the nucleotide sequence, sequence properties at the 5' and 3' termini, genome organization, morphology of the virus particle and phylogenetic analysis, PdPV-pa was confirmed as a new member of the genus Gammapartitivirus, family Partitiviridae. PdPV-pa shows closest similarity with PsV-S within Gammapartitivirus. The branch supports of over 90% in posterior probability in the RdRp and 100% in the CP Bayesian trees separating PdPV-pa from PsV-S (Fig 4A & 4B) and Gammapartitivirus species delimitation criteria (≤ 90% aa-sequence identity in RdRp and/or ≤ 80% aa-sequence identity in CP [26]) confirmed PdPV' s taxonomic placement into a distinct species [25]. The occurrence of PdPV-pa infection in Pd isolates from diverse geographical locations and time suggests PdPV-pa is widely spread in North America. We could not rule out the possibility of PdPV-pa incidence in Europe considering the sample size of 17 isolates that we examined in this study. Previously, Warneke et al. [14] reported a Pd isolate from Germany (MmyotGER2) showing similar mortality effects to North American isolates when inoculated onto North American little brown bat (M. lucifugus) under experimental conditions. Unfortunately, we were not able to obtain the German isolate to evaluate the presence of PdPV-pa. 
However the close association of PdPV-pa in a diverse subset of the North American population of Pd sampled (35 isolates from 7 states) may provide some indications of the roles of PdPV-pa in WNS. Many mycoviruses have been reported to elicit phenotypic changes, including both hypovirulence and hypervirulence in their fungal hosts [18]. For example, the presence of Helminthosporium victoriae 145S virus (chrysovirus) in the plant pathogenic fungus, Helminthosporium victoria increased virulence in oat plants. The viral dsRNAs up-regulated Hv-p68, an alcohol oxidase/RNA-binding protein in the fungus that is likely responsible for the disease development [27]. Similarly, a high level of virulence was reported in the presence of a six kbp mycoviral dsRNA in Nectria radicicola, the causal fungus of ginger root rot [28]. The opportunistic fungal pathogen, Aspergillus fumigatus causing lung disease in immunocompromised humans and animals also exhibited hypervirulence in the presence of the uncharacterized A78 mycovirus [29]. We have not explored the roles of PdPV-pa in WNS in the present study, but some indirect evidence, including the difficulties in curing the fungus of PdPV-pa, the stability of the virus after numerous generations of laboratory cultures, the changes in pigmentation and the significantly reduced production of conidia in the virus-free isolate indicate close biological relationships between the fungus and the virus; hence future investigation on potential biological effects of PdPV-pa will be important. In our attempts to cure PdPV-pa, PEG-induced stress on the matric potential was found effective. PEG being non-toxic and metabolically inert to fungi is an ideal compound to manipulate matric-induced water stress in media [30]. Matric potential influences water availability of substrates through capillary actions and particle adsorptive forces [31]. 
Raudabaugh & Miller [32] showed that Pd is sensitive to matric induced water stress beyond -5MPa, which is consistent with our results. In addition to the Pd growth response, normal growth at lower matric stress and significant growth inhibition as negative values of matric potential increases are characteristic of most soil fungi [32,33]. It is possible that Pd may have originated as a soil fungus and the adaptive pressure due to competition expanded its niche. The capacity of a human pathogenic fungus, Cryptococccus neoformans, to infect several animals including cats, dogs, dolphins, sheep and many birds was explained based on the environmental selective pressures imposed on it while surviving in its primary niche: soil [34]. The recent findings that Pd is capable of surviving on various substrates like harvestmen, fungus gnats, moss, and cave soils in addition to bat skin [32,35,36], support this argument. Whether or not Pd susceptibility to matric stress is related to its origin, the inhibitory effect of the matric stress on both Pd and PdPV-pa confirms parallel biological response of both the virus and the fungus. The genetic variation in the RdRp (<1%) and the CP (2–3%) of North American populations of PdPV-pa seems low, but in fact is quite high for partitiviruses. In studies with plant partitiviruses we find less than 1% divergence after extended periods of evolution (MR, personal observation). This higher level of variation implies a recent introduction of PdPV-pa. According to our results, only one species of this virus appears to occur in the North American isolates of Pd. The phylogenetic analysis based on a Bayesian algorithm of CP nucleotide sequences showed geographical clustering of 45 North American isolates into two main clades: USA and Canada. This indicates the diversification of PdPV-pa isolates is the outcome of geographical separation followed by sequence variation. No bat host specialization was observed. 
This finding is consistence with the clonal populations of Pd reported previously [10,11] with only one mating type [12] despite its infection in several species of bats in North America. The phylogenetic signatures of PdPV-pa isolates relating to geography provide valuable insights on the spread of WNS. The phylogeny supports two major clusters and many sub-clusters corresponding to US States of PdPV-pa isolation, suggesting connections among North American isolates, which is valuable in tracing WNS. Additionally, clustering of Pd isolates based on location was observed in several occasions within the USA clades followed by divergence, most likely for local adaptation. This analysis can be successfully expanded incorporating CP sequences of PdPV-pa from wider geographical locations to study the spread of WNS. Pseudogymnoascus destructans (Pd) was isolated from diseased bat wing tissue, live bat wing punches (2-5mm diameter) or wing swabs, cultured on 0. 5X (7. 5 g/L) Sabouraud dextrose agar (SDA) plates with 20 μg/ml of ampicillin, streptomycin and tetracycline at 10° C for 3 weeks in the dark. Identification of Pd was confirmed based on the species morphological characters i. e. , the presence of curved conidia [1] and DNA sequences from conserved regions: internal transcribed spacer1 (ITS1), elongation factor 1α (EF-1α) and glyceraldehyde 3-phosphate dehydrogenase (gdp) genes. The pure cultures of Pd were obtained either by single spore isolation or hyphal tip cultures. For single spore cultures, actively growing Pd plates (100 mm X 15 mm) of over three weeks old were flooded with 2 ml of sterile water and gently swirled to release the spores (conidia). The spore suspension was vortexed for one minute to avoid clumping of spores. The spore suspension was then picked using an inoculating loop and spread over water agar plate (19 g/L). About 1 ml of sterile water was added in the process to help to spread the spores uniformly. 
The plate was viewed under a dissecting microscope and concentration of the spore suspension was adjusted so that each plate had 20–30 spores. The plate was then cultured at 7°-10°C in the dark and checked for germination every alternate day. Once the spores germinated, an agar plug was cut containing hyphae from the single germinating spore without damaging growing hyphae and then plated on a regular SDA plate to culture. For hyphal tip culture, we used the protocols described by Kanematsu et. al. [37] with some modification. We plated spore suspension on regular SDA plates as described above but when spores geminated and mycelia mats were formed they were gently overlaid with sterile Whatman cellulose filter paper soaked in SDB. The plates were then cultured for an additional two weeks until the fungal hyphae penetrated the filter paper and started growing on the upper surface. At that point the filter paper was removed and its upper surface was scraped gently and hyphal segments were suspended in sterile water. The method produced hyphal segments ranging from 4–8 cells in length that were appropriate for the hyphal tip culture. The hyphal segment suspension was then plated on SDA plates adjusting the concentration so that each plate had uniform distribution of 20–30 hyphal segments. Finally agar plugs grown from individual hyphal segments were cultured in separate plates to obtain a pure culture. The fungal isolates were stored in SDA plates for short-term storage at 4°C and at -80°C in the form of mycelia in 50% glycerol for long-term storage. All Pd isolates from Pennsylvania, one from Vermont and one from Indiana used in this study were isolated and cultured in our laboratory. The substrates (bat wings, wing punches, swabs) for these cultures were obtained from routine surveys of the Pennsylvania Game Commission (http: //www. pgc. pa. gov/Wildlife/Wildlife-RelatedDiseases/WhiteNoseSyndrome). 
The isolates from New York, West Virginia, North Carolina, Ohio, the remaining two isolates from Vermont and all European isolates were obtained as sub-cultures from the Center for Forest Mycology Research, United States Forest Service, Madison, WI (http: //www. fpl. fs. fed. us/research/centers/mycology/culture-collection. shtml). The Canadian isolates were obtained as sub-cultures from New Brunswick Museum collections, New Brunswick, Canada (http: //www. nbm-mnb. ca). In addition, we obtained five isolates of Geomyces sp. collected from Antarctic soil from Dr. Robert A. Blanchette’s collection at the University of Minnesota and we used six isolates of Pseudogymnoascus sp. from cave soil in Pennsylvania for this study. We extracted dsRNAs from lyophilized mycelia of Pd with a minor modification in the protocol described by Márquez et. al. [38], specifically Pd was cultured using mycelial plugs or spores in 150 ml of 0. 5X Sabouraud dextrose broth (SDB) supplemented with 20 μg/ml of ampicillin, streptomycin and tetracycline in a shaker at 10°C under dark conditions for three weeks prior to lyophilization. In addition to binding to CF11 cellulose (Whatman) in the presence of ethanol, the chemical nature of the dsRNA was confirmed by its resistance to DNase and RNase with NaCl concentration > 0. 3M. Approximately 2 μg of dsRNA were mixed with 2 μM of random primer-dN10 with a linker sequence (5' CCTTCGGATCCTCCN103' ), 0. 5 mM of Tris-EDTA (pH 8. 0) and nuclease-free water to a final volume of 12 μl, and boiled for 2 min. The mixture was incubated on ice, and 8 μl of Reverse Transcriptase (RT) mix (SuperScriptTM III RT 200U, 5X First-Strand buffer 4 μl, 0. 1M DTT 1 μl and dNTP 0. 5 mM as recommended by the manufacturer) were added and incubation continued at 50°C for 1. 5 hours. The newly synthesized cDNA mixture was then incubated with 10 μg of boiled RNase A (Sigma) for 15 min. at room temperature and cleaned with E. Z. N. 
A Cycle Pure Kit (Omega Bio-tech) according to the manufactures instruction. About 0. 5 μg of cleaned cDNA was used as a template for a 25 μl polymerase chain reaction (PCR) with Taq DNA Polymerase (ThermoFisher Scientific), buffers, dNTPs supplied with 1μM concentration of the primer (5' CCTTCGGATCCTCC 3' ). The PCR was completed in a Idaho Technologies Rapid Cycler with a slope setting of 5, using the following cycles: 1 cycle of 94°C, 1 min. ; 25 cycles of 94°C, 0 sec. , 45°C, 0 sec. , and 72°C, 15 sec. ; 1cycle of 72°C, 5 min. ; 1 cycle of 37°C, 5 min. The PCR product was cleaned and cloned into the pGEM-T Easy Vector System (Promega) according to the manufacturers instructions. Sequence analysis of the cDNA plasmid clones were done by the Genomic Core Facility of Pennsylvania State University, University Park, PA. The sequences obtained were trimmed for plasmid and primer sequences and assembled using de novo assembly in Geneious version 8. 0. 2 [39]. All cloning and sequence analysis was based on the dsRNA from the LB-01 isolate cultured from a little brown bat from Pennsylvania. RNA ligase mediated-rapid amplification of cDNA ends (RLM-RACE) was performed to determine the terminal sequences of the PdPV-pa dsRNA segments. A 5' -phosphorylated oligodeoxynucletide (5' -PO4-GGAGGATCCGAATTCAGG 3' ) was ligated to the dsRNA termini as an adaptor before synthesizing cDNAs using a complementary primer (5' CCTGAATTCGGATCCTCC3' ) in combination with the internal primers designed for PdPV-pa RNA1 and RNA2 (RNA 1: 5' TTCAAGTTCGCCCTGTACC3' F, 5' TGAGCGAATGGAAGGTTG3' R; RNA 2: 5' CGCGTAATCATGACGACC3' F, 5' CCGAGGAGCACACACTATC3' R) in RLM-RACE. Ligation reactions were done in 50% PEG with 2 U of T4 RNA ligase 2 (New England BioLabs) mixed with approximately 2 μg of dsRNA along with the primers mentioned above and buffer supplied according to the manufacturers instructions, and incubated at 4°C overnight. 
RT-PCR of the primer-ligated dsRNA was performed exactly like described in the cDNA synthesis above except the enzyme used was Avian Myeloblastosis Virus (AMV) RT (New England BioLabs). The amplicons were cloned followed by sequence determination using Sanger sequencing. The complete nucleotide sequences of PdPV-pa RNA 1 and PdPV-pa RNA 2 have been deposited in GenBank with accession numbers KY20754 and KY207544, respectively. Consensus sequences for PdPV-pa RNA 1 and RNA 2 were analyzed for the open reading frames (ORFs) using ORF finding operation in Geneious version 8. 0. 2. A sequence similarity search was conducted with BLASTn and BLASTx available online from the National Center for Biotechnology Information (NCBI). Northern blotting was performed using non-radioactive isotopes probes, digoxigenin (DIG) -11-dUTP-labeled DNA fragments according to the manufacturers instructions (Roche Diagnostics). Representative clones of PdPV-pa RNA 1 and RNA 2 in the range of 500–700 bp were selected and the labeling was done in a PCR with DIG-11-dUTP and dNTPs mix (DIG-11-dUTP: dTTP = 1: 3; with equimolar amount of dATP, dCTP and dGTP), Taq DNA Polymerase (ThermoFisher Scientific), specific primers and buffer in Idaho Technologies Rapid Cycler as described above. About 2 μg of PdPV-pa dsRNA was electrophoresed in 1. 2% agarose gels and subsequently denatured by saturating with freshly prepared 50mM NaOH for 30 min followed by neutralization in 50mM sodium borate for 5 min. The cycle was repeated three times before dsRNA was transferred to a nylon membrane (Hybond N+ Amersham) by capillary action overnight. The membranes were UV-cross-linked in a Stratalinker at 200 J. Hybridization and washings were carried out as described by Li et al. [40] except we performed prehybridization and hybridization at 52°C instead of 42°C. The blots were incubated in antibody solution, anti-DIG-AP Conjugate (Roche) and CDP-STAR (Roche) for chemiluminescence detection. 
Virus particles were purified following methods described by Sanderlin and Ghabrial [41] with some modifications. Eight g of lyophilized mycelia of Pd isolate BB-06 was ground to powder in the presence of liquid nitrogen. The homogenates were mixed with extraction buffer (0. 1 M sodium phosphate. pH 7. 6 containing 0. 5% (v/v) thioglycolic acid) and mixed with chloroform followed by low speed centrifugation at 7000 rpm for 15 min at 4°C. The virus containing supernatant was then subjected to two cycles of differential centrifugations (low speed at 7000 rpm for 15 min and ultracentrifuge at 35,000 for 1. 5 hours). During the ultracentrifuge cycle, the virus containing supernatant was underlaid with a 10% sucrose cushion. The final pellets were suspended in 1 ml of 0. 03 M sodium phosphate buffer pH 7. 6. The virus preparation was examined under JEOL 1400 transmission electron microscope after negatively staining with uranyl formate in the Microscopy and Imaging Facility at Penn State College of Medicine, Hersey, PA. For the heat stress, actively growing Pd plates in three replicates were exposed to room temperature (22–23°C), 37°C and 42°C for 2,6, 12 and 24 hours before culturing the mycelia plugs from each treatments in liquid medium (SDB) under normal laboratory culture conditions for Pd described above. During the treatments, Pd plates in three replicates were also grown under normal culture condition as controls. The fungal mycelia were then harvested after three weeks to extract dsRNAs. However, only samples treated at room temperature and 37°C for 2 hours grew. Single spore isolation and hyphal tip cultures were done as described under the section, fungal isolation and culture. The protoplast isolation from Pd was performed on mycelia (~ 1. 7 g) harvested from SDB culture after two weeks at 10°C in a shaker (200 rpm) in the dark. The fungal mycelia were collected by centrifugation at 90 × g for 5 min followed by washing with KCl buffer (0. 6 M, pH 5. 
8) as an osmotic stabilizing agent. The mycelia was treated with lysing enzyme mixture (Lysing enzyme from Trichoderma harzianum 20 mg/ml and driselase 20 mg/ml from Sigma) prepared in KCl buffer and incubated at 10°C at 70 rpm in the dark. Protoplast production was checked every half an hour until 35–40 protoplasts were observed under a 40X field with 10 μl of the mixture. The mixture was then passed through double-layered miracloth (VWR) soaked in STC buffer (1. 2 M Sorbitol; 10 mM Tris-HCl, pH 7. 5; 20 mM CaCl2) to filter out the cell debris. The filtrate was centrifuged at 90 × g for 5 min to collect the protoplasts which were resuspended in regeneration media (0. 5% yeast extract, 2% glucose, 0. 6 M Sorbitol and 25 mM CaCl2) followed by incubation at 10°C at 70 rpm in the dark. Once the protoplasts recovered completely with cell wall growth, they were transferred to agar supplemented regeneration media (0. 5% yeast extract, 2% glucose, 20% sucrose and 1% agar) and the concentration adjusted so that each plate had 25–30 uniformly distributed cells. The plates were then incubated under normal culture condition for Pd until hyphae developed uniformly around each protoplast without touching each other. Individual colonies were then picked and cultured in SDA. We also treated Pd with the antiviral drugs cycloheximide and ribavirin at different concentrations in SDA media. Cycloheximide was used at 2 μg/ml, 5 μg/ml, 10 μg/ml, 15 μg/ml and 25 μg/ml concentrations. Ribavirin treatment was at 80 μM, 100 μM, 150 μM, 200 μM and 300 μM concentrations. Three passages with both cycloheximide and ribavirin were also performed with higher concentrations. For PEG induced matric stress on water availability we used PEG 8000 (Fisher BioRegents) in a modified Spezieller Nährstoffarmer liquid media (SN: 0. 02 g sucrose, 0. 02 g glucose, 0. 08 g KNo3,0. 08 g KH2Po4,0. 04 g MgSo4. 7H2O and 0. 
04 g NaCl/L) to make media with water potential gradients of -1 MPa, -2 MPa, -3MPa, -4 MPa, -5 MPa and -6 MPa. The amount of PEG 8000 in gram/gram of water was calculated based on Michel [42] equation: Ψ (water potential) = 1. 29 [PEG]2T – 140[PEG]2–4 [PEG] and the value was adjusted to the Pd culture temperature of 10°C. An agar plug containing actively growing Pd was placed in 50 ml autoclaved modified SN liquid media with a targeted amount of PEG 8000 (-1 MPa: ~ 0. 075 PEG g/g of water, -2 MPa: ~ 0. 11 PEG g/g of water, -3 MPa: ~ 0. 14 PEG g/g of water, -4 MPa: ~ 0. 16 PEG g/g of water, -5 MPa: ~ 0. 19 PEG g/g of water and -6 MPa: ~ 0. 21 PEG g/g of water) and grown as described above. After three weeks, pieces of newly growing mycelia of Pd were transferred to normal SBD routinely used to culture Pd and the fungus was harvested after a normal culture period. The fungi from different treatments were examined for PdPV-pa both by dsRNAs gel electrophoresis and RT-PCR with PdPV-pa specific primers. In all methods Pd isolate LB-01 was used. Genetic variation in North American PdPV-pa isolates were determined by sequence analysis of RdRp and CP segments amplified in RT-PCR using specific primers. The primer pairs specific to RdRp (5' ATGGAAGTATCTCCTTTCG3' F, 5' GTATAGAAGATTGAGTGCC3' R) and CP (5' ACTCTGTGTTAACGGAGG3' F, 5' CTGTAGTTGACACCTGTACC3' R) were designed from the consensus sequences of RNA 1 and RNA 2 assembled from LB-01 isolate cloned sequences. PCR products using RdRp and CP specific primers from 45 North American PdPV isolates were sequenced and aligned with MUSCLE default settings in the program Geneious 8. 0 [39]. The RdRp sequences have been deposited in GenBank under accession numbers KY207498 to KY207552 and the CP sequences have been deposited in GenBank under accession numbers KY207453 to KY207497. The alignment was visually corrected as necessary before recording segregating and singleton sites. 
The average percentage identity for each sequence was calculated by taking the average from a pairwise percentage identity matrix generated from the sequence alignment. Phylogenetic analysis was performed using MrBayes [43] implemented via a plug-in in Geneious. The amino acid sequences were used in studying the evolutionary relationships of PdPV-pa within the genus Gammapartitivirus. The tree was constructed using amino acid sequence (RdRp and CP) of 10 approved species of Gammapartitivirus available in the GenBank. The sequences of Pepper cryptic virus 1, type member of genus Deltapartitivirus, which is the closest group to Gammapartitivirus in Partitiviridae family was used as outgroup. We used nucleotide sequences (CP) to study phylogenetic relationships of PdPV-pa in North American population. The nucleotide sequence of PsV-S CP was used as outgroup in the analysis. In Bayesian trees construction using amino acid sequence of the RdRp and CP ORFs, Jukes-Cantor substitution model was applied and for nucleotide sequences of CP General time-reversible (GTR) model with gamma rate variation was used based on the best model tested out of 28 models."

In [None]:
from transformers import pipeline

# Build the summarization pipeline for the fine-tuned Pegasus checkpoint.
summarizer = pipeline("summarization", model="usakha/Pegasus_MedPaper_model")

# `text` can be longer than the number of positions the model supports, so the
# tokenizer must truncate it; otherwise the forward pass fails on out-of-range
# position indices. The requested summary budget (5000) is likewise capped at
# the model's position limit so generation never runs past it.
max_positions = summarizer.model.config.max_position_embeddings
summarizer(text, truncation=True, max_length=min(5000, max_positions))

In [None]:
from transformers import pipeline

# Initialize the summarization pipeline
summarizer = pipeline(
    "summarization",
    model="usakha/Pegasus_MedPaper_model",
)

# Maximum input length: at most 1024 tokens, but never more than the number of
# positions the model itself supports. Deriving the limit from the config means
# the truncated input always fits, so no separate length check is required.
max_input_length = min(1024, summarizer.model.config.max_position_embeddings)

# Tokenize the input text, truncating it to the limit computed above, and move
# the resulting tensors to whichever device the pipeline placed the model on.
tokens = summarizer.tokenizer(
    text,
    truncation=True,  # Truncate the input text if it exceeds max_length
    max_length=max_input_length,  # Set the maximum input length
    return_tensors="pt",  # Return tensors in PyTorch format
).to(summarizer.model.device)

# Generate the summary (generation settings come from the model's own config)
summary_ids = summarizer.model.generate(**tokens)
summary_text = summarizer.tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print(summary_text)

In [None]:
from transformers import pipeline

# Initialize the summarization pipeline
summarizer = pipeline(
    "summarization",
    model="usakha/Pegasus_MedPaper_model",
)

# Number of positions the model supports; bounds both the encoder input and the
# generated sequence.
max_positions = summarizer.model.config.max_position_embeddings

# Tokenize the input text. Truncation is required because `text` can exceed the
# model's position limit; the tensors are then moved to the model's device.
input_tokens = summarizer.tokenizer(
    text,
    truncation=True,
    max_length=max_positions,
    return_tensors="pt",
).to(summarizer.model.device)

# Generate the summary with diverse (group) beam search.
# num_beams is set explicitly because it must be divisible by num_beam_groups,
# and diversity_penalty only takes effect when there is more than one group.
summary_ids = summarizer.model.generate(
    input_tokens["input_ids"],
    attention_mask=input_tokens["attention_mask"],
    max_length=min(1024, max_positions),  # Never generate past the position limit
    min_length=50,
    num_beams=4,  # Set the number of beams for beam search
    num_beam_groups=2,  # Set the number of beam groups for diverse beam search
    diversity_penalty=0.5,  # Set the diversity penalty for diverse beam search
    early_stopping=True,
)

# Decode the generated summary
summary_text = summarizer.tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print(summary_text)