In [None]:
!pip install transformers datasets sacremoses


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 k

In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset

# Load dataset
dataset = load_dataset('text', data_files={'train': '/content/comorbidity_recommendation_dataset.txt'})

# Load BioGPT tokenizer
tokenizer = AutoTokenizer.from_pretrained('microsoft/biogpt')

# Tokenize
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, max_length=128)

tokenized_dataset = dataset['train'].map(tokenize_function, batched=True, remove_columns=['text'])


Map:   0%|          | 0/1121 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Load BioGPT model
model = AutoModelForCausalLM.from_pretrained('microsoft/biogpt')

# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments
training_args = TrainingArguments(
    output_dir='/mnt/data/biogpt_comorbidity_model',
    num_train_epochs=6,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    fp16=True,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100
)


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

trainer.train()


Step,Training Loss
100,1.9408
200,1.5147
300,1.4266
400,0.8679
500,1.1216
600,0.8163
700,0.5532
800,0.5734
900,0.4922
1000,0.3969


TrainOutput(global_step=1680, training_loss=0.6957948310034615, metrics={'train_runtime': 893.9779, 'train_samples_per_second': 7.524, 'train_steps_per_second': 1.879, 'total_flos': 301055017721856.0, 'train_loss': 0.6957948310034615, 'epoch': 5.9812667261373775})

In [None]:
model.save_pretrained('/content/biogpt_comorbidity_model_final')
tokenizer.save_pretrained('/content/biogpt_comorbidity_model_final')
print('✅ Model and tokenizer saved successfully.')


✅ Model and tokenizer saved successfully.


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Ruta donde se guardó el modelo entrenado
model_path = "/content/biogpt_comorbidity_model_final"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Crear pipeline de generación
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)


Device set to use cuda:0


In [None]:
# Ejemplo: para diabetes
condition = "diabetes"
prompt = (
    f"<|startoftext|>\n"
    f"[CONDITION]: {condition}\n"
    "[PROMPT]: Provide a COVID-19-specific health recommendation.\n"
    "[RECOMMENDATION]:"
)

# Generar recomendación
output = generator(prompt, max_length=100, do_sample=True, temperature=0.7)
recommendation = output[0]['generated_text'].split("[RECOMMENDATION]:")[-1].split("<|endoftext|>")[0].strip()

print(f"✅ Recommendation for {condition.upper()}:\n{recommendation}")


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


✅ Recommendation for DIABETES:
Hello, For diabetes, i do not recommend to self-isolate. Self-quarantine is important, but not a substitute for vaccine. Get tested if you develop symptoms. If you have a non-resolving pneumonia, then do not recommend to self-isolate. Self-quarantine


In [None]:
comorbidities = ["diabetes", "asthma", "cardiovascular", "obesity", "smoking"]

for cond in comorbidities:
    prompt = (
        f"<|startoftext|>\n"
        f"[CONDITION]: {cond}\n"
        "[PROMPT]: Provide a COVID-19-specific health recommendation.\n"
        "[RECOMMENDATION]:"
    )
    output = generator(prompt, max_length=100, do_sample=True, temperature=0.7)
    recommendation = output[0]['generated_text'].split("[RECOMMENDATION]:")[-1].split("<|endoftext|>")[0].strip()
    print(f"\n🩺 Recommendation for {cond.upper()}:\n{recommendation}")



🩺 Recommendation for DIABETES:
Hello, For your diabetes, keep A1c as low as possible, do everything you can to support gut health and immunity, increase supplements that increase resistance to viruses, and use social distancing to minimize contact with others. If you have a sore throat for a few days that gets worse every

🩺 Recommendation for ASTHMA:
Hello, According to the history it might be a viral situation or an allergy. Continue the current treatment guidelines as they were developed. Get tested if you start a new treatment or change of treatment. If you have a viral situation or an allergy you should go for a test and start

🩺 Recommendation for CARDIOVASCULAR:
Thanks for your question on Healthcare Magic. I can understand your concern. Since you are a well controlled diabetic, your symptoms are not under control. Regular exercise and adequate sleep are important for your health. But, if you get infected, your symptoms are more likely to be due to uncontrolled infections

🩺 Re

In [None]:
import re

# Cargar tu dataset estructurado (lista de ejemplos tipo <|startoftext|>...)
with open("comorbidity_recommendation_dataset.txt", "r", encoding="utf-8") as f:
    raw_data = f.read().split("<|startoftext|>")

filtered_examples = []

for entry in raw_data:
    if "[RECOMMENDATION]:" not in entry:
        continue

    match = re.search(r"\[RECOMMENDATION\]:(.*)<\|endoftext\|>", entry, re.DOTALL)
    if not match:
        continue

    rec = match.group(1).strip().lower()

    # Frases que indican lenguaje no médico o fórmulas sociales
    bad_phrases = [
        "thank you", "hello", "ask a doctor", "healthcare magic",
        "wish you good health", "thanks. thanks.", "have a nice day",
        "i can understand your concern", "i will be happy to help"
    ]

    if any(phrase in rec for phrase in bad_phrases):
        continue

    # Descartar si la recomendación es muy corta
    if len(rec.split()) < 8:
        continue

    filtered_examples.append("<|startoftext|>" + entry.strip())

# Guardar nuevo dataset limpio
with open("cleaned_comorbidity_dataset.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(filtered_examples))

print(f"✅ Cleaned dataset with {len(filtered_examples)} examples saved.")


✅ Cleaned dataset with 82 examples saved.


In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset

# Load dataset
dataset = load_dataset('text', data_files={'train': '/content/cleaned_comorbidity_dataset.txt'})

# Load BioGPT tokenizer
tokenizer = AutoTokenizer.from_pretrained('microsoft/biogpt')

# Tokenize
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, max_length=128)

tokenized_dataset = dataset['train'].map(tokenize_function, batched=True, remove_columns=['text'])


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/352 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Load BioGPT model
model = AutoModelForCausalLM.from_pretrained('microsoft/biogpt')

# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments
training_args = TrainingArguments(
    output_dir='/mnt/data/biogpt_comorbidity_model',
    num_train_epochs=6,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    fp16=True,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100
)


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

trainer.train()


Step,Training Loss
100,2.1371
200,1.0242
300,0.6496
400,0.3586
500,0.2475


TrainOutput(global_step=528, training_loss=0.8484438441016457, metrics={'train_runtime': 366.6676, 'train_samples_per_second': 5.76, 'train_steps_per_second': 1.44, 'total_flos': 114121353314304.0, 'train_loss': 0.8484438441016457, 'epoch': 6.0})

In [None]:
model.save_pretrained('/content/clean_biogpt_comorbidity_model_final')
tokenizer.save_pretrained('/content/clean_biogpt_comorbidity_model_final')
print('✅ Model and tokenizer saved successfully.')


✅ Model and tokenizer saved successfully.


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Ruta donde se guardó el modelo entrenado
model_path = "/content/clean_biogpt_comorbidity_model_final"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Crear pipeline de generación
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)


Device set to use cuda:0


In [None]:
# Ejemplo: para diabetes
condition = "diabetes"
prompt = (
    f"<|startoftext|>\n"
    f"[CONDITION]: {condition}\n"
    "[PROMPT]: Provide a COVID-19-specific health recommendation.\n"
    "[RECOMMENDATION]:"
)

# Generar recomendación
output = generator(prompt, max_length=100, do_sample=True, temperature=0.7)
recommendation = output[0]['generated_text'].split("[RECOMMENDATION]:")[-1].split("<|endoftext|>")[0].strip()

print(f"✅ Recommendation for {condition.upper()}:\n{recommendation}")


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


✅ Recommendation for DIABETES:
diabetes. High-affinity receptor for coronavirus, the receptor for the virus, is expressed in the lung, and if you are well controlled one's immunity to COVID should not be compromised. Poorly controlled diabetics, especially with complications, may be at greater risk for complications. Regular glucose control and intermittent ph


In [None]:
comorbidities = ["diabetes", "asthma", "cardiovascular", "obesity", "smoking"]

for cond in comorbidities:
    prompt = (
        f"<|startoftext|>\n"
        f"[CONDITION]: {cond}\n"
        "[PROMPT]: Provide a COVID-19-specific health recommendation.\n"
        "[RECOMMENDATION]:"
    )
    output = generator(prompt, max_length=100, do_sample=True, temperature=0.7)
    recommendation = output[0]['generated_text'].split("[RECOMMENDATION]:")[-1].split("<|endoftext|>")[0].strip()
    print(f"\n🩺 Recommendation for {cond.upper()}:\n{recommendation}")



🩺 Recommendation for DIABETES:
Diabetes. Type 1 or 2 if managed well and treated can still maintain good immunity. If either are out of control then infections of any type more likely......................, and also if you are in contro

🩺 Recommendation for ASTHMA:
In brief: Asthma is a chronic lung disease which carries some increased risk with COVID-19. Although the evidence is clear that people with chronic lung disease do worse in a respiratory disease outbreak, it is not clear if this is because they are more fragile or have difficulty breathing. High risk with underlying medical

🩺 Recommendation for CARDIOVASCULAR:
Coronavirus-specific health recommendation. The general population should start to get infections early in the course of the disease. If you have fever over 100F, call your PCP right away. Follow the following guidelines: https: / / www.healthtap.com / blog / covid-19-care-

🩺 Recommendation for OBESITY:
In brief: Not quite that way. If you are having sex with someo

In [None]:
# Ejemplo: para diabetes
comorbidities = ["diabetes", "asthma", "cardiovascular", "obesity", "smoking"]

for cond in comorbidities:
#condition = "diabetes"
  prompt = (
      "<|startoftext|>\n"
      f"[CONDITION]: {cond}\n"
      "[PROMPT]: Provide a clear and actionable health recommendation for a COVID-19 patient with this condition.\n"
      f"[RECOMMENDATION]: For patients with {cond}, it is important to"
  )

  # Generar recomendación
  output = generator(prompt, max_length=100, do_sample=True, temperature=0.7)
  recommendation = output[0]['generated_text'].split("[RECOMMENDATION]:")[-1].split("<|endoftext|>")[0].strip()

  print(f"✅ Recommendation for {cond.upper()}:\n{recommendation}")


✅ Recommendation for DIABETES:
For patients with diabetes, it is important to follow appropriate diabetic-specific health recommendations. Most of the information available on this topic comes from expert opinion. It is clear that if you are well controlled, you can still maintain good immunity. However, if your diabetes
✅ Recommendation for ASTHMA:
For patients with asthma, it is important to follow appropriate disease-management guidelines. They should get their inhalers appropriately (including an urgent inhaler). If symptoms do not improve with treatment, please call your PCP and follow his / her directions completely..
✅ Recommendation for CARDIOVASCULAR:
For patients with cardiovascular, it is important to consider your diabetes status, as well as the other comorbidities that can affect a patient's response to infection. If you are well controlled, you will be fine............
✅ Recommendation for OBESITY:
For patients with obesity, it is important to consider the effects of obes

In [None]:
for cond in comorbidities:
    prompt = (
        "<|startoftext|>\n"
        f"[CONDITION]: {cond}\n"
        "[PROMPT]: Provide a clear and actionable health recommendation for a COVID-19 patient with this condition.\n"
        f"[RECOMMENDATION]: For patients with {cond}, it is important to"
    )
    output = generator(prompt, max_length=180, do_sample=True, temperature=0.6, top_k=50, top_p=0.9)
    text = output[0]['generated_text']
    recommendation = text.split("[RECOMMENDATION]:")[-1].split("<|endoftext|>")[0].strip()
    print(f"\n✅ Recommendation for {cond.upper()}:\n{recommendation}")



✅ Recommendation for DIABETES:
For patients with diabetes, it is important to keep A1c as low as possible, do everything you can to support gut health and immunity, and increase supplements that increase resistance to viruses. Be positive......................, increase supplements that increase resistance to viruses............................................................., increase

✅ Recommendation for ASTHMA:
For patients with asthma, it is important to follow appropriate asthma treatment guidelines. High-risk patients, such as those with uncontrolled asthma, chronic diseases or compromised immune system, are at higher risk for complications. Regular inhaled steroids (if available) along with systemic anti-inflammatory drugs (if available) should be used to control the disease. Regular inhaled steroids can be used as a boost to the regular inhaled steroids. If symptoms go bad, it's ER time. If you are having sex with someone who is postive htat you know, have you travelled? do 

In [None]:
!zip -r /content/clean_biogpt_comorbidity_model_final.zip /content/clean_biogpt_comorbidity_model_final

!zip -r /content/biogpt_comorbidity_model_final.zip /content/biogpt_comorbidity_model_final

  adding: content/clean_biogpt_comorbidity_model_final/ (stored 0%)
  adding: content/clean_biogpt_comorbidity_model_final/model.safetensors (deflated 7%)
  adding: content/clean_biogpt_comorbidity_model_final/tokenizer_config.json (deflated 73%)
  adding: content/clean_biogpt_comorbidity_model_final/merges.txt (deflated 60%)
  adding: content/clean_biogpt_comorbidity_model_final/special_tokens_map.json (deflated 51%)
  adding: content/clean_biogpt_comorbidity_model_final/config.json (deflated 49%)
  adding: content/clean_biogpt_comorbidity_model_final/generation_config.json (deflated 29%)
  adding: content/clean_biogpt_comorbidity_model_final/vocab.json (deflated 70%)
  adding: content/biogpt_comorbidity_model_final/ (stored 0%)
  adding: content/biogpt_comorbidity_model_final/model.safetensors (deflated 7%)
  adding: content/biogpt_comorbidity_model_final/tokenizer_config.json (deflated 73%)
  adding: content/biogpt_comorbidity_model_final/merges.txt (deflated 60%)
  adding: content/

In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset

# Load dataset
dataset = load_dataset('text', data_files={'train': '/content/medical_explainer_dataset.txt'})

# Load BioGPT tokenizer
tokenizer = AutoTokenizer.from_pretrained('microsoft/biogpt')

# Tokenize
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, max_length=256)

tokenized_dataset = dataset['train'].map(tokenize_function, batched=True, remove_columns=['text'])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
!pip install --upgrade sympy

Collecting sympy
  Downloading sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Downloading sympy-1.13.3-py3-none-any.whl (6.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sympy
  Attempting uninstall: sympy
    Found existing installation: sympy 1.13.1
    Uninstalling sympy-1.13.1:
      Successfully uninstalled sympy-1.13.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.5.3.2 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-cupti-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-cupti-cu12 12.5.82 which is incompatible.
torch 2.6

In [None]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Load BioGPT model
model = AutoModelForCausalLM.from_pretrained('microsoft/biogpt')

# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments
training_args = TrainingArguments(
    output_dir='/mnt/data/biogpt_explainer_model',
    num_train_epochs=10,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    fp16=True,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100
)


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

trainer.train()


Step,Training Loss
100,1.2782
200,0.4984
300,0.2563
400,0.1477
500,0.1122


TrainOutput(global_step=560, training_loss=0.42067699687821525, metrics={'train_runtime': 443.384, 'train_samples_per_second': 5.052, 'train_steps_per_second': 1.263, 'total_flos': 264770391367680.0, 'train_loss': 0.42067699687821525, 'epoch': 10.0})

In [None]:
model.save_pretrained('/content/biogpt_explainer_model_final')
tokenizer.save_pretrained('/content/biogpt_explainer_model_final')
print('✅ Model and tokenizer saved successfully.')


✅ Model and tokenizer saved successfully.


In [None]:

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Cargar modelo entrenado
model_path = "/content/biogpt_explainer_model_final"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

explainer = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Lista de comorbilidades para probar
comorbidities = [
    "diabetes", "asthma", "hypertension", "cardiovascular disease",
    "obesity", "smoking", "pneumonia", "immune suppression"
]

for cond in comorbidities:
    prompt = (
        "<|startoftext|>\n"
        f"[PROMPT]: How does {cond} affect COVID-19 outcomes?\n"
        "[EXPLANATION]:"
    )
    output = explainer(prompt, max_length=200, do_sample=True, temperature=0.7)
    explanation = output[0]['generated_text'].split("[EXPLANATION]:")[-1].split("<|endoftext|>")[0].strip()

    print(f"\n🧠 Explanation for {cond.upper()}:\n{explanation}")

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



🧠 Explanation for DIABETES:
The current mortality rate of patients suffering from coronavirus disease 2019 (COVID-19) is between 45% and 92%, with most dying within the first two weeks of the illness. In an effort to combat such an alarmingly high mortality rate, various treatment therapies such as low tidal volume ventilation strategies, corticosteroid therapy, and use of nitric oxide (NO) have been attempted in the management of patients with COVID-19. Three cases which were admitted to the ICU and confirmed to have COVID-19 were unable to be weaned from ventilatory support, and nitric oxide therapy was initiated. It improved patients' oxygenation for short periods of time but did not affect the mortality. The patients could not be weaned from the ventilator and expired. The patients could not be weaned from the ventilator and expired. The patients could not be weaned from the ventilator and expired. The

🧠 Explanation for ASTHMA:
BACKGROUND: Asthma is one of the risk factors for se

In [None]:
!zip -r /content/biogpt_explainer_model_final.zip /content/biogpt_explainer_model_final

  adding: content/biogpt_explainer_model_final/ (stored 0%)
  adding: content/biogpt_explainer_model_final/model.safetensors (deflated 7%)
  adding: content/biogpt_explainer_model_final/tokenizer_config.json (deflated 73%)
  adding: content/biogpt_explainer_model_final/merges.txt (deflated 60%)
  adding: content/biogpt_explainer_model_final/special_tokens_map.json (deflated 51%)
  adding: content/biogpt_explainer_model_final/config.json (deflated 49%)
  adding: content/biogpt_explainer_model_final/generation_config.json (deflated 29%)
  adding: content/biogpt_explainer_model_final/vocab.json (deflated 70%)


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import zipfile

zipobj = zipfile.ZipFile('/content/drive/MyDrive/biogpt_explainer_model_final.zip', 'r')
zipobj.extractall("./explainer")

In [None]:
zipobj = zipfile.ZipFile('/content/drive/MyDrive/clean_biogpt_comorbidity_model_final.zip', 'r')
zipobj.extractall("./recommender")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Rutas a tus modelos fine-tuneados
explainer_model_path = "/content/explainer/content/biogpt_explainer_model_final"
recommender_model_path = "/content/recommender/content/clean_biogpt_comorbidity_model_final"

# Cargar modelos y tokenizers
explainer_tokenizer = AutoTokenizer.from_pretrained(explainer_model_path)
explainer_model = AutoModelForCausalLM.from_pretrained(explainer_model_path)
explainer_pipe = pipeline("text-generation", model=explainer_model, tokenizer=explainer_tokenizer)

recommender_tokenizer = AutoTokenizer.from_pretrained(recommender_model_path)
recommender_model = AutoModelForCausalLM.from_pretrained(recommender_model_path)
recommender_pipe = pipeline("text-generation", model=recommender_model, tokenizer=recommender_tokenizer)

# Lista de comorbilidades
comorbidities = ["diabetes", "asthma", "hypertension", "cardiovascular disease", "obesity", "smoking"]

# Generar reporte
for cond in comorbidities:
    # Prompt para explicación técnica
    explainer_prompt = (
        f"<|startoftext|>\n"
        f"[PROMPT]: How does {cond} affect COVID-19 outcomes?\n"
        "[EXPLANATION]:"
    )
    expl = explainer_pipe(explainer_prompt, max_length=300, do_sample=True, temperature=0.7)[0]['generated_text']
    explanation = expl.split("[EXPLANATION]:")[-1].split("<|endoftext|>")[0].strip()

    # Prompt para recomendación práctica
    recommender_prompt = (
        f"<|startoftext|>\n"
        f"[CONDITION]: {cond}\n"
        "[PROMPT]: Provide a clear and actionable health recommendation for a COVID-19 patient with this condition.\n"
        f"[RECOMMENDATION]: For patients with {cond}, it is important to"
    )
    rec = recommender_pipe(recommender_prompt, max_length=150, do_sample=True, temperature=0.6, truncation=True)[0]['generated_text']
    recommendation = rec.split("[RECOMMENDATION]:")[-1].split("<|endoftext|>")[0].strip()

    # Imprimir resultados
    print(f"\n============================")
    print(f"🧠 MEDICAL EXPLANATION - {cond.upper()}:\n{explanation}")
    print(f"\n💊 RECOMMENDATION - {cond.upper()}:\n{recommendation}")


Device set to use cpu
Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



🧠 MEDICAL EXPLANATION - DIABETES:
The current mortality rate of patients with COVID-19 is between 45% and 92%, with most dying within the first two weeks of the illness. Worldwide, a majority of cases occur in patients with comorbid conditions such as hypertension, diabetes, and smoking history. In addition, patients with diabetes are more likely to progress to chronic obstructive pulmonary disease (COPD) and respiratory failure requiring ventilatory support. Finally, it has been shown that severe pneumonia and ARDS may occur in patients with COVID-19 due to the presence of influenza A virus. Therefore, it has become imperative to evaluate the impact of diabetes on COVID-19 patients. Diabetics are at a higher risk of developing pneumonia and ARDS, but they may also have a better outcome following severe pneumonia and ARDS. In this review, we will summarize the current knowledge about the impact of diabetes on COVID-19 patients, with a particular focus on the severity of pneumonia and 