<a href="https://colab.research.google.com/github/RicardoPoleo/DeepLearning_FactChecker/blob/main/notebooks/Agents/ModelAgent3_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#@title Install dependencies
!pip install ipywidgets
from ipywidgets import Dropdown, Output
from IPython.display import display

In [None]:
#@title Utils
def get_models():
    """Return the Unsloth checkpoint names used by the model comparison.

    A fresh list is built on every call, so callers may mutate the result.
    More models are listed at https://huggingface.co/unsloth
    """
    # Candidates that are currently disabled; move a name into the tuple
    # below to include it again:
    #   "unsloth/mistral-7b-v0.3-bnb-4bit"
    #   "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"
    #   "unsloth/llama-3-8b-bnb-4bit"
    #   "unsloth/llama-3-8b-Instruct-bnb-4bit"
    #   "unsloth/llama-3-70b-bnb-4bit"
    enabled_checkpoints = (
        "unsloth/Phi-3-mini-4k-instruct",
        "unsloth/Phi-3-medium-4k-instruct",
        "unsloth/mistral-7b-bnb-4bit",
        "unsloth/gemma-7b-bnb-4bit",
    )
    return list(enabled_checkpoints)

models = get_models()

In [None]:
selected_model = models[0]
selected_model_responses = []

# Create a dropdown
dropdown = Dropdown(
    options=models,  # Add your options here
    value=models[0],  # Set the default value
    description='Choose:',  # Add description
    disabled=False,  # Set it to True to disable
)

# Create output widget to display results
output = Output()

def on_dropdown_change(change):
    """Store the dropdown's new value as the notebook-wide model choice.

    Args:
        change: Change notification passed by the widget's ``observe``
            callback; only ``change['new']`` is read.
    """
    # Without this declaration the assignment below would only create a
    # function-local name and the module-level `selected_model` would keep
    # its initial value.
    global selected_model
    selected_model = change['new']
    with output:
        output.clear_output()
        print(f"Selected model: {selected_model}")

# Link function to handle changes in the dropdown
dropdown.observe(on_dropdown_change, names='value')

# Display the widgets
display(dropdown, output)

Dropdown(description='Choose:', options=('unsloth/Phi-3-mini-4k-instruct', 'unsloth/Phi-3-medium-4k-instruct',…

Output()

In [None]:
selected_model

'unsloth/Phi-3-mini-4k-instruct'

In [None]:
import json

def do_inference(model_name, instruction, claim, explanation,
                 output_dir='/content/drive/MyDrive/DeepLearning'):
    """Run one prompt through `model_name` and save the result as JSON.

    Args:
        model_name: Checkpoint name handed to ``OurFineTuner.pick_model``.
        instruction: Instruction text placed in the prompt template.
        claim: Claim text to evaluate.
        explanation: Explanation text accompanying the claim.
        output_dir: Directory that receives
            ``<model_name with '/' replaced by '_'>_inference_results.json``.
            It is created when missing.

    Returns:
        dict: ``{"model": ..., "inference": {"instruction", "claim",
        "explanation", "response"}}`` — the same structure written to disk.
    """
    import gc
    import os

    import torch

    print(f"=== Inference with the model: {model_name}")
    # Added public csv file for easy access
    dataset_path = "https://github.com/RicardoPoleo/DeepLearning_FactChecker/raw/main/datasets/3rd-attempt-input-instruction-claim-veredict-output-veredict.csv"
    finetuner = OurFineTuner(dataset_filepath=dataset_path, dataset_type="csv")  # Assuming OurFineTuner class is already defined/imported
    try:
        finetuner.pick_model(model_name)
        finetuner.set_instructions_format()  # Using default
        response = finetuner.perform_inference(instruction, claim, explanation)
    finally:
        # Drop the only reference to the loaded model and release cached GPU
        # memory, so that calling this function for several models in a row
        # does not keep every previous model alive.
        del finetuner
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    print(response)

    inference_result = {
        "model": model_name,
        "inference": {
            "instruction": instruction,
            "claim": claim,
            "explanation": explanation,
            "response": response
        }
    }

    # Generate a filename from the model name
    model_filename = model_name.replace("/", "_")  # Replace slashes to avoid path issues
    os.makedirs(output_dir, exist_ok=True)
    filepath = os.path.join(output_dir, f'{model_filename}_inference_results.json')

    # Save each model's inference result to its own JSON file
    with open(filepath, 'w') as json_file:
        json.dump(inference_result, json_file, indent=4)

    return inference_result

In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers trl peft accelerate bitsandbytes

In [None]:
# Manual fixes
from unsloth import FastLanguageModel
import torch

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [None]:
def login_huggingface():
    """Log in to the Hugging Face Hub with the token kept in Colab user data.

    The token is read from the Colab secret named 'hg_token' and passed to
    ``huggingface_hub.login``.
    """
    from huggingface_hub import login
    from google.colab import userdata

    login(token=userdata.get('hg_token'))

import torch
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import FastLanguageModel, is_bfloat16_supported

class OurFineTuner:
    """Helpers for loading an Unsloth model, preparing data, training and inference.

    Call order for fine-tuning: ``pick_model`` -> ``load_dataset`` ->
    ``set_instructions_format`` -> ``format_data`` -> ``train``.
    ``perform_inference`` only needs ``pick_model`` and
    ``set_instructions_format`` to have been called.
    """

    # Dataset columns consumed by the prompt template, in placeholder order.
    PROMPT_COLUMNS = ("instruction", "input", "output")

    def __init__(self, dataset_filepath, dataset_type="csv"):
        """Remember where the dataset lives; nothing is loaded here.

        Args:
            dataset_filepath: Location handed to ``datasets.load_dataset``
                (as ``data_files`` for "csv", as the dataset path for
                "HuggingFace").
            dataset_type: Either "csv" or "HuggingFace".
        """
        self.training_stats = None
        self.trainer = None
        self.max_seq_length = 2048
        self.instructions_format = ""
        self.dataset_filepath = dataset_filepath
        self.dataset_type = dataset_type
        self.model = None
        self.tokenizer = None
        self.dataset = None
        self.train_dataset = None
        self.validation_dataset = None

    def pick_model(self, model_name):
        """Load `model_name` with 4-bit quantization and attach LoRA adapters.

        Sets ``self.model`` and ``self.tokenizer``.
        """
        dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
        load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_name,
            max_seq_length=self.max_seq_length,
            dtype=dtype,
            load_in_4bit=load_in_4bit,
        )
        self.add_qlora()

    def add_qlora(self):
        """Replace ``self.model`` with its PEFT-wrapped version (LoRA, r=16)."""
        self.model = FastLanguageModel.get_peft_model(
            self.model,
            r=16,
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
            lora_alpha=16,
            lora_dropout=0,
            bias="none",
            use_gradient_checkpointing="unsloth",
            random_state=3407,
            use_rslora=False,
            loftq_config=None,
        )

    def load_dataset(self):
        """Load the train split of the configured dataset into ``self.dataset``.

        Raises:
            ValueError: If ``self.dataset_type`` is neither "csv" nor "HuggingFace".
        """
        if self.dataset_type == "csv":
            self.dataset = load_dataset("csv", data_files=self.dataset_filepath, split="train")
        elif self.dataset_type == "HuggingFace":
            # Request the train split explicitly (as the csv branch does):
            # without `split`, a DatasetDict is returned, which has no
            # train_test_split() for format_data() to call.
            self.dataset = load_dataset(self.dataset_filepath, split="train")
        else:
            raise ValueError(f"Unsupported dataset type: {self.dataset_type}")

    def format_data(self, test_size=0.2):
        """Split the dataset and add the rendered prompt as a "text" column.

        Loads the dataset first when ``load_dataset`` has not been called yet.

        Args:
            test_size: Forwarded to ``train_test_split``; the "test" part
                becomes ``self.validation_dataset``.

        Raises:
            KeyError: If the dataset lacks any of ``PROMPT_COLUMNS``.
        """
        if self.dataset is None:
            self.load_dataset()

        # Fail early with the full list of missing columns instead of a bare
        # KeyError raised from inside the batched map() call.
        available = list(self.dataset.column_names)
        missing = [name for name in self.PROMPT_COLUMNS if name not in available]
        if missing:
            raise KeyError(
                f"Dataset is missing required column(s) {missing}; available columns: {available}"
            )

        split_dataset = self.dataset.train_test_split(test_size=test_size)
        self.train_dataset = split_dataset['train']
        self.validation_dataset = split_dataset['test']
        self.train_dataset = self.train_dataset.map(self.formatting_prompts_func, batched=True)
        self.validation_dataset = self.validation_dataset.map(self.formatting_prompts_func, batched=True)

    def formatting_prompts_func(self, examples):
        """Render one prompt per row of a batch.

        Args:
            examples: Mapping of column name -> list of values, containing at
                least 'instruction', 'input' and 'output'.

        Returns:
            dict: ``{"text": [...]}`` with one rendered prompt per row, each
            followed by the tokenizer's EOS token.
        """
        eos_token = self.tokenizer.eos_token
        # zip() yields plain tuples, so unpack them positionally; indexing the
        # tuple with column names (as before) raises TypeError.
        rows = zip(examples['instruction'], examples['input'], examples['output'])
        texts = [
            self.instructions_format.format(instruction, input_text, output) + eos_token
            for instruction, input_text, output in rows
        ]
        return {"text": texts}

    def prepare_trainer(self, max_steps=60):
        """Create ``self.trainer`` (an SFTTrainer) over ``self.train_dataset``.

        Note: ``self.validation_dataset`` is not passed to the trainer.
        """
        self.trainer = SFTTrainer(
            model=self.model,
            tokenizer=self.tokenizer,
            train_dataset=self.train_dataset,
            dataset_text_field="text",
            max_seq_length=self.max_seq_length,
            dataset_num_proc=2,
            packing=False,
            args=TrainingArguments(
                per_device_train_batch_size=2,
                gradient_accumulation_steps=4,
                warmup_steps=5,
                max_steps=max_steps,
                learning_rate=2e-4,
                fp16=not is_bfloat16_supported(),
                bf16=is_bfloat16_supported(),
                logging_steps=1,
                optim="adamw_8bit",
                weight_decay=0.01,
                lr_scheduler_type="linear",
                seed=3407,
                output_dir="outputs",
            ),
        )

    def train(self, max_steps=60):
        """Run training and keep the trainer's result in ``self.training_stats``.

        Builds the trainer with ``prepare_trainer(max_steps)`` when none exists yet.

        Returns:
            Whatever ``self.trainer.train()`` returns.
        """
        if self.trainer is None:
            self.prepare_trainer(max_steps=max_steps)
        self.training_stats = self.trainer.train()
        return self.training_stats

    def perform_inference(self, instruction, claim, explanation):
        """Generate up to 64 new tokens for one claim/explanation pair.

        Returns:
            list[str]: Decoded sequences from ``batch_decode``; each one
            contains the prompt followed by the generated continuation.
        """
        input_text = f"Claim: {claim}. Explanation: {explanation}."
        inputs = self.tokenizer(
            # The response slot is left empty so the model completes it.
            self.instructions_format.format(instruction, input_text, ""),
            return_tensors="pt"
        ).to("cuda")
        outputs = self.model.generate(**inputs, max_new_tokens=64, use_cache=True)
        response = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
        return response

    def set_instructions_format(self, instructions_format=""):
        """Set the prompt template; an empty string selects the default template.

        The template is filled positionally with instruction, input and response.
        """
        if instructions_format == "":
            self.instructions_format = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""
        else:
            self.instructions_format = instructions_format


In [None]:
#@title Run the whole list of models to compare the output of them all
def run_inference_for_model_comparison():
    """Call `do_inference` for every entry of the global `models` list.

    All models receive the same instruction, claim and explanation.
    """
    # Prompt pieces in the positional order do_inference expects after the
    # model name: instruction, claim, explanation.
    shared_prompt = (
        "You are a fact-checker AI. Evaluate the following claim with its explanation and, based on the provided information, determine whether or not the claim is true or not, followed by the explanation of why.",
        "Vitamin C prevents the common cold.",
        "Multiple studies have shown that Vitamin C boosts the immune system but does not prevent the common cold directly.",
    )

    for model_name in models:
        do_inference(model_name, *shared_prompt)

models = get_models()
run_inference_for_model_comparison()

=== Inference with the model: unsloth/Phi-3-mini-4k-instruct
==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


["Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are a fact-checker AI. Evaluate the following claim with its explanation and, based on the provided information, determine whether or not the claim is true or not, followed by the explanation of why.\n\n### Input:\nClaim: Vitamin C prevents the common cold.. Explanation: Multiple studies have shown that Vitamin C boosts the immune system but does not prevent the common cold directly..\n\n### Response:\nThe claim is partially true. While Vitamin C does boost the immune system, it does not directly prevent the common cold. The explanation provided states that multiple studies have shown Vitamin C's effectiveness in boosting the immune system, which is true. However, the claim's assertion that Vitamin"]
=== Inference with the model: unsloth/Phi-3-medium-4k-instruct


config.json:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors.index.json:   0%|          | 0.00/165k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.72G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.14k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Unsloth 2024.5 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


['Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are a fact-checker AI. Evaluate the following claim with its explanation and, based on the provided information, determine whether or not the claim is true or not, followed by the explanation of why.\n\n### Input:\nClaim: Vitamin C prevents the common cold.. Explanation: Multiple studies have shown that Vitamin C boosts the immune system but does not prevent the common cold directly..\n\n### Response:\nThe claim that Vitamin C prevents the common cold is not entirely true. While it is correct that Vitamin C boosts the immune system, as supported by multiple studies, it does not directly prevent the common cold. However, it may help reduce the severity and duration of cold symptoms.\n\n##']
=== Inference with the model: unsloth/mistral-7b-bnb-4bit


config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/4.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/971 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are a fact-checker AI. Evaluate the following claim with its explanation and, based on the provided information, determine whether or not the claim is true or not, followed by the explanation of why.\n\n### Input:\nClaim: Vitamin C prevents the common cold.. Explanation: Multiple studies have shown that Vitamin C boosts the immune system but does not prevent the common cold directly..\n\n### Response:\nThe claim that Vitamin C prevents the common cold is not true. While Vitamin C does boost the immune system, it does not directly prevent the common cold. The explanation provided in the input is correct, as multiple studies have shown that Vitamin C boosts the immune system but does not prevent the common cold directly']
=== Inference with the model: unsloth/gemma-7b-bnb-4bit


config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Gemma patching release 2024.5
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/5.57G [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/40.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

Unsloth 2024.5 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


['Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are a fact-checker AI. Evaluate the following claim with its explanation and, based on the provided information, determine whether or not the claim is true or not, followed by the explanation of why.\n\n### Input:\nClaim: Vitamin C prevents the common cold.. Explanation: Multiple studies have shown that Vitamin C boosts the immune system but does not prevent the common cold directly..\n\n### Response:\nThe claim is false.']


In [None]:
# One-off run for a checkpoint that is commented out in get_models(),
# using the same instruction, claim and explanation as the comparison run.
_model = "unsloth/llama-3-8b-Instruct-bnb-4bit"
_instruction = "You are a fact-checker AI. Evaluate the following claim with its explanation and, based on the provided information, determine whether or not the claim is true or not, followed by the explanation of why."
_claim = "Vitamin C prevents the common cold."
_explanation = "Multiple studies have shown that Vitamin C boosts the immune system but does not prevent the common cold directly."

# `response` holds the dict returned by do_inference (model name plus the
# prompt pieces and the generated text); the same dict is also saved as JSON.
response = do_inference(_model, _instruction, _claim, _explanation)


=== Inference with the model: unsloth/llama-3-8b-Instruct-bnb-4bit


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.5
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.1k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are a fact-checker AI. Evaluate the following claim with its explanation and, based on the provided information, determine whether or not the claim is true or not, followed by the explanation of why.\n\n### Input:\nClaim: Vitamin C prevents the common cold.. Explanation: Multiple studies have shown that Vitamin C boosts the immune system but does not prevent the common cold directly..\n\n### Response:\nBased on the provided information, the claim "Vitamin C prevents the common cold" is **FALSE**.\n\nThe explanation provided suggests that while Vitamin C may have an impact on the immune system, it does not directly prevent the common cold. This implies that the claim is overstating the benefits of Vitamin C in relation to']


In [None]:
# Authenticate with the Hugging Face Hub before loading the model.
login_huggingface()
avialable_models = get_models()
print(f"avialable_models: {avialable_models}")


# CSV stored on the mounted Google Drive (the drive.mount cell must have run).
dataset_path = "/content/drive/MyDrive/DeepLearning/FineTuning/3rd-attempt-input-instruction-claim-veredict-output-veredict.csv"
ourFineTuner = OurFineTuner(dataset_path, dataset_type="csv")
# NOTE(review): the model is chosen by position, so which checkpoint this
# loads depends on the order of the list returned by get_models().
ourFineTuner.pick_model(avialable_models[2])

ourFineTuner.load_dataset()
ourFineTuner.set_instructions_format()
# format_data() maps formatting_prompts_func over the split datasets; that
# function reads the 'instruction', 'input' and 'output' columns, so the CSV
# must provide them.
ourFineTuner.format_data()

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful
avialable_models: ['unsloth/mistral-7b-v0.3-bnb-4bit', 'unsloth/mistral-7b-instruct-v0.3-bnb-4bit', 'unsloth/llama-3-8b-bnb-4bit', 'unsloth/llama-3-8b-Instruct-bnb-4bit', 'unsloth/llama-3-70b-bnb-4bit', 'unsloth/Phi-3-mini-4k-instruct', 'unsloth/Phi-3-medium-4k-instruct', 'unsloth/mistral-7b-bnb-4bit', 'unsloth/gemma-7b-bnb-4bit']


config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.5
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Generating train split: 0 examples [00:00, ? examples/s]



Map:   0%|          | 0/1113 [00:00<?, ? examples/s]

KeyError: 'instruction'

In [None]:
trainer_results = ourFineTuner.train()

In [None]:
# Do not delete this cell

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 2,309 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,1.7343
2,1.8989
3,2.2572
4,1.8639
5,1.9007
6,1.4844
7,1.3969
8,1.1544
9,1.1142
10,1.017


OutOfMemoryError: CUDA out of memory. Tried to allocate 3.32 GiB. GPU 

In [None]:
#@title This load of the Dataset from HuggingFace
# If the dataset is being loaded from a CSV file or another source instead, do not run this cell.
def load_and_prepare_dataset():
    """Group the train split of 'dwadden/healthver_entailment' by claim_id.

    Returns:
        dict: claim_id -> list of entries in dataset order. Each entry holds
        the example's 'claim', 'abstract_id', 'title', 'abstract', 'verdict'
        and 'evidence' values.
    """
    from datasets import load_dataset

    kept_fields = ('claim', 'abstract_id', 'title', 'abstract', 'verdict', 'evidence')
    grouped = {}

    for row in load_dataset('dwadden/healthver_entailment')['train']:
        # setdefault creates the list the first time a claim_id shows up
        grouped.setdefault(row['claim_id'], []).append(
            {field: row[field] for field in kept_fields}
        )

    return grouped

claims_by_id = load_and_prepare_dataset()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [None]:
# Inspect one claim. claims_by_id maps a claim_id to a list of entries;
# only the first entry ([0]) is shown throughout this cell.
claim_id = 1311
claim = claims_by_id[claim_id]
print(f"Claim {claim_id}: {claim[0]['claim']}")
# Counts the items of the entry's 'abstract' field, not the number of entries.
print(f"Total abstracts associated to this claim: {len(claim[0]['abstract'])}")

# NOTE(review): the heading says "Evidences" but the loop prints the items of
# the 'abstract' field.
print("Print Evidences: ")
for i, abstract in enumerate(claim[0]['abstract']):
    print(f"- Abstract {i}: {abstract}")

# NOTE(review): the heading says "Explanations" but the loop prints the items
# of the 'evidence' field.
print("Print Explanations")
for i, evidence in enumerate(claim[0]['evidence']):
    print(f"- Evidence {i}: {evidence}")

print(f"- Title: {claim[0]['title']}")

# This adds an 'instruction' key to the entry stored inside claims_by_id
# (the dict is modified in place, not copied).
claim[0]['instruction'] = "You are a fact-checker AI. Evaluate the following claim with its explanation and, based on the provided information, determine whether or not the claim is true or not, followed by the explanation of why."
print(f"Instruction: {claim[0]['instruction']}")

# Preview of how the entry would map onto instruction / input / output.
print("Input: ")
print(f"- Claim: {claim[0]['claim']}")
print(f"- Evidence: {claim[0]['evidence']}")

print(f"Output: ")
print(f"- Label: {claim[0]['verdict']}")
# The 'evidence' field is printed again here under the "Explanations" label.
print(f"- Explanations: {claim[0]['evidence']}")



Claim 1311: the coronavirus hasn't been bioengineered
Total abstracts associated to this claim: 8
Print Evidences: 
- Abstract 0: Origin of the COVID-19 virus has been intensely debated in the scientific community since the first infected cases were detected in December 2019.
- Abstract 1: The disease has caused a global pandemic, leading to deaths of thousands of people across the world and thus finding origin of this novel coronavirus is important in responding and controlling the pandemic.
- Abstract 2: Recent research results suggest that bats or pangolins might be the original hosts for the virus based on comparative studies using its genomic sequences.
- Abstract 3: This paper investigates the COVID-19 virus origin by using artificial intelligence (AI) and raw genomic sequences of the virus.
- Abstract 4: More than 300 genome sequences of COVID-19 infected cases collected from different countries are explored and analysed using unsupervised clustering methods.
- Abstract 5: The r

In [None]:
ourFineTuner.format_data()

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

# Assuming you have the HealthVer dataset properly coded in 'healthver_dataset.py'
# and the HealthVerEntailment class is correctly defined

dataset = load_dataset('dwadden/healthver_entailment')

# To access and print examples from the training set
for example in dataset['train']:
    print(example)


In [None]:
def get_formatted_dataset(example):
    """Convert fact-checking entries into instruction / input / output columns.

    Args:
        example: Either a single entry (a mapping) or an iterable of entries.
            Every entry must provide the keys 'claim', 'evidence', 'verdict'
            and 'explanation'. Pass a sequence of rows, not a dict of splits.

    Returns:
        dict: ``{"instruction": [...], "input": [...], "output": [...]}`` with
        one element per entry, in input order.

    Raises:
        KeyError: If an entry lacks one of the required keys.
    """
    import json
    from collections.abc import Mapping

    instruction = "You are a fact-checker AI. Evaluate the following claim with its explanation and, based on the provided information, determine whether or not the claim is true or not, followed by the explanation of why."

    # A lone mapping is treated as a one-entry dataset.
    entries = [example] if isinstance(example, Mapping) else example

    formatted_dataset = {
        "instruction": [],
        "input": [],
        "output": []
    }

    for entry in entries:
        input_text = f"Claim: {entry['claim']}. Evidence: {json.dumps(entry['evidence'])}."
        output_text = f"Verdict: {entry['verdict']}. Explanation: {entry['explanation']}."

        formatted_dataset["instruction"].append(instruction)
        formatted_dataset["input"].append(input_text)
        formatted_dataset["output"].append(output_text)

    return formatted_dataset

In [None]:
from datasets import load_dataset

# Fetch the HealthVer entailment dataset
dataset = load_dataset('dwadden/healthver_entailment')

# claim_id -> list of entries, one per training example carrying that id
claims_by_id = {}

# Walk the training split once, in order
for example in dataset['train']:
    claim_id = example['claim_id']
    # Copy a fixed subset of the example's fields
    entry = dict(
        claim=example['claim'],
        abstract_id=example['abstract_id'],
        title=example['title'],
        abstract=example['abstract'],
        verdict=example['verdict'],
        evidence=example['evidence'],
    )

    # setdefault creates the list the first time a claim_id is seen,
    # then the entry is appended either way
    claims_by_id.setdefault(claim_id, []).append(entry)

# claims_by_id now groups every training entry by its claim_id.
# Show all entries collected for one claim_id:
print(claims_by_id[1311])  # Replace 1311 with any claim_id you want to inspect

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


[{'claim': "the coronavirus hasn't been bioengineered", 'abstract_id': 47, 'title': 'Origin of Novel Coronavirus (COVID-19): A Computational Biology Study using Artificial Intelligence', 'abstract': ['Origin of the COVID-19 virus has been intensely debated in the scientific community since the first infected cases were detected in December 2019.', 'The disease has caused a global pandemic, leading to deaths of thousands of people across the world and thus finding origin of this novel coronavirus is important in responding and controlling the pandemic.', 'Recent research results suggest that bats or pangolins might be the original hosts for the virus based on comparative studies using its genomic sequences.', 'This paper investigates the COVID-19 virus origin by using artificial intelligence (AI) and raw genomic sequences of the virus.', 'More than 300 genome sequences of COVID-19 infected cases collected from different countries are explored and analysed using unsupervised clustering m

In [None]:
# Number of distinct claim_ids collected in claims_by_id.
print(f"Total claims: {len(claims_by_id)}")
# NOTE(review): the entry shown is the one stored under claim_id 0, although
# the label printed below says "Claim 1".
claim = claims_by_id[0]
print(f"Claim 1: {claim[0]['claim']}")
# Counts the items of the first entry's 'abstract' field.
print(f"Total abstracts associated to this claim: {len(claim[0]['abstract'])}")

for i, abstract in enumerate(claim[0]['abstract']):
    print(f"- Abstract {i}: {abstract}")

# Remaining fields of the first entry for this claim_id.
print(f"Title: {claim[0]['title']}")
print(f"Verdict: {claim[0]['verdict']}")
print(f"Evidence: {claim[0]['evidence']}")

Total claims: 1392
Claim 1:  A face covering can prevent people who are asymptomatic carriers of Covid-19 from spreading the virus
Total abstracts associated to this claim: 10
- Abstract 0: The COVID‐19 pandemic caused by the novel coronavirus SARS‐CoV‐2 has claimed many lives worldwide.
- Abstract 1: Wearing medical masks or N95 masks (namely N95 respirators) can slow the virus spread and reduce the infection risk.
- Abstract 2: Reuse of these masks can minimize waste, protect the environment, and help to solve the current imminent shortage of masks.
- Abstract 3: Disinfection of used masks is needed for reuse of them with safety, but improper decontamination can damage the blocking structure of masks.
- Abstract 4: In this study, we demonstrated, using avian coronavirus of infectious bronchitis virus to mimic SARS‐CoV‐2, that medical masks and N95 masks remained their blocking efficacy after being steamed on boiling water even for 2 hours.
- Abstract 5: We also demonstrated that thre

In [None]:
from datasets import load_dataset
import json

# Load the JSON file
dataset_path = "/content/drive/MyDrive/DeepLearning/FineTuning/PubHealth_llama_fine_tuning_data.json"
with open(dataset_path, 'r') as json_file:
    dataset = json.load(json_file)

# Convert the JSON data to a suitable format for fine-tuning
formatted_dataset = {
    "instruction": [],
    "input": [],
    "output": []
}

for entry in dataset:
    instruction = "You are a fact-checker AI. Evaluate the following claim with its explanation and, based on the provided information, determine whether or not the claim is true or not, followed by the explanation of why."
    input_text = f"Claim: {entry['claim']}. Evidence: {json.dumps(entry['evidence'])}."
    output_text = f"Verdict: {entry['verdict']}. Explanation: {entry['explanation']}."

    formatted_dataset["instruction"].append(instruction)
    formatted_dataset["input"].append(input_text)
    formatted_dataset["output"].append(output_text)

# Now use the formatted dataset for fine-tuning as before
