# <span> 🔥🔥🔥 Fire detection using Vision Transformers (ViT) </span>
<hr style="border-bottom: solid;background-color:light;color:black;">

<h2>Introduction </h2>  
<p style="text-align:justify; padding:20px;">
In this notebook we will talk about classification & Transformers. We fine tune a Tranformer model to detect fire.
</p>

* [Imports](#section-1)
* [Data preparation](#section-2)
* [Building the model](#section-3)
* [Training the model](#section-4)

<a id="section-1"></a>
# <span>1. Imports</span>
<hr style="border-bottom: solid;background-color:light;color:black;">

In [1]:
from datasets import load_dataset
from datasets import load_metric

from transformers import TrainingArguments
from transformers import ViTFeatureExtractor
from transformers import ViTForImageClassification

import torch

from PIL import Image
import requests
import numpy as np

<a id="section-2"></a>
# <span>2. Data preparation</span>
<hr style="border-bottom: solid;background-color:light;color:black;">

In [2]:
ds = load_dataset("imagefolder", data_dir = "../input/fire-dataset/fire_dataset")
ds

Resolving data files:   0%|          | 0/999 [00:00<?, ?it/s]

Downloading and preparing dataset image_folder/default to /root/.cache/huggingface/datasets/image_folder/default-205668c41abe4b96/0.0.0/ee92df8e96c6907f3c851a987be3fd03d4b93b247e727b69a8e23ac94392a091...
                

Downloading data files #2:   0%|          | 0/63 [00:00<?, ?obj/s]

Downloading data files #4:   0%|          | 0/63 [00:00<?, ?obj/s]

Downloading data files #11:   0%|          | 0/62 [00:00<?, ?obj/s]

Downloading data files #1:   0%|          | 0/63 [00:00<?, ?obj/s]

Downloading data files #0:   0%|          | 0/63 [00:00<?, ?obj/s]

Downloading data files #14:   0%|          | 0/62 [00:00<?, ?obj/s]

Downloading data files #3:   0%|          | 0/63 [00:00<?, ?obj/s]

Downloading data files #9:   0%|          | 0/62 [00:00<?, ?obj/s]

Downloading data files #12:   0%|          | 0/62 [00:00<?, ?obj/s]

Downloading data files #5:   0%|          | 0/63 [00:00<?, ?obj/s]

Downloading data files #7:   0%|          | 0/62 [00:00<?, ?obj/s]

Downloading data files #13:   0%|          | 0/62 [00:00<?, ?obj/s]

Downloading data files #8:   0%|          | 0/62 [00:00<?, ?obj/s]

Downloading data files #6:   0%|          | 0/63 [00:00<?, ?obj/s]

Downloading data files #10:   0%|          | 0/62 [00:00<?, ?obj/s]

Downloading data files #15:   0%|          | 0/62 [00:00<?, ?obj/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset image_folder downloaded and prepared to /root/.cache/huggingface/datasets/image_folder/default-205668c41abe4b96/0.0.0/ee92df8e96c6907f3c851a987be3fd03d4b93b247e727b69a8e23ac94392a091. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 999
    })
})

In [3]:
data = ds['train'].train_test_split(test_size = 0.1)
data

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 899
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 100
    })
})

In [4]:
labels = data['train'].features['label']
labels = data["train"].features["label"].names
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = i
    id2label[i] = label

In [5]:
metric = load_metric('accuracy')

Downloading builder script:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

In [6]:
 feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')

Downloading:   0%|          | 0.00/160 [00:00<?, ?B/s]

In [7]:
from torchvision.transforms import (
    CenterCrop,
    Compose,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
    Resize,
    ToTensor,
)

normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
train_transforms = Compose(
        [
            RandomResizedCrop(feature_extractor.size),
            RandomHorizontalFlip(),
            ToTensor(),
            normalize,
        ]
    )

val_transforms = Compose(
        [
            Resize(feature_extractor.size),
            CenterCrop(feature_extractor.size),
            ToTensor(),
            normalize,
        ]
    )

def preprocess_train(example_batch):
    """Apply train_transforms across a batch."""
    example_batch["pixel_values"] = [
        train_transforms(image.convert("RGB")) for image in example_batch["image"]
    ]
    return example_batch

def preprocess_val(example_batch):
    """Apply val_transforms across a batch."""
    example_batch["pixel_values"] = [val_transforms(image.convert("RGB")) for image in example_batch["image"]]
    return example_batch

In [8]:
train_ds = data['train']
val_ds = data['test']
test_ds = data['test']

In [9]:
train_ds.set_transform(preprocess_train)
val_ds.set_transform(preprocess_val)

In [10]:
train_ds[0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=448x300>,
 'label': 0,
 'pixel_values': tensor([[[-0.5059, -0.4824, -0.6235,  ..., -0.4902, -0.4118, -0.3961],
          [-0.6549, -0.5686, -0.5216,  ..., -0.3490, -0.3490, -0.4353],
          [-0.6549, -0.6471, -0.5451,  ..., -0.4039, -0.3255, -0.3647],
          ...,
          [ 0.3804,  0.3804,  0.0588,  ..., -0.2471, -0.2392, -0.2314],
          [ 0.8196,  0.6627,  0.1922,  ..., -0.3490, -0.3725, -0.3569],
          [ 0.9922,  0.7569,  0.4196,  ..., -0.4275, -0.4667, -0.4431]],
 
         [[-0.5451, -0.5216, -0.6627,  ..., -0.4118, -0.3412, -0.3255],
          [-0.7098, -0.6235, -0.5765,  ..., -0.2706, -0.2784, -0.3647],
          [-0.7098, -0.7020, -0.6000,  ..., -0.3255, -0.2549, -0.3020],
          ...,
          [-0.4039, -0.3490, -0.6471,  ..., -0.5686, -0.6078, -0.6314],
          [ 0.0510, -0.0667, -0.5216,  ..., -0.5294, -0.5765, -0.5765],
          [ 0.2392, -0.0353, -0.3490,  ..., -0.6549, -0.6941, -0.6549]]

In [11]:
model_name_or_path = 'google/vit-base-patch16-224-in21k'
model = ViTForImageClassification.from_pretrained(
    model_name_or_path, 
    num_labels=len(labels),
    id2label={str(i): c for i, c in enumerate(labels)},
    label2id={c: str(i) for i, c in enumerate(labels)}
)

Downloading:   0%|          | 0.00/502 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/330M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<a id="section-3"></a>
# <span>3. Building the model</span>
<hr style="border-bottom: solid;background-color:light;color:black;">

In [12]:
training_args = TrainingArguments(
    'finetuned-fire-detection',
  per_device_train_batch_size=16,
  evaluation_strategy="steps",
  num_train_epochs=4,
  fp16=True,
  save_steps=100,
  eval_steps=100,
  logging_steps=10,
  learning_rate=2e-4,
  save_total_limit=2,
  remove_unused_columns=False,
  report_to='tensorboard',
  load_best_model_at_end=True,
  hub_strategy="end"
)#

In [13]:
def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions"""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)

In [14]:
def collate_fn(batch):
    return {
        'pixel_values': torch.stack([x['pixel_values'] for x in batch]),
        'labels': torch.tensor([x['label'] for x in batch])
    }

In [15]:
from transformers import Trainer
trainer = Trainer(
    model,
    training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
    data_collator=collate_fn,
)

Using cuda_amp half precision backend


<a id="section-4"></a>
# <span>4. Training the model</span>
<hr style="border-bottom: solid;background-color:light;color:black;">

In [16]:
train_results = trainer.train()
# rest is optional but nice to have
torch.save(model, "model.pth")
#trainer.save_model('model.h5')
#trainer.log_metrics("train", train_results.metrics)
#trainer.save_metrics("train", train_results.metrics)
#trainer.save_state()

***** Running training *****
  Num examples = 899
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 228


Step,Training Loss,Validation Loss,Accuracy
100,0.036,0.010521,1.0
200,0.077,0.006834,1.0


***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
Saving model checkpoint to finetuned-fire-detection/checkpoint-100
Configuration saved in finetuned-fire-detection/checkpoint-100/config.json
Model weights saved in finetuned-fire-detection/checkpoint-100/pytorch_model.bin
Feature extractor saved in finetuned-fire-detection/checkpoint-100/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
Saving model checkpoint to finetuned-fire-detection/checkpoint-200
Configuration saved in finetuned-fire-detection/checkpoint-200/config.json
Model weights saved in finetuned-fire-detection/checkpoint-200/pytorch_model.bin
Feature extractor saved in finetuned-fire-detection/checkpoint-200/preprocessor_config.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from finetuned-fire-detection/checkpoint-200 (score: 0.006833572406321764).


In [17]:
torch.save(model, "model.pth")

In [18]:
model = torch.load("model.pth")

In [19]:
metrics = trainer.evaluate()
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


***** eval metrics *****
  epoch                   =        4.0
  eval_accuracy           =        1.0
  eval_loss               =     0.0068
  eval_runtime            = 0:00:04.73
  eval_samples_per_second =     21.109
  eval_steps_per_second   =      2.744


In [20]:
outputs = trainer.predict(test_ds)
print(outputs.metrics)

***** Running Prediction *****
  Num examples = 100
  Batch size = 8


{'test_loss': 0.006833572406321764, 'test_accuracy': 1.0, 'test_runtime': 4.7439, 'test_samples_per_second': 21.08, 'test_steps_per_second': 2.74}


In [21]:
torch.cuda.is_available = lambda : False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

url = '../input/fire-dataset/fire_dataset/non_fire_images/non_fire.11.png'

image = Image.open(url)
inputs = feature_extractor(images=image, return_tensors="pt")
outputs = model(**inputs)
logits = outputs.logits
# model predicts one of the 1000 ImageNet classes
predicted_class_idx = logits.argmax(-1).item()
print("Predicted class:", id2label[predicted_class_idx])

Predicted class: non_fire_images


In [22]:
ls

[0m[01;34mfinetuned-fire-detection[0m/  model.pth


<a href="model.pth"> Download File </a>

In [23]:
model_path = '/kaggle/working/model.pth'

In [28]:
model = torch.load("model.pth", map_location=torch.device('cpu'))
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#model = model.to(device)

url = '../input/fire-dataset/fire_dataset/non_fire_images/non_fire.11.png'

image = Image.open(url)
inputs = feature_extractor(images=image, return_tensors="pt")
outputs = model(**inputs)
logits = outputs.logits
# model predicts one of the 1000 ImageNet classes
predicted_class_idx = logits.argmax(-1).item()
print("Predicted class:", id2label[predicted_class_idx])

Predicted class: non_fire_images


In [33]:
logits.argmax(-1).item()

1

In [29]:
import pkg_resources
import types
def get_imports():
    for name, val in globals().items():
        if isinstance(val, types.ModuleType):
            # Split ensures you get root package, 
            # not just imported function
            name = val.__name__.split(".")[0]

        elif isinstance(val, type):
            name = val.__module__.split(".")[0]

        # Some packages are weird and have different
        # imported names vs. system names
        if name == "PIL":
            name = "Pillow"
        elif name == "sklearn":
            name = "scikit-learn"

        yield name
imports = list(set(get_imports()))

requirements = []
for m in pkg_resources.working_set:
    if m.project_name in imports and m.project_name!="pip":
        requirements.append((m.project_name, m.version))

for r in requirements:
    print("{}=={}".format(*r))

Pillow==9.1.1
transformers==4.20.1
requests==2.28.1
numpy==1.21.6
torch==1.11.0
torchvision==0.12.0
