In [1]:
!pip install transformers peft datasets accelerate bitsandbytes torch pdfplumber langchain pypdf
!pip install torch transformers accelerate peft datasets bitsandbytes



In [None]:
from huggingface_hub import login

login()



In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_name = "google/gemma-2-2b-it"

# Load tokenizer & model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16, # Rank of low-rank matrix
    lora_alpha=32, # Scaling factor
    lora_dropout=0.05, # Dropout rate
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
model.print_trainable_parameters()


trainable params: 3,194,880 || all params: 2,617,536,768 || trainable%: 0.1221


In [5]:
from datasets import load_dataset

dataset = load_dataset("ccdv/arxiv-summarization")
dataset = dataset["train"].shuffle().select(range(5000))  # Use a subset for fine-tuning


In [6]:
import pdfplumber

def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = "\n".join([page.extract_text() for page in pdf.pages if page.extract_text()])
    return text

# Example usage
pdf_text = extract_text_from_pdf("/content/1-s2.0-S2352484723010041-main (1).pdf")
print(pdf_text[:3000])  # Print first 1000 characters


Available online at www.sciencedirect.com
ScienceDirect
Energy Reports 9 (2023) 247–257
www.elsevier.com/locate/egyr
TMREES23-Fr, EURACA 06–08 February 2023, Metz-Grand Est, France
CNN-based, contextualized, real-time fire detection in computational
resource-constrained environments
Eleni
Tsaleraa,∗
, Andreas Papadakisb, Ioannis Voyiatzisa, Maria Samarakoua
aDepartmentofInformaticsandComputerEngineering,UniversityofWestAttica,AgiouSpyridonos,Egaleo,12243,Greece
bDepartmentofElectricalandElectronicsEngineeringEducators,SchoolofPedagogicalandTechnologicalEducation,Athens,14122,Greece
Received19May2023;accepted29May2023
Availableonline9June2023
Abstract
The increasing occurrence of wildfires, amplified by the changing climate conditions and drought, poses threats to human
lives, the environment and the geographically dispersed infrastructures. Such impact necessitates the prompt identification of
wildfires so that appropriate countermeasures are taken. The availability of electronic equip

In [7]:
data = [
    {
        "instruction": "Summarize the research paper.",
        "input": pdf_text,
        "response": "This study explores lightweight CNNs such as SqueezeNet, ShuffleNet, and ResNet50 for wildfire identification and contextualization, achieving 96% classification accuracy."
    }
]


In [8]:
import json

with open("train_data.json", "w") as f:
    json.dump(data, f)


In [9]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="train_data.json")
print(dataset["train"][0])  # Check a sample


Generating train split: 0 examples [00:00, ? examples/s]

{'instruction': 'Summarize the research paper.', 'input': 'Available online at www.sciencedirect.com\nScienceDirect\nEnergy Reports 9 (2023) 247–257\nwww.elsevier.com/locate/egyr\nTMREES23-Fr, EURACA 06–08 February 2023, Metz-Grand Est, France\nCNN-based, contextualized, real-time fire detection in computational\nresource-constrained environments\nEleni\nTsaleraa,∗\n, Andreas Papadakisb, Ioannis Voyiatzisa, Maria Samarakoua\naDepartmentofInformaticsandComputerEngineering,UniversityofWestAttica,AgiouSpyridonos,Egaleo,12243,Greece\nbDepartmentofElectricalandElectronicsEngineeringEducators,SchoolofPedagogicalandTechnologicalEducation,Athens,14122,Greece\nReceived19May2023;accepted29May2023\nAvailableonline9June2023\nAbstract\nThe increasing occurrence of wildfires, amplified by the changing climate conditions and drought, poses threats to human\nlives, the environment and the geographically dispersed infrastructures. Such impact necessitates the prompt identification of\nwildfires so that

In [10]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load tokenizer
model_name = "google/gemma-2b-it"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load model in bfloat16 precision
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

# Check model size
print(f"Model loaded: {model_name}")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded: google/gemma-2b-it


In [11]:
from peft import LoraConfig, get_peft_model

# LoRA Configuration
lora_config = LoraConfig(
    r=16,   # Rank (controls the number of trainable parameters)
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.1,  # Dropout rate for regularization
    bias="none",
    task_type="CAUSAL_LM"  # Causal Language Modeling (for text generation)
)

# Apply LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 1,843,200 || all params: 2,508,015,616 || trainable%: 0.0735


In [12]:
from transformers import TrainingArguments, Trainer
import torch
from datasets import load_dataset

# ✅ Load custom dataset
dataset = load_dataset("json", data_files="train_data.json")

# ✅ Tokenization function
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["input"],
        truncation=True,
        padding="max_length",
        max_length=512
    )
    tokenized["labels"] = tokenized["input_ids"].copy()  # Set labels for training
    return tokenized

# ✅ Apply tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [13]:
training_args = TrainingArguments(
    output_dir="./gemma_finetuned",  # Save model here
    per_device_train_batch_size=4,   # Adjust batch size based on GPU memory
    gradient_accumulation_steps=8,   # Effective batch size = batch_size * accumulation_steps
    num_train_epochs=3,              # Number of epochs
    learning_rate=5e-5,               # Learning rate
    save_total_limit=2,               # Keep only the last 2 checkpoints
    save_strategy="epoch",            # Save model at each epoch
    evaluation_strategy="no",         # No evaluation dataset
    report_to="none",                 # Disable logging (optional)
    push_to_hub=False                 # Disable Hugging Face hub (optional)
)



In [14]:
# ✅ Custom Trainer to compute loss
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):  # ✅ Added **kwargs
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Compute CrossEntropy loss
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

# ✅ Initialize Trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],  # Use tokenized dataset
    tokenizer=tokenizer
)

# ✅ Start training
trainer.train()

  trainer = CustomTrainer(


Step,Training Loss


TrainOutput(global_step=3, training_loss=13.9375, metrics={'train_runtime': 12.2132, 'train_samples_per_second': 0.246, 'train_steps_per_second': 0.246, 'total_flos': 18282033709056.0, 'train_loss': 13.9375, 'epoch': 3.0})

In [15]:
model.save_pretrained("./gemma_finetuned")
tokenizer.save_pretrained("./gemma_finetuned")


('./gemma_finetuned/tokenizer_config.json',
 './gemma_finetuned/special_tokens_map.json',
 './gemma_finetuned/tokenizer.model',
 './gemma_finetuned/added_tokens.json',
 './gemma_finetuned/tokenizer.json')

In [16]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Define model path
model_path = "./gemma_finetuned"

# Load the fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto")  # Auto GPU/CPU allocation


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [17]:
import fitz  # PyMuPDF

# Path to the research paper
pdf_path = "/content/Final manuscript _30 Jan.pdf"

# Read and extract text from the PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text("text") + "\n\n"  # Extract text from each page
    return text

# Extract text
research_text = extract_text_from_pdf(pdf_path)

# Print first 500 characters to verify extraction
print(research_text[:500])


Early Detection of Forest Fire Using Fine-tuned 
MobileNetV2: A Lightweight Deep Learning Approach 
Sanjeev Rao1 [0000-0001-7338-1930], Prathamjyot Singh2 [0009-0000-2164-310X], Moksh Sharma3 
[0009-0005-1574-076X] and Yugan Dhar4 [0009-0002-9051-1965] 
1,2,3,4 Computer Science and Engineering Department 
1,2,3,4 Thapar Institute of Engineering and Technology, Patiala, Punjab, India 
1 sanjeev.rao@thapar.edu, 2 psingh1_be22@thapar.edu,  
3 msharma2_be22@thapar.edu and 4 ydhar_be22@thapar.edu 
Ab


In [18]:
# Truncate text if it's too long
max_chars = 10000  # Adjust based on model's token limit
research_text = research_text[:max_chars]

research_text

"Early Detection of Forest Fire Using Fine-tuned \nMobileNetV2: A Lightweight Deep Learning Approach \nSanjeev Rao1 [0000-0001-7338-1930], Prathamjyot Singh2 [0009-0000-2164-310X], Moksh Sharma3 \n[0009-0005-1574-076X] and Yugan Dhar4 [0009-0002-9051-1965] \n1,2,3,4 Computer Science and Engineering Department \n1,2,3,4 Thapar Institute of Engineering and Technology, Patiala, Punjab, India \n1 sanjeev.rao@thapar.edu, 2 psingh1_be22@thapar.edu,  \n3 msharma2_be22@thapar.edu and 4 ydhar_be22@thapar.edu \nAbstract. Forest fire detection is essential for an efficient and rapid response to \nenvironmental protection. Existing methods for wildfire detection often rely on \ntraditional image processing techniques or shallow learning models, which strug-\ngle with real-time performance and accuracy, particularly in remote forest envi-\nronments with limited processing resources. This study addresses the perfor-\nmance gap in forest fire detection methodologies by comparing multiple convo-\nluti

In [21]:
# Tokenize text
inputs = tokenizer(research_text, return_tensors="pt", truncation=True, max_length=512).to("cuda")

# Generate summary
output = model.generate(**inputs, max_length=1000, do_sample=True, temperature=0.7)

# Decode and print the summary
summary = tokenizer.decode(output[0], skip_special_tokens=True)
print("\n🔹 Summary:\n", summary)



🔹 Summary:
 Early Detection of Forest Fire Using Fine-tuned 
MobileNetV2: A Lightweight Deep Learning Approach 
Sanjeev Rao1 [0000-0001-7338-1930], Prathamjyot Singh2 [0009-0000-2164-310X], Moksh Sharma3 
[0009-0005-1574-076X] and Yugan Dhar4 [0009-0002-9051-1965] 
1,2,3,4 Computer Science and Engineering Department 
1,2,3,4 Thapar Institute of Engineering and Technology, Patiala, Punjab, India 
1 sanjeev.rao@thapar.edu, 2 psingh1_be22@thapar.edu,  
3 msharma2_be22@thapar.edu and 4 ydhar_be22@thapar.edu 
Abstract. Forest fire detection is essential for an efficient and rapid response to 
environmental protection. Existing methods for wildfire detection often rely on 
traditional image processing techniques or shallow learning models, which strug-
gle with real-time performance and accuracy, particularly in remote forest envi-
ronments with limited processing resources. This study addresses the perfor-
mance gap in forest fire detection methodologies by comparing multiple convo-
lution

In [22]:
import shutil
shutil.make_archive("gemma_finetuned", 'zip', "./gemma_finetuned")


'/content/gemma_finetuned.zip'

In [23]:
from google.colab import files
files.download("gemma_finetuned.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>