# Climate wavers WaverX-NLP
WaverX-NLP is climate wavers natural language processing model created by finetuning Bert with the huggingface transformer library.
Let start by cloning waverX-NLP microservice repository

In [None]:
!git clone https://github.com/climatewavers/waverX-NLP.git


## Navigate to the model directory

In [2]:
%cd waverX-NLP

/opt/app-root/src/waverX-NLP


## Install Requirements


- torch
- transformers
- tqdm
- scikit-learn
- numpy
- pandas
- keras

In [None]:
!pip install -r requirements.txt

## Data Generation and Preparation

Data used in building was generated manually, from chatGPT and the tweets data  provide by Kaggle during the Real or Not? NLP with Disaster Tweets competition.Run below command to take a look at Kaggle data we made use of

In [None]:
!cat dataset/tweets.csv

### Building Model Dataset
We clean, proccess our data and build our dataset

In [8]:
import pandas as pd
"""
Filter Kaggle data
"""
def filter_data(input_file, output_file):
    # Read CSV file into a DataFrame
    df = pd.read_csv(input_file)

    # Drop rows where target column is 0
    # target = 0 represent tweet not related to disaster
    cleaned_df = df[df['target'] != 0]
    # Write cleaned DataFrame back to a CSV file
    cleaned_df.to_csv(output_file, index=False)

    print(f"Cleaned data saved to {output_file}")

# Filter data and store in same file
filter_data("dataset/tweets.csv", "dataset/tweets.csv")


Cleaned data saved to dataset/tweets.csv


We process our Kaggle data to concatenate them to our data generated from chatGPT

In [10]:
import pandas as pd
"""
Anaylse data, filter categories and append right data to dataset
"""

tweet = pd.read_csv("dataset/tweets.csv")

# Model labels
labels = ["Earthquake", "Drought",
          "Damaged Infrastructure", "Human Damage", "Human", "Land Slide", "Non Damage Buildings and  Street", "Non Damage Wildlife Forest",
          "Sea", "Urban Fire", "Wild Fire", "Water Disaster"]


for index, row in tweet.iterrows():
    keyword = row["keyword"]
    keyword = keyword.capitalize()
    if keyword == "Aftershock":
        keyword = "Earthquake"
    elif keyword == "Bridge collapse":
        keyword = "Damaged Infrastructure"
    elif keyword == "Buildings burning" or keyword == "Buildings on fire":
        keyword = "Urban Fire"
    elif keyword == "Burning" or keyword == "Burned" or keyword == "Bush fires":
        keyword = "Wild Fire"
    elif keyword == "Catastrophic":
        if "fire" in row["text"]:
            keyword = "Wild Fire"
        elif "earthquake" in row["text"]:
            keyword = "Earthquake"
    elif "flood" in keyword:
        keyword = "Water Disaster"
    elif "wild" in keyword:
        keyword = "Wild Fire"
    if keyword in labels:
        text = str(row["text"]).replace(" ", "_")
        label = keyword
        dataset = pd.DataFrame([[text, label]])
        dataset.to_csv("dataset/" + "disaster_text.csv",
                   mode='a', header=False, index=False)

print("Dataset generated")


def add_data_type(input_file):
    df = pd.read_csv(input_file)
    #Make sure all data are of same type of string
    df['text'] = df['text'].astype(str)
    df['label'] = df['label'].astype(str)
    print("Ensured all data are of type str")



if __name__ == "__main__":
    add_data_type("dataset/disaster_text.csv")

Dataset generated
Ensured all data are of type str


### Split Dataset
Split our dataset into train, validate, and test sets in the following percentage - 80%, 10%, 10% respectively

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset into a pandas DataFrame
df = pd.read_csv("dataset/disaster_text.csv")

# Split the dataset into train, validate, and test sets (80%, 10%, 10%)
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
validate_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Save the datasets to CSV files
train_df.to_csv('dataset/train_disaster_dataset.csv', index=False)
validate_df.to_csv('dataset/val_disaster_dataset.csv', index=False)
test_df.to_csv('dataset/test_disaster_dataset.csv', index=False)

print("-----Dataset split ----")
print(f"Total train dataset == {len(train_df)}")
print(f"Total test dataset == {len(test_df)}")
print(f"Total validate dataset == {len(validate_df)}")

-----Dataset split ----
Total train dataset == 320
Total test dataset == 41
Total validate dataset == 40


## Model Building

### Process Dataset
Load dataset to process, tokenize and have ready to use with our dataloader

In [53]:
import pandas as pd
import torch
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertConfig
from tqdm import tqdm
import logging

## setting the threshold of logger to INFO
logging.basicConfig(filename='data_loader.log', level=logging.INFO)

## creating an object
logger = logging.getLogger()



class DisastersData:
    def __init__(self, data_path, max_sequence_length=512):
        """
        Load dataset and bert tokenizer
        """
        ## load data into memory
        self.train_df = pd.read_csv(data_path['train'])
        self.val_df = pd.read_csv(data_path['val'])
        self.test_df = pd.read_csv(data_path['test'])
        ## set max sequence length for model
        self.max_sequence_length = max_sequence_length
        ## get bert tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('prajjwal1/bert-mini', do_lower_case=True)
        self.tokenizer.save_pretrained("model/tokenizer")
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(self.train_df['label'].values)

    def train_val_test_split(self):
        """
        Separate out labels and texts
        """
        train_texts = self.train_df['text'].values
        train_labels = self.label_encoder.transform(self.train_df['label'].values)
        val_texts = self.val_df['text'].values
        val_labels =  self.label_encoder.transform(self.val_df['label'].values)
        test_texts = self.test_df['text'].values
        test_labels =  self.label_encoder.transform(self.test_df['label'].values)

        return train_texts, val_texts, test_texts, train_labels, val_labels, test_labels

    def preprocess(self, texts):
        """
        Add bert token (CLS and SEP) tokens to each sequence pre-tokenization
        """
        ## separate labels and texts before preprocessing
        # Adding CLS and SEP tokens at the beginning and end of each sequence for BERT
        texts_processed = ["[CLS] " + str(sequence) + " [SEP]" for sequence in texts]
        return texts_processed

    def tokenize(self, texts):
        """
        Use bert tokenizer to tokenize each sequence and post-process
        by padding or truncating to a fixed length
        """
        ## tokenize sequence
        tokenized_texts = [self.tokenizer.tokenize(text) for text in tqdm(texts)]

        ## convert tokens to ids
        print('convert tokens to ids')
        text_ids = [self.tokenizer.convert_tokens_to_ids(x) for x in tqdm(tokenized_texts)]

        ## pad our text tokens for each sequence
        print('pad our text tokens for each sequence')
        text_ids_post_processed = pad_sequences(text_ids,
                                       maxlen=self.max_sequence_length,
                                       dtype="long",
                                       truncating="post",
                                       padding="post")
        return text_ids_post_processed

    def create_attention_mask(self, text_ids):
        """
        Add attention mask for padding tokens
        """
        attention_masks = []
        # create a mask of 1s for each token followed by 0s for padding
        for seq in tqdm(text_ids):
            seq_mask = [float(i>0) for i in seq]
            attention_masks.append(seq_mask)
        return attention_masks

    def process_texts(self):
        """
        Apply preprocessing and tokenization pipeline of texts
        """
        ## perform the split
        train_texts, val_texts, test_texts, train_labels, val_labels, test_labels = self.train_val_test_split()

        print('preprocessing texts')
        ## preprocess train, val, test texts
        train_texts_processed = self.preprocess(train_texts)
        val_texts_processed = self.preprocess(val_texts)
        test_texts_processed = self.preprocess(test_texts)

        del train_texts
        del val_texts
        del test_texts

        ## preprocess train, val, test texts
        print('tokenizing train texts')
        train_ids = self.tokenize(train_texts_processed)
        print('tokenizing val texts')
        val_ids = self.tokenize(val_texts_processed)
        print('tokenizing test texts')
        test_ids = self.tokenize(test_texts_processed)

        del train_texts_processed
        del val_texts_processed
        del test_texts_processed

        del self.train_df
        del self.val_df
        del self.test_df

        ## create masks for train, val, test texts
        print('creating train attention masks for texts')
        train_masks = self.create_attention_mask(train_ids)
        print('creating val attention masks for texts')
        val_masks = self.create_attention_mask(val_ids)
        print('creating test attention masks for texts')
        test_masks = self.create_attention_mask(test_ids)
        return (
                train_ids,
                val_ids,
                test_ids,
                train_masks,
                val_masks,
                test_masks,
                train_labels,
                val_labels,
                test_labels
                )


    def text_to_tensors(self):
        """
        Converting all the data into torch tensors
        """
        train_ids,  val_ids, test_ids, \
        train_masks, val_masks, test_masks, \
        train_labels, val_labels, test_labels = self.process_texts()

        print('converting all variables to tensors')
        ## convert inputs, masks and labels to torch tensors
        self.train_inputs = torch.tensor(train_ids)
        self.train_labels = torch.tensor(train_labels)
        self.train_masks = torch.tensor(train_masks)

        self.validation_inputs = torch.tensor(val_ids)
        self.validation_labels = torch.tensor(val_labels)
        self.validation_masks = torch.tensor(val_masks)

        self.test_inputs = torch.tensor(test_ids)
        self.test_labels = torch.tensor(test_labels)
        self.test_masks = torch.tensor(test_masks)

if __name__ == '__main__':
    data_path = {
        'train': 'dataset/train_disaster_dataset.csv',
        'val': 'dataset/val_disaster_dataset.csv',
        'test': 'dataset/test_disaster_dataset.csv'
    }
    DisastersData(data_path).text_to_tensors()


preprocessing texts
tokenizing train texts


100%|██████████| 320/320 [00:00<00:00, 1764.20it/s]


convert tokens to ids


100%|██████████| 320/320 [00:00<00:00, 38741.98it/s]


pad our text tokens for each sequence
tokenizing val texts


100%|██████████| 40/40 [00:00<00:00, 1664.75it/s]


convert tokens to ids


100%|██████████| 40/40 [00:00<00:00, 27494.62it/s]


pad our text tokens for each sequence
tokenizing test texts


100%|██████████| 41/41 [00:00<00:00, 1689.29it/s]


convert tokens to ids


100%|██████████| 41/41 [00:00<00:00, 35471.63it/s]


pad our text tokens for each sequence
creating train attention masks for texts


100%|██████████| 320/320 [00:00<00:00, 4598.31it/s]


creating val attention masks for texts


100%|██████████| 40/40 [00:00<00:00, 4219.09it/s]


creating test attention masks for texts


100%|██████████| 41/41 [00:00<00:00, 4136.79it/s]

converting all variables to tensors





### Building Model Dataloader
Let build our model dataloader

In [25]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from process_dataset import DisastersData


class DisastersDataLoader:

    def __init__(self, data_file, batch_size=8):
        self.data = DisastersData(data_file)
        self.batch_size = batch_size
        self.create_loaders()

    def create_loaders(self):
        """
        Create Torch dataloaders for data splits
        """
        self.data.text_to_tensors()
        print('creating dataloaders')
        train_data = TensorDataset(self.data.train_inputs,
                                    self.data.train_masks,
                                    self.data.train_labels)
        train_sampler = RandomSampler(train_data)
        self.train_dataloader = DataLoader(train_data,
                                            sampler=train_sampler,
                                            batch_size=self.batch_size)

        validation_data = TensorDataset(self.data.validation_inputs,
                                        self.data.validation_masks,
                                        self.data.validation_labels)
        validation_sampler = SequentialSampler(validation_data)
        self.validation_dataloader = DataLoader(validation_data,
                                                sampler=validation_sampler,
                                                batch_size=self.batch_size)

        test_data = TensorDataset(self.data.test_inputs,
                                        self.data.test_masks,
                                        self.data.test_labels)
        test_sampler = SequentialSampler(test_data)
        self.test_dataloader = DataLoader(test_data,
                                                sampler=test_sampler,
                                                batch_size=self.batch_size)
        print('finished creating dataloaders')

if __name__=='__main__':
    data_path = {
        'train': 'dataset/train_disaster_dataset.csv',
        'val': 'dataset/val_disaster_dataset.csv',
        'test': 'dataset/test_disaster_dataset.csv'
    }
    loader = DisastersDataLoader(data_path)


preprocessing texts
tokenizing train texts


100%|██████████| 320/320 [00:00<00:00, 1612.78it/s]

convert tokens to ids



100%|██████████| 320/320 [00:00<00:00, 35148.41it/s]


pad our text tokens for each sequence
tokenizing val texts


100%|██████████| 40/40 [00:00<00:00, 1713.94it/s]


convert tokens to ids


100%|██████████| 40/40 [00:00<00:00, 36994.96it/s]


pad our text tokens for each sequence
tokenizing test texts


100%|██████████| 41/41 [00:00<00:00, 1833.14it/s]


convert tokens to ids


100%|██████████| 41/41 [00:00<00:00, 35530.26it/s]


pad our text tokens for each sequence
creating train attention masks for texts


100%|██████████| 320/320 [00:00<00:00, 4437.36it/s]


creating val attention masks for texts


100%|██████████| 40/40 [00:00<00:00, 4721.59it/s]


creating test attention masks for texts


100%|██████████| 41/41 [00:00<00:00, 4365.30it/s]

converting all variables to tensors
creating dataloaders
finished creating dataloaders





### Load Bert Model
BERT model is initialized and fine-tuned using the Hugging Face Transformers library. BERT model is loaded using the from_pretrained() method. "prajjwal1/bert-mini" specifies the pre-trained model available in the Hugging Face model hub. The num_labels parameter is set based on the number of classes in your specific classification task.



In [36]:
from transformers import BertModel, BertConfig
from transformers import BertForSequenceClassification
import torch


class BERTClassifier:
    def __init__(self, num_labels=13):
        self.configuration = BertConfig()

    def get_model(self):
        """
        Initialize pretrained bert model from huggingface model hub
        """
        # initializing a model from the bert-base-uncased style configuration
        model = BertModel(self.configuration)

        model = BertForSequenceClassification.from_pretrained(
            "prajjwal1/bert-mini", num_labels=13
        )
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)

        return model


### Bert Model Configuration
The bert model configuration is specified here

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

class BertOptimConfig:
    def __init__(self, model, train_dataloader, epochs=2):
        # Don't apply weight decay to any parameters whose names include these tokens.
        # (Here, the BERT doesn't have `gamma` or `beta` parameters, only `bias` terms)
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.weight']
        # Separate the `weight` parameters from the `bias` parameters.
        # - For the `weight` parameters, this specifies a 'weight_decay_rate' of 0.01.
        # - For the `bias` parameters, the 'weight_decay_rate' is 0.0.
        optimizer_grouped_parameters = [
            # Filter for all parameters which *don't* include 'bias', 'gamma', 'beta'.
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay_rate': 0.1},

            # Filter for parameters which *do* include those.
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay_rate': 0.0}
        ]
        # Note - `optimizer_grouped_parameters` only includes the parameter values, not
        # the names.

        # Number of training epochs (authors recommend between 2 and 4)
        self.epochs = epochs

        self.optimizer = AdamW(optimizer_grouped_parameters,
                        lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                        eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                        )
        # Total number of training steps is number of batches * number of epochs.
        # `train_dataloader` contains batched data so `len(train_dataloader)` gives
        # us the number of batches.
        total_steps = len(train_dataloader) * self.epochs

        ## create the learning rate scheduler.
        self.scheduler = get_linear_schedule_with_warmup(self.optimizer,
                                                    num_warmup_steps = 0, # Default value in run_glue.py
                                                    num_training_steps = total_steps)

### Model Training
Training model function that trains our model

In [44]:
from utils.accuracy import flat_accuracy
from tqdm import trange
import torch
from sklearn.metrics import accuracy_score


def train_model(
    model, optimizer, scheduler, train_dataloader, validation_dataloader, epochs, device
):
    t = []

    # Store our loss and accuracy for plotting
    train_loss_set = []
    
    print("Training model")
    # trange is a tqdm wrapper around the normal python range
    for _ in trange(epochs, desc="Epoch"):

        ## set our model to training mode
        model.train()

        ## tracking variables
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        # train the model for one epoch
        for step, batch in enumerate(train_dataloader):
            ## move batch to GPU
            batch = tuple(t.to(device) for t in batch)
            ## unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_labels = batch
            ## reset the gradients
            optimizer.zero_grad()
            ## forward pass
            outputs = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask,
                labels=b_labels,
            )
            loss, logits = outputs[:2]
            train_loss_set.append(loss.item())
            ## backward pass
            loss.backward()
            ## update parameters and take a step using the computed gradient
            optimizer.step()

            ## update the learning rate.
            scheduler.step()

            ## update tracking variables
            tr_loss += loss.item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1

        print("Train loss: {}".format(tr_loss / nb_tr_steps))

        # Put model in evaluation mode to evaluate loss on the validation set
        model.eval()

        # Tracking variables
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        # Evaluate data for one epoch
        for batch in validation_dataloader:
            # add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_labels = batch
            # avoiding model's computation and storage of gradients -> saving memory and speeding up validation
            with torch.no_grad():
                # forward pass, calculate logit predictions
                logits = model(
                    b_input_ids, token_type_ids=None, attention_mask=b_input_mask
                )

            # Move logits and labels to CPU
            logits = logits[0].detach().cpu().numpy()
            label_ids = b_labels.to("cpu").numpy()

            tmp_eval_accuracy = accuracy_score(label_ids, logits)

            eval_accuracy += tmp_eval_accuracy
            nb_eval_steps += 1
        model.save_pretrained("./model")
        print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps))

    return model

### Model Evaluation

We create our model evaluation function

In [None]:
import torch
from utils.accuracy import flat_accuracy


def eval_model(model, test_dataloader, device):
    ## tracking variables
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    ## evaluate data for one epoch
    for batch in test_dataloader:
        ## add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        ## unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        ## avoiding model's computation and storage of gradients -> saving memory and speeding up validation
        with torch.no_grad():
            # forward pass, calculate logit predictions
            logits = model(
                b_input_ids, token_type_ids=None, attention_mask=b_input_mask
            )

        ## move logits and labels to CPU
        logits = logits[0].detach().cpu().numpy()
        label_ids = b_labels.to("cpu").numpy()

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1
    print("Test Accuracy: {}".format(eval_accuracy / nb_eval_steps))


### Putting All Pieces Together

In [14]:
import torch
from bert import BERTClassifier
from config import BertOptimConfig
from train_model import train_model
from evaluate import eval_model
from data_loader import DisastersDataLoader


epochs = 7
num_labels = 13
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
data_path = {
    "train": "dataset/train_disaster_dataset.csv",
    "val": "dataset/val_disaster_dataset.csv",
    "test": "dataset/test_disaster_dataset.csv",
}
data_loaders = DisastersDataLoader(data_path, batch_size=8)
model = BERTClassifier(num_labels=num_labels).get_model()
optim_config = BertOptimConfig(
    model=model, train_dataloader=data_loaders.train_dataloader, epochs=epochs
)
    ## execute the training routine
model = train_model(
    model=model,
    optimizer=optim_config.optimizer,
    scheduler=optim_config.scheduler,
    train_dataloader=data_loaders.train_dataloader,
    validation_dataloader=data_loaders.validation_dataloader,
    epochs=epochs,
    device=device,
)

## test model performance on unseen test set
eval_model(model=model, test_dataloader=data_loaders.test_dataloader, device=device)

## Save model and tokenizer
model.save_pretrained("model/waverx-nlp")

preprocessing texts
tokenizing train texts


100%|██████████| 320/320 [00:00<00:00, 1720.43it/s]


convert tokens to ids


100%|██████████| 320/320 [00:00<00:00, 41667.00it/s]

pad our text tokens for each sequence





tokenizing val texts


100%|██████████| 40/40 [00:00<00:00, 1728.24it/s]


convert tokens to ids


100%|██████████| 40/40 [00:00<00:00, 35726.61it/s]


pad our text tokens for each sequence
tokenizing test texts


100%|██████████| 41/41 [00:00<00:00, 1800.45it/s]


convert tokens to ids


100%|██████████| 41/41 [00:00<00:00, 35692.50it/s]


pad our text tokens for each sequence
creating train attention masks for texts


100%|██████████| 320/320 [00:00<00:00, 4363.03it/s]


creating val attention masks for texts


100%|██████████| 40/40 [00:00<00:00, 4311.58it/s]


creating test attention masks for texts


100%|██████████| 41/41 [00:00<00:00, 4357.33it/s]


converting all variables to tensors
creating dataloaders
finished creating dataloaders


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch:   0%|          | 0/7 [00:00<?, ?it/s]

Train loss: 2.4822049736976624


Epoch:  14%|█▍        | 1/7 [00:59<05:59, 59.91s/it]

Validation Accuracy: 0.25
Train loss: 2.304527533054352


Epoch:  29%|██▊       | 2/7 [01:57<04:51, 58.26s/it]

Validation Accuracy: 0.375
Train loss: 2.1314227849245073


Epoch:  43%|████▎     | 3/7 [02:54<03:51, 57.79s/it]

Validation Accuracy: 0.85
Train loss: 2.008513242006302


Epoch:  57%|█████▋    | 4/7 [03:51<02:52, 57.64s/it]

Validation Accuracy: 0.925
Train loss: 1.9165636211633683


Epoch:  71%|███████▏  | 5/7 [04:45<01:52, 56.31s/it]

Validation Accuracy: 0.925
Train loss: 1.8544430553913116


Epoch:  86%|████████▌ | 6/7 [05:40<00:55, 55.87s/it]

Validation Accuracy: 0.95
Train loss: 1.8229126304388046


Epoch: 100%|██████████| 7/7 [06:37<00:00, 56.75s/it]

Validation Accuracy: 0.95





Test Accuracy: 0.7291666666666666


## Run Inference with Intel Pytorch Extension

In [26]:
from tensorflow import keras
import pandas as pd
import numpy as np
from process_dataset import DisastersData
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import torch
import intel_extension_for_pytorch as ipex

# Load the fine-tuned BERT model and tokenizer
model_path = "model/waverx-nlp"
tokenizer = BertTokenizer.from_pretrained("model/tokenizer")
model = BertForSequenceClassification.from_pretrained(model_path)
model = ipex.optimize(model)

# Define your tokenized labels and mapping dictionary or list
tokenized_labels =  ["Earthquake", "Drought",
          "Damaged Infrastructure", "Human Damage", "Human", "Land Slide", "Non Damage Buildings and  Street", "Non Damage Wildlife Forest",
          "Sea", "Urban Fire", "Wild Fire", "Water Disaster", "Humanitarian Aid"]

tokenized_labels.sort()

def predict(input_text):

    # Tokenize input text
    tokenized_input = tokenizer(input_text, return_tensors="pt")

    # Perform inference
    with torch.no_grad():
        outputs = model(**tokenized_input)

    # Get predicted probabilities and labels
    probabilities = torch.softmax(outputs.logits, dim=1)
    predicted_label_idx = torch.argmax(probabilities, dim=1).item()

    token_to_label_mapping = {idx: label for idx, label in enumerate(tokenized_labels)}

    # Map predicted label index to original label
    predicted_original_label = token_to_label_mapping[predicted_label_idx]

    # Convert tensor to numpy array
    probabilities = probabilities.numpy()

    # Convert probabilities to percentages
    probabilities_percentage = probabilities * 100
    
    #Set probability threshold
    threshold = 15
    
    # Find the maximum probability percentage
    max_probability = max(probabilities_percentage.tolist()[0])
    
    if max_probability < threshold:
    # Assign a specific label when the maximum probability is below the threshold
        predicted_label = "Prediction Failed"
    
    # Create a dictionary from the list of probabilities and labels
    result_dict = {key: value for key, value in zip(tokenized_labels, probabilities_percentage.tolist()[0])}

    print("Predicted Label:", predicted_original_label)
    print("Prediction Probabilities:",result_dict)
    
    return {"prediction": predicted_original_label,
            "probability": result_dict }

  

if __name__ == "__main__":
    # Input text you want to classify
    input_text = " Urgent earthquake."
    print("Predicting input")
    predict(input_text)


Predicting input
Predicted Original Label: Earthquake
Predicted Probabilities: {'Damaged Infrastructure': 7.9579620361328125, 'Drought': 11.536685943603516, 'Earthquake': 21.99901580810547, 'Human': 4.627230167388916, 'Human Damage': 6.254914283752441, 'Humanitarian Aid': 4.814874172210693, 'Land Slide': 4.5093889236450195, 'Non Damage Buildings and  Street': 6.152083873748779, 'Non Damage Wildlife Forest': 3.9017553329467773, 'Sea': 4.942859172821045, 'Urban Fire': 8.086578369140625, 'Water Disaster': 5.939942359924316, 'Wild Fire': 9.276710510253906}


## Serve Model on Flask

In [None]:
#!/usr/bin/python3
""" Flask Application """

from os import environ
from flask import Flask, jsonify, request
from flask_cors import CORS
from prediction import predict
import json


app = Flask(__name__)
app.config['JSONIFY_PRETTYPRINT_REGULAR'] = True
cors = CORS(app, resources={r"/api/v1/*": {"origins": "*"}})

@app.route("/api/v1/nlp/model/waverx", methods=['POST'], strict_slashes=False)
def model_inference():
    data = request.data or '{}'
    body = json.loads(data)
    return jsonify(predict(body))

@app.route("/api/v1/nlp/model/waverx/status", strict_slashes=False)
def model_status():
    return jsonify({"status": "OK"})

@app.errorhandler(404)
def not_found(error):
    """ 404 Error
    ---
    responses:
      404:
        description: a resource was not found
    """
    return make_response(jsonify({'error': "Not found"}), 404)



if __name__ == "__main__":
    """ Main Function """
    host = environ.get('WAVERX_HOST')
    port = environ.get('WAVERX_PORT')
    if not host:
        host = '0.0.0.0'
    if not port:
        port = '5000'
    app.run(host=host, port=port, threaded=True)
