## Natural Language Processing with Disaster Tweets
#### By: Niv Dobzinski (PhD)

In [None]:
!pip install torch transformers peft

In [None]:
import numpy as np
import pandas as pd
import torch
from peft import LoraConfig, get_peft_model
from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import DataLoader, TensorDataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
import re
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR
# from torch.nn import DataParallel 

### Data loading

In [None]:
# Locations of the competition CSV files under the Kaggle input directory.
train_file, test_file = (
    "/kaggle/input/nlp-getting-started/train.csv",
    "/kaggle/input/nlp-getting-started/test.csv",
)

# Read both splits into DataFrames.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_file, test_file))

### Preprocessing

### Fill missing values with the placeholder token `[UNK]`

In [None]:
# Separate the label column from the feature columns.
X = train_df.drop(columns=['target'])
y = train_df['target']

# Replace missing keyword/location entries with the '[UNK]' placeholder in
# both the training features and the test set.
for frame in (X, test_df):
    for column in ('keyword', 'location'):
        frame[column] = frame[column].fillna('[UNK]')

### Text cleaning

In [None]:
def clean_text(text):
    """Normalize a raw string for tokenization.

    Steps, in order: drop ``http...`` links and ``@mentions``, keep hashtag
    words without the ``#``, lowercase, remove every character that is not an
    ASCII letter or whitespace, collapse whitespace runs and trim the ends.
    The ``[UNK]`` placeholder survives the cleanup unchanged.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Case-sensitive removals/rewrites that must run before lowercasing.
    for pattern, replacement in (
        (r'http\S+', ''),
        (r'@\w+', ''),
        (r'#(\w+)', r'\1'),
    ):
        text = re.sub(pattern, replacement, text)

    cleaned = text.lower()

    # The placeholder is swapped for an all-letter, upper-case stand-in so the
    # letters-only filter cannot strip its brackets, then swapped back.
    for pattern, replacement in (
        (r'\[unk\]', 'PLACEHOLDERUNK'),
        (r'[^A-Za-z\s]', ''),
        (r'PLACEHOLDERUNK', '[UNK]'),
        (r'\s+', ' '),
    ):
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

# Run clean_text over every free-text column of both data sets.
for frame in (X, test_df):
    for column in ('text', 'keyword', 'location'):
        frame[column] = frame[column].apply(clean_text)

### Concatenate the text columns

In [None]:
# Build one tagged input string per row: keyword, location and tweet text,
# each introduced by its own bracketed marker.
for frame in (X, test_df):
    frame['combined'] = (
        '[KEYWORD] ' + frame['keyword']
        + ' [LOCATION] ' + frame['location']
        + ' [TEXT] ' + frame['text']
    )

### Split dataset and Tokenization

In [None]:
# Load the tokenizer of the same checkpoint that is fine-tuned further down
# ('roberta-large'), so tokenizer and model always come from one source.
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')


def tokenize_texts(texts, labels=None):
    """Tokenize a list of strings into padded PyTorch tensors.

    Args:
        texts: List of strings to encode.
        labels: Optional list of integer class labels aligned with ``texts``.

    Returns:
        ``(input_ids, attention_mask)`` when ``labels`` is None, otherwise
        ``(input_ids, attention_mask, labels_tensor)``.
    """
    # Pad only up to the longest sequence in this call. Padding to
    # "max_length" without an explicit max_length pads every short tweet to
    # the model's maximum length, which wastes memory and compute on padding.
    # Truncation still caps over-long inputs at the model limit.
    tokenized_data = tokenizer(texts, padding="longest", truncation=True, return_tensors='pt')
    if labels is not None:
        return tokenized_data['input_ids'], tokenized_data['attention_mask'], torch.tensor(labels)
    return tokenized_data['input_ids'], tokenized_data['attention_mask']

# Hold out 20% of the labelled rows for validation, using a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X['combined'],
    y,
    test_size=0.2,
    random_state=42,
)

# Tokenize each split; only the labelled splits yield a label tensor.
train_texts, train_targets = X_train.tolist(), y_train.tolist()
train_inputs, train_masks, train_labels = tokenize_texts(train_texts, train_targets)

val_texts, val_targets = X_val.tolist(), y_val.tolist()
val_inputs, val_masks, val_labels = tokenize_texts(val_texts, val_targets)

test_texts = test_df['combined'].tolist()
test_inputs, test_masks = tokenize_texts(test_texts)



### Convert to Tensors

In [None]:
# Wrap each split's tensors in a TensorDataset; the unlabelled test split
# carries no label tensor.
train_dataset, val_dataset, test_dataset = (
    TensorDataset(*split_tensors)
    for split_tensors in (
        (train_inputs, train_masks, train_labels),
        (val_inputs, val_masks, val_labels),
        (test_inputs, test_masks),
    )
)

### Prepare input data

In [None]:
# Training batches are shuffled; validation and test batches keep their
# original order (test predictions are later paired with test_df['id']).
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader, test_loader = (
    DataLoader(dataset=split, batch_size=32, shuffle=False)
    for split in (val_dataset, test_dataset)
)

### Load the pre-trained model

In [None]:
# Pre-trained RoBERTa-large with a two-class classification head.
model = RobertaForSequenceClassification.from_pretrained('roberta-large', num_labels=2)

# Attach LoRA adapters to the query/key/value projections of every encoder
# layer. The layer count is read from the model config rather than
# hard-coded, so the list stays correct if the checkpoint changes.
target_modules = [
    f'roberta.encoder.layer.{layer}.attention.self.{sub_module}'
    for layer in range(model.config.num_hidden_layers)
    for sub_module in ('query', 'key', 'value')
]

# Config Lora
lora_config = LoraConfig(
    # Declaring the task makes PEFT treat the model as a sequence classifier
    # and keep the newly initialised classification head trainable. Without
    # it every non-LoRA weight, including that head, is frozen.
    task_type="SEQ_CLS",
    target_modules=target_modules,
    r=4,  # rank of the low-rank approximation
    lora_alpha=128,  # scaling factor
    lora_dropout=0.1,  # dropout rate for LoRA layers
)


### Wrap model with LoRA

In [None]:
# Wrap the base model according to lora_config; the wrapped model is what
# gets trained and evaluated from here on.
lora_model = get_peft_model(model, lora_config)
# Disabled: optional multi-GPU wrapping.
# lora_model = DataParallel(lora_model)

### Enable GPU

In [None]:
# Prefer the GPU when one is available, otherwise stay on the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
lora_model.to(device)

### Optimizer and Loss parameters

In [None]:
# Cross-entropy over the class logits.
loss_fn = torch.nn.CrossEntropyLoss()

# Adam over the wrapped model's parameters; each step_scheduler.step() call
# multiplies the learning rate by 0.9.
optimizer = torch.optim.Adam(params=lora_model.parameters(), lr=2e-5)
step_scheduler = StepLR(optimizer=optimizer, step_size=1, gamma=0.9)

# Mixed-precision loss scaling and gradient-accumulation settings.
scaler = GradScaler()
accumulation_steps = 1  # Number of steps to accumulate gradients

### Model fine-tuning

In [None]:
# Training loop: mixed-precision fine-tuning with optional gradient
# accumulation, followed by a validation pass at the end of every epoch.
epochs = 6
lora_model.train()
num_train_batches = len(train_loader)

for epoch in range(epochs):
    total_loss = 0
    optimizer.zero_grad()

    for i, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")):
        inputs, attention_masks, labels = batch
        inputs, attention_masks, labels = inputs.to(device), attention_masks.to(device), labels.to(device)

        with autocast():
            outputs = lora_model(input_ids=inputs, attention_mask=attention_masks).logits
            # Scale the loss down so accumulated gradients average over the group.
            loss = loss_fn(outputs, labels) / accumulation_steps

        scaler.scale(loss).backward()

        # Step at the end of every accumulation group, and also on the final
        # batch of the epoch. Otherwise gradients from a trailing partial
        # group would be thrown away by the zero_grad() at the next epoch
        # start whenever accumulation_steps does not divide the batch count.
        is_last_batch = (i + 1) == num_train_batches
        if (i + 1) % accumulation_steps == 0 or is_last_batch:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        # Undo the accumulation scaling so the logged value is the true batch loss.
        total_loss += loss.item() * accumulation_steps

    avg_train_loss = total_loss / len(train_loader)

    # Validation step
    lora_model.eval()
    val_loss = 0
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for batch in val_loader:
            inputs, attention_masks, labels = batch
            inputs, attention_masks, labels = inputs.to(device), attention_masks.to(device), labels.to(device)
            with autocast():
                outputs = lora_model(input_ids=inputs, attention_mask=attention_masks).logits
                loss = loss_fn(outputs, labels)
                val_loss += loss.item() * inputs.size(0)  # Sum up the batch loss

                # Calculate accuracy; .item() keeps the running count a plain
                # Python int instead of turning it into a device tensor.
                preds = torch.argmax(outputs, dim=1)
                correct_predictions += (preds == labels).sum().item()
                total_predictions += labels.size(0)

    avg_val_loss = val_loss / len(val_loader.dataset)  # Average over the dataset
    accuracy = correct_predictions / total_predictions
    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.4f}")

    step_scheduler.step()  # Adjust learning rate after each epoch
    lora_model.train()  # Back to training mode for the next epoch

Epoch 1/6: 100%|██████████| 191/191 [01:38<00:00,  1.94it/s]


Epoch 1/6, Train Loss: 0.6462, Validation Loss: 0.4785, Accuracy: 0.8011


Epoch 2/6: 100%|██████████| 191/191 [01:37<00:00,  1.96it/s]


Epoch 2/6, Train Loss: 0.4467, Validation Loss: 0.4375, Accuracy: 0.8050


Epoch 3/6: 100%|██████████| 191/191 [01:37<00:00,  1.96it/s]


Epoch 3/6, Train Loss: 0.4109, Validation Loss: 0.4059, Accuracy: 0.8339


Epoch 4/6: 100%|██████████| 191/191 [01:37<00:00,  1.96it/s]


Epoch 4/6, Train Loss: 0.3947, Validation Loss: 0.3816, Accuracy: 0.8483


Epoch 5/6: 100%|██████████| 191/191 [01:37<00:00,  1.96it/s]


Epoch 5/6, Train Loss: 0.3850, Validation Loss: 0.4249, Accuracy: 0.8267


Epoch 6/6: 100%|██████████| 191/191 [01:37<00:00,  1.96it/s]


Epoch 6/6, Train Loss: 0.3799, Validation Loss: 0.3714, Accuracy: 0.8523


### Validation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, val_loader):
    """Score ``model`` on ``val_loader`` and print and return the metrics.

    Args:
        model: Model called with ``input_ids`` and ``attention_mask`` whose
            output exposes ``.logits``.
        val_loader: Iterable yielding ``(input_ids, attention_mask, labels)``
            batches.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    model.eval()
    all_labels, all_preds = [], []

    # Inference only, so no gradients are tracked.
    with torch.no_grad():
        for input_ids, mask, targets in tqdm(val_loader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            mask = mask.to(device)
            targets = targets.to(device)

            logits = model(input_ids=input_ids, attention_mask=mask).logits
            batch_preds = logits.argmax(dim=1)

            all_labels.extend(targets.cpu().numpy())
            all_preds.extend(batch_preds.cpu().numpy())

    accuracy, precision, recall, f1 = (
        metric(all_labels, all_preds)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )

    print(f"Validation Accuracy: {accuracy}")
    print(f"Validation Precision: {precision}")
    print(f"Validation Recall: {recall}")
    print(f"Validation F1 Score: {f1}")

    return accuracy, precision, recall, f1

# Evaluate the fine-tuned model on the held-out validation split; the
# metrics are printed and also returned as a tuple.
evaluate_model(lora_model, val_loader)

Evaluating: 100%|██████████| 48/48 [00:34<00:00,  1.41it/s]

Validation Accuracy: 0.8522652659225214
Validation Precision: 0.8533333333333334
Validation Recall: 0.7889060092449923
Validation F1 Score: 0.8198558847077663





(0.8522652659225214,
 0.8533333333333334,
 0.7889060092449923,
 0.8198558847077663)

### Predict test data

In [None]:
# Function to predict on the test dataset
def predict_test(model, test_loader):
    """Return the predicted class index for every row served by ``test_loader``.

    Args:
        model: Model called with ``input_ids`` and ``attention_mask`` whose
            output exposes ``.logits``.
        test_loader: Iterable yielding ``(input_ids, attention_mask)`` batches.

    Returns:
        List of predicted class indices, in the order the loader yields rows.
    """
    model.eval()
    predictions = []

    # Inference only, so no gradients are tracked.
    with torch.no_grad():
        for input_ids, mask in tqdm(test_loader, desc="Predicting"):
            logits = model(
                input_ids=input_ids.to(device),
                attention_mask=mask.to(device),
            ).logits
            predictions.extend(logits.argmax(dim=1).cpu().numpy())

    return predictions

In [None]:
# Run the fine-tuned model over the test set.
test_predictions = predict_test(lora_model, test_loader)

# Pair each test id with its predicted label and write the submission file.
submission_columns = {
    'id': test_df['id'],  # Assuming 'id' column is present in test_df
    'target': test_predictions,
}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv('submission.csv', index=False)

Predicting: 100%|██████████| 102/102 [01:12<00:00,  1.40it/s]


### Kaggle Competition Score - TOP ~7%

### - **Kaggle Competition Accuracy Score:** 0.8397

![Kaggle Score](Kaggle_compatition_score.png)