In [None]:
#  Install necessary libraries
!pip install transformers torch scikit-learn pandas
!pip install transformers[torch] accelerate
import inspect
import re

import pandas as pd
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

In [None]:
# Read the training and test CSV files into DataFrames
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)

In [None]:
#Preprocessing: Cleaning the text data
def preprocess_text(text):
    """Clean a raw tweet string for tokenization.

    Steps, in order: drop URLs, drop @mentions, drop the '#' symbol (the
    hashtag word itself is kept), turn every whitespace run (including
    newlines and tabs) into a single space, strip all characters other than
    ASCII letters, digits and spaces, collapse/trim the remaining spaces,
    and lowercase.

    Args:
        text: The raw text. ``None`` or a float NaN (how pandas represents a
            missing cell) is treated as empty; any other non-string value is
            converted with ``str()``.

    Returns:
        The cleaned, lowercased string (possibly empty).
    """
    # Missing values would otherwise make re.sub raise TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    text = re.sub(r"http\S+|www\S+", '', text)  # URLs ("http\S+" already covers https)
    text = re.sub(r"@\w+", '', text)  # Mentions
    text = re.sub(r"#", '', text)  # Hashtag symbol only; the tag text stays
    # Convert newlines/tabs to spaces BEFORE filtering characters; otherwise
    # the filter below deletes them and glues neighbouring words together.
    text = re.sub(r"\s+", ' ', text)
    text = re.sub(r"[^A-Za-z0-9 ]+", '', text)  # Special characters
    # Removing tokens above can leave double or leading/trailing spaces.
    text = ' '.join(text.split())
    return text.lower()  # Lowercasing for consistency
# Clean the 'text' column of both the training and the test frame
for frame in (train_df, test_df):
    frame['text'] = frame['text'].apply(preprocess_text)

In [None]:
# Hold out 20% of the labelled tweets for validation (fixed seed for repeatability)
tweet_texts = train_df['text']
tweet_targets = train_df['target']
train_texts, val_texts, train_labels, val_labels = train_test_split(
    tweet_texts,
    tweet_targets,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Tokenize all three text collections with the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Identical settings for every split: truncate to at most 128 tokens and pad
tokenize_options = dict(truncation=True, padding=True, max_length=128)

train_encodings = tokenizer(list(train_texts), **tokenize_options)
val_encodings = tokenizer(list(val_texts), **tokenize_options)
test_encodings = tokenizer(list(test_df['text']), **tokenize_options)

In [None]:
#Creating a Dataset Class for PyTorch
class DisasterTweetsDataset(torch.utils.data.Dataset):
    """Dataset exposing tokenizer encodings, optionally paired with labels.

    Args:
        encodings: Mapping from feature name (e.g. 'input_ids') to a sequence
            with one entry per example; must provide ``.items()``.
        labels: Optional iterable with one label per example. Leave as
            ``None`` for unlabeled data (e.g. a test set).
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        # Copy into a plain list so that __getitem__ indexes by POSITION.
        # A pandas Series coming out of train_test_split keeps its shuffled
        # original index, so `series[idx]` would look up by label and either
        # raise KeyError or return the label of a different row.
        self.labels = list(labels) if labels is not None else None

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx`` (plus 'labels' if labelled)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples: label count if labelled, else number of 'input_ids' rows."""
        return len(self.labels) if self.labels is not None else len(self.encodings['input_ids'])

In [None]:
# Wrap the train and validation encodings together with their labels
train_dataset, val_dataset = (
    DisasterTweetsDataset(split_encodings, list(split_labels))
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
    )
)

In [None]:
# Load the pre-trained 'bert-base-uncased' checkpoint configured for two output labels
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

In [None]:
# Setting up Training Arguments
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# The notebook installs transformers unpinned, so pick whichever keyword the
# installed TrainingArguments actually accepts instead of hard-coding one.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_kwarg = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',  # Directory to store results
    num_train_epochs=3,      # Number of epochs
    per_device_train_batch_size=8,  # Batch size for training
    per_device_eval_batch_size=16,  # Batch size for evaluation
    warmup_steps=500,        # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,       # Strength of weight decay
    logging_dir='./logs',    # Directory to store logs
    logging_steps=10,        # Log every 10 steps
    **{eval_strategy_kwarg: "epoch"},  # Evaluate every epoch
)

In [None]:
# Defining Metrics
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the predicted class is the argmax over the
            last axis).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
        Precision, recall and F1 use ``average='binary'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    prec, rec, f_score, _support = precision_recall_fscore_support(
        y_true, y_pred, average='binary'
    )
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f_score,
        'precision': prec,
        'recall': rec,
    }

In [None]:
# Assemble the Trainer from the model, arguments, datasets and metric function
trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': train_dataset,
    'eval_dataset': val_dataset,
    'compute_metrics': compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

In [None]:
# Fine-tune the model with the Trainer built earlier in the notebook.
# As the last expression of the cell, the value returned by train() is displayed.
trainer.train()

In [None]:
# Evaluate the trained model. No dataset is passed here, so the Trainer is
# expected to fall back to the eval_dataset it was constructed with.
eval_results = trainer.evaluate()
# Print the returned metrics as-is.
print(f"Evaluation results: {eval_results}")

In [None]:
# Run inference on the test tweets (the dataset is built without labels)
test_dataset = DisasterTweetsDataset(test_encodings, labels=None)
predictions = trainer.predict(test_dataset)

In [None]:
# Convert per-class scores into a class index per tweet and attach it to the test frame
test_scores = predictions.predictions
preds = test_scores.argmax(axis=-1)
test_df['prediction'] = preds

In [None]:
# Write the tweet ids and their predicted class to disk (row index omitted)
output_columns = ['id', 'prediction']
test_df[output_columns].to_csv('test_predictions.csv', index=False)

print("Predictions saved to 'test_predictions.csv'")