In [1]:
#IMPORTING LIBRARIES

import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#LOADING DATA

# Single place to repoint the notebook at another copy of the data: both CSV paths
# below are derived from this directory, so only this line needs editing.
DATA_DIR = "C:/Users/shrav/OneDrive/Desktop/ML projects/Disaster classification LLM/Data"

df_train = pd.read_csv(f"{DATA_DIR}/train.csv")
df_test = pd.read_csv(f"{DATA_DIR}/test.csv")
# NOTE(review): the preview of df_test shows an "Unnamed: 0" column, which suggests the
# CSV was written with its index; passing index_col=0 may be appropriate — confirm.
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [3]:
#SPLITTING DATA
# Hold out 20% of the labelled tweets. The held-out part (X_test / y_test) is what the
# later cells tokenize and wrap as the validation set.
# A fixed random_state makes the split identical on every run, so losses and metrics
# from different runs of the notebook are comparable.
SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    df_train['text'],
    df_train['target'],
    test_size=0.2,
    random_state=SEED,
)



In [4]:
#TOKENIZING DATA

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples, max_length=128):
    """Tokenize text with the notebook-level ``tokenizer``.

    Args:
        examples: Text input handed straight to the tokenizer. The callers in
            this notebook pass a list of strings.
        max_length: Length passed to the tokenizer as ``max_length``; combined
            with ``padding='max_length'`` and ``truncation=True``. Defaults to
            128, the value previously hard-coded here.

    Returns:
        Whatever the tokenizer call returns. Later cells iterate it with
        ``.items()`` and index each value by example position.
    """
    return tokenizer(
        examples,
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )


In [5]:
# Converting X (text) to encodings: the train split, the held-out validation split,
# and the unlabeled test file are tokenized in that order.
train_encodings, val_encodings, test_encodings = (
    tokenize_function(text_column.tolist())
    for text_column in (X_train, X_test, df_test['text'])
)



In [6]:
# Converting y (labels) to tensors, one per labelled split (train, then validation).
train_labels, val_labels = (
    torch.tensor(label_column.values)
    for label_column in (y_train, y_test)
)


In [32]:
# CUSTOMIZE DATASET
class TweetDataset(Dataset):
    """Map-style dataset over tokenizer encodings, with optional labels.

    Args:
        encodings: Mapping from field name to a per-example sequence (it must
            support ``.items()`` and ``.values()``, and each value must be
            indexable by example position).
        labels: Per-example labels indexable by position, or ``None`` for
            unlabeled data such as a test set. Defaults to ``None``.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, plus 'labels' when labels exist."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Unlabeled datasets simply omit the 'labels' key.
        if self.labels is not None:
            item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        """Return the number of examples."""
        if self.labels is not None:
            return len(self.labels)
        # Without labels, take the size from the first encoding field.
        first_field = next(iter(self.encodings.values()), None)
        return 0 if first_field is None else len(first_field)

In [33]:
# Pair each split's encodings with its label tensor and wrap both as datasets
# (train first, then validation).
train_dataset, val_dataset = (
    TweetDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
    )
)

In [34]:
# Both loaders use batches of 16; only the training loader shuffles.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=16,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=16,
    shuffle=False,
)

In [35]:
# Load the pretrained BERT encoder with a two-class sequence-classification head.
# The load message reports classifier.weight / classifier.bias as newly initialized,
# so the head must be trained before the model is used for prediction.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
# Use CUDA when PyTorch reports it as available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model before building the optimizer so the optimizer is created over the
# parameters as they live on `device`.
model.to(device)
# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# weight_decay=0.0 is set explicitly because torch.optim.AdamW defaults to 0.01,
# whereas the transformers version defaulted to 0.0.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)


In [None]:
# Training loop: fine-tune `model` on `train_loader`, logging the batch loss periodically
# and the average loss and training accuracy at the end of every epoch.
NUM_EPOCHS = 3   # passes over the training set
LOG_EVERY = 10   # print the batch loss every this many batches

model.train()
for epoch in range(NUM_EPOCHS):
    total_loss = 0.0
    all_predictions = []
    all_labels = []

    for batch_idx, batch in enumerate(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss

        # Predicted class = index of the largest logit. torch.argmax is used because
        # numpy is never imported in this notebook, so np.argmax raised NameError.
        predictions = torch.argmax(outputs.logits.detach(), dim=1)
        all_predictions.extend(predictions.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

        # Optionally, print the loss every certain number of batches
        if (batch_idx + 1) % LOG_EVERY == 0:
            print(f'Epoch: {epoch+1}, Batch: {batch_idx+1}, Loss: {batch_loss:.4f}')

    # Print average loss per epoch
    average_loss = total_loss / len(train_loader)
    print(f'End of Epoch {epoch+1}, Average Loss: {average_loss:.4f}')

    # Accuracy over the predictions gathered during this epoch. These come from the
    # training-mode forward passes made before each optimizer step, so this is a
    # running training accuracy, not a validation metric.
    train_accuracy = accuracy_score(all_labels, all_predictions)
    print(f'End of Epoch {epoch+1}, Train Accuracy: {train_accuracy:.4f}')