In [1]:
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd

In [4]:
!pip install transformers
from transformers import BertTokenizer



In [5]:
# Read the cleaned traffic-report CSV that was prepared for BERT.
# NOTE(review): presumably contains a 'BERT_Input' text column and a
# transport-mode label column -- confirm against the actual file.
data_path = 'traffic-report-cleaned-for-bert.csv'
df = pd.read_csv(data_path)

# Load the pre-trained 'bert-base-uncased' tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Empty accumulators for the token IDs and attention masks of each text
input_ids, attention_masks = [], []

In [15]:
# Example DataFrame with a 'BERT_Input' text column and integer labels.
# NOTE: this rebinds `df`, so any DataFrame loaded from disk earlier in the
# notebook is replaced by this two-row toy example from here on.
df = pd.DataFrame({
    'BERT_Input': [
        "2023-03-01 to 2023-03-31. Al Baha to Makhwa Road",
        "2023-03-01 to 2023-03-31. Baljuraishi to Al Baha Road - Transfered"
    ],
    'Transport_Mode_Labels': [0, 1]  # Example labels, replace with your actual labels
})

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode every text in a single batched tokenizer call.
# truncation=True is required for max_length to actually cut long inputs;
# without it, texts longer than 64 tokens keep their full length and the
# rows can no longer be stacked into one [N, 64] tensor.
encoded_batch = tokenizer(
    df['BERT_Input'].tolist(),  # Sentences to encode.
    add_special_tokens=True,    # Add '[CLS]' and '[SEP]'
    max_length=64,              # Target length for padding / truncation.
    padding='max_length',       # Pad shorter texts up to max_length.
    truncation=True,            # Cut longer texts down to max_length.
    return_attention_mask=True, # Construct attn. masks.
    return_tensors='pt',        # Return pytorch tensors.
)

# The batched call already returns stacked tensors, one row per text.
input_ids = encoded_batch['input_ids']
attention_masks = encoded_batch['attention_mask']
labels = torch.tensor(df['Transport_Mode_Labels'].values)

print(input_ids.shape, attention_masks.shape, labels.shape)

torch.Size([2, 64]) torch.Size([2, 64]) torch.Size([2])


In [17]:
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from sklearn.model_selection import train_test_split

In [18]:
# Split inputs, attention masks and labels into training and validation sets
# in ONE call, so all three are partitioned with the same shuffled indices.
# (Two separate calls only stay aligned as long as both use the same
# random_state and test_size, which is easy to break by accident.)
(
    train_inputs, validation_inputs,
    train_masks, validation_masks,
    train_labels, validation_labels,
) = train_test_split(
    input_ids, attention_masks, labels,
    random_state=42, test_size=0.1,
)

# Training batches are drawn in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

# Validation batches are read in their stored order
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=32)


In [20]:
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from sklearn.model_selection import train_test_split
import pandas as pd

2024-03-06 11:50:18.976419: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [21]:
# Load the pre-trained BERT model for sequence classification, with one
# output per distinct value in the 'Transport_Mode_Labels' column.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=df['Transport_Mode_Labels'].nunique(),
)

# Run on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer & learning-rate schedule.
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favour of it. weight_decay=0.0 keeps the default that the transformers
# implementation used (torch's own default is 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-8, weight_decay=0.0)
epochs = 4
# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) * epochs steps, with no warmup.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Fine-tuning: one full pass over train_dataloader per epoch
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        # Move every tensor of the batch onto the selected device
        batch = tuple(tensor.to(device) for tensor in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Clear gradients left over from the previous step
        model.zero_grad()

        # Passing labels makes the model return the loss alongside the logits
        outputs = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels,
        )
        loss = outputs.loss
        total_loss += loss.item()

        # Backpropagate, clip the gradient norm to 1.0, then update the
        # weights and advance the learning-rate schedule
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    # Mean loss over the batches of this epoch
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Average training loss: {avg_train_loss}")

Average training loss: 0.7634192109107971
Average training loss: 0.5685696005821228
Average training loss: 0.5655654668807983
Average training loss: 0.3588297367095947


In [26]:
print(f"Average training loss: {avg_train_loss}")

# Switch to evaluation mode before scoring the validation set
model.eval()
nb_correct, nb_eval_examples = 0, 0

for batch in validation_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    # No gradients are needed for evaluation; labels are not passed because
    # only the logits are used here.
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    logits = outputs.logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Count correct predictions instead of averaging per-batch accuracies,
    # so a smaller final batch does not get the same weight as a full one.
    nb_correct += int((np.argmax(logits, axis=1) == label_ids).sum())
    nb_eval_examples += b_input_ids.size(0)

if nb_eval_examples == 0:
    raise ValueError("validation_dataloader produced no examples")

# Accuracy over all validation examples
eval_accuracy = nb_correct / nb_eval_examples
print(f"Validation Accuracy: {eval_accuracy}")

Average training loss: 0.3588297367095947
Validation Accuracy: 0.0


In [25]:
# Accuracy / precision / recall / F1 over the whole validation set
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Make sure dropout is disabled even if this cell runs before any other
# cell that switches the model to evaluation mode.
model.eval()

# Collect predictions and true labels from every batch first, so the metrics
# describe the full validation set rather than each batch separately.
all_pred_labels, all_label_ids = [], []

for batch in validation_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    logits = outputs.logits.detach().cpu().numpy()  # Ensure logits are numpy arrays
    all_pred_labels.append(np.argmax(logits, axis=1))
    all_label_ids.append(b_labels.to('cpu').numpy())  # Ensure label_ids are numpy arrays

# With an empty validation set there is nothing to score (and nothing printed)
if all_label_ids:
    pred_labels = np.concatenate(all_pred_labels)
    label_ids = np.concatenate(all_label_ids)

    accuracy = accuracy_score(label_ids, pred_labels)
    # zero_division=0 reports 0.0 for classes without predicted/true samples
    # instead of emitting an undefined-metric warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        label_ids, pred_labels, average='weighted', zero_division=0
    )

    print(f"Validation Accuracy: {accuracy}")
    print(f"Validation Precision: {precision}")
    print(f"Validation Recall: {recall}")
    print(f"Validation F1: {f1}")


Validation Accuracy: 0.0
Validation Precision: 0.0
Validation Recall: 0.0
Validation F1: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
# Write the model's state_dict (its parameter tensors) to disk
checkpoint_path = 'fine_tuned_bert_model.pt'
torch.save(model.state_dict(), checkpoint_path)


In [28]:
# Final status line marking the end of the notebook run
status_message = "the model is fine-tuned now"
print(status_message)

the model is fine-tuned now
