In [31]:
import os

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertForSequenceClassification, BertTokenizer, AdamW

In [32]:
# Classification branch: BERT encoder with a freshly initialised 2-way
# classification head, plus the tokenizer from the same checkpoint.
model_cls = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)
tokenizer_cls = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
from transformers import BertTokenizer, BertModel
# Similarity branch: the plain BERT encoder (no task head) and its tokenizer,
# used further down to turn text into embeddings.
model_sim = BertModel.from_pretrained('bert-base-uncased')
tokenizer_sim = BertTokenizer.from_pretrained('bert-base-uncased')


In [35]:
# Location of the labelled metadata CSV. Set the LABELED_METADATA_CSV
# environment variable to point at another copy of the file; when it is not
# set, the original local path is used so existing runs behave as before.
LABELED_METADATA_CSV = os.environ.get(
    'LABELED_METADATA_CSV',
    'C:\\Users\\Steffi Grace\\24592774_LLM_ILabResearch\\Notebooks\\Datasets\\Labeled_Metadata.csv',
)
Labeled_Metadata = pd.read_csv(LABELED_METADATA_CSV)

In [36]:
# Text normalisation helper
def preprocess_text(text):
    """Return ``text`` lower-cased, or an empty string for non-string values.

    Anything that is not a ``str`` (e.g. ``None``, NaN, numbers) is mapped
    to ``""`` so callers always get a string back.
    """
    return text.lower() if isinstance(text, str) else ""

# Run every free-text column through preprocess_text, overwriting the column
# so all later cells work with lower-cased strings (non-strings become "").
text_columns = ['query', 'title', 'description', 'summary']
for column in text_columns:
    cleaned_column = Labeled_Metadata[column].apply(preprocess_text)
    Labeled_Metadata[column] = cleaned_column

In [37]:
# Replace the index with a fresh 0..n-1 range, discarding the old one.
Labeled_Metadata = Labeled_Metadata.reset_index(drop=True)


In [38]:
# Classifier input is "<query> <title>"; the target is the `label` column.
# Bracket access is required for 'query' because DataFrame.query is a method.
query_text = Labeled_Metadata['query']
title_text = Labeled_Metadata['title']
X_cls = query_text + ' ' + title_text
y_cls = Labeled_Metadata['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
(
    X_train_cls,
    X_test_cls,
    y_train_cls,
    y_test_cls,
) = train_test_split(X_cls, y_cls, test_size=0.2, random_state=42)


In [39]:
# Tokenize both splits in one call each, with padding and truncation enabled,
# returning PyTorch tensors.
X_train_tokens_cls = tokenizer_cls(
    X_train_cls.tolist(),
    padding=True,
    truncation=True,
    return_tensors='pt',
)
X_test_tokens_cls = tokenizer_cls(
    X_test_cls.tolist(),
    padding=True,
    truncation=True,
    return_tensors='pt',
)

In [40]:
# Turn the label Series of each split into a torch tensor.
train_label_array = y_train_cls.values
test_label_array = y_test_cls.values
y_train_cls_tensor = torch.tensor(train_label_array)
y_test_cls_tensor = torch.tensor(test_label_array)

In [41]:
# Bundle (input_ids, attention_mask, label) per example for each split.
train_dataset_cls = TensorDataset(
    X_train_tokens_cls['input_ids'],
    X_train_tokens_cls['attention_mask'],
    y_train_cls_tensor,
)
test_dataset_cls = TensorDataset(
    X_test_tokens_cls['input_ids'],
    X_test_tokens_cls['attention_mask'],
    y_test_cls_tensor,
)

In [42]:

# Show both dataset objects, one per line.
print(train_dataset_cls, test_dataset_cls, sep='\n')

<torch.utils.data.dataset.TensorDataset object at 0x00000263E1659930>
<torch.utils.data.dataset.TensorDataset object at 0x0000026429AF4E50>


In [43]:
# Mini-batch iterators: training batches are reshuffled every epoch, test
# batches keep the dataset order.
batch_size = 8
train_loader_cls = DataLoader(
    dataset=train_dataset_cls,
    batch_size=batch_size,
    shuffle=True,
)
test_loader_cls = DataLoader(
    dataset=test_dataset_cls,
    batch_size=batch_size,
    shuffle=False,
)

In [44]:
# Define optimizer and loss function
learning_rate = 2e-5
# Use PyTorch's AdamW: the AdamW class shipped with `transformers` is deprecated
# and absent from recent releases. eps and weight_decay are pinned to the values
# the transformers implementation used by default, so the update rule does not
# silently change (torch's own defaults are eps=1e-8, weight_decay=1e-2).
optimizer_cls = torch.optim.AdamW(
    model_cls.parameters(),
    lr=learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)
# NOTE(review): the training loop below reads the loss from the model output
# (labels are passed to the model), so this criterion is not used there. It is
# kept because other cells may reference it.
criterion_cls = torch.nn.CrossEntropyLoss()



In [45]:
# Fine-tune the classifier for a fixed number of passes over the training set.
num_epochs=3
for epoch in range(num_epochs):
    model_cls.train()  # switch the model to training mode for this epoch
    total_loss = 0
    print(f"Epoch {epoch+1}/{num_epochs}")
    for step, batch in enumerate(train_loader_cls):
        input_ids, attention_mask, labels = batch
        optimizer_cls.zero_grad()
        # Labels are passed in so the model output carries the loss directly.
        outputs_cls = model_cls(input_ids, attention_mask=attention_mask, labels=labels)
        loss_cls = outputs_cls.loss
        loss_cls.backward()
        optimizer_cls.step()

        total_loss += loss_cls.item()

        # Progress line every 10 steps (step 0 is skipped).
        if step % 10 == 0 and step > 0:
            print(f"Step {step}/{len(train_loader_cls)}, Loss: {loss_cls.item()}")

    # Report the mean batch loss for the epoch. The running total was previously
    # accumulated but never shown; max(..., 1) guards against an empty loader.
    print(f"Epoch {epoch+1} average loss: {total_loss / max(len(train_loader_cls), 1):.4f}")

Epoch 1/3
Step 10/128, Loss: 0.3725121021270752
Step 20/128, Loss: 0.6625328063964844
Step 30/128, Loss: 0.04891571030020714
Step 40/128, Loss: 0.3398627042770386
Step 50/128, Loss: 0.4157848358154297
Step 60/128, Loss: 0.06156798452138901
Step 70/128, Loss: 0.06934899091720581
Step 80/128, Loss: 0.0635581985116005
Step 90/128, Loss: 0.08111036568880081
Step 100/128, Loss: 0.31306222081184387
Step 110/128, Loss: 0.08917460590600967
Step 120/128, Loss: 0.06815195083618164
Epoch 2/3
Step 10/128, Loss: 0.06544636189937592
Step 20/128, Loss: 0.06858750432729721
Step 30/128, Loss: 0.05166354775428772
Step 40/128, Loss: 0.35384711623191833
Step 50/128, Loss: 0.5837855935096741
Step 60/128, Loss: 0.05342096462845802
Step 70/128, Loss: 0.08128581941127777
Step 80/128, Loss: 0.03457849100232124
Step 90/128, Loss: 0.16889844834804535
Step 100/128, Loss: 0.025073276832699776
Step 110/128, Loss: 0.8743771910667419
Step 120/128, Loss: 0.13529574871063232
Epoch 3/3
Step 10/128, Loss: 0.4911466240882

In [46]:
# Score the held-out split: collect the predicted class and the true label for
# every test example.
model_cls.eval()
predictions_cls, true_labels_cls = [], []
with torch.no_grad():  # inference only, no gradients needed
    for batch in test_loader_cls:
        input_ids, attention_mask, labels = batch
        outputs_cls = model_cls(input_ids, attention_mask=attention_mask)
        logits_cls = outputs_cls.logits
        # Predicted class = index of the largest logit in each row.
        preds_cls = logits_cls.argmax(dim=1).tolist()
        predictions_cls += preds_cls
        true_labels_cls += labels.tolist()

In [47]:
# Fraction of test examples whose predicted class matches the true label.
accuracy_cls = accuracy_score(
    y_true=true_labels_cls,
    y_pred=predictions_cls,
)
print("Classification Accuracy:", accuracy_cls)

Classification Accuracy: 0.984313725490196


In [52]:
# Build one embedding per metadata row from its "<query> <title>" text.
# (Only embeddings are computed here; no similarity score is calculated in this cell.)
metadata_embeddings = []
for index, row in Labeled_Metadata.iterrows():
    text = ' '.join((row['query'], row['title']))
    tokenized_text = tokenizer_sim(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
    )
    with torch.no_grad():
        output_sim = model_sim(**tokenized_text)
    # Average last_hidden_state over dim=1, then drop the size-1 dimensions.
    # Presumably dim=1 is the token axis — confirm against the BertModel docs.
    pooled_state = torch.mean(output_sim.last_hidden_state, dim=1)
    embeddings = pooled_state.squeeze().numpy()
    metadata_embeddings.append(embeddings)

In [53]:
user_input = input("Enter your query: ")

# Put the classifier in eval mode here instead of relying on an earlier cell
# having done so; otherwise running this cell straight after training would
# predict with the model still in training mode.
model_cls.eval()

# Tokenize user input for classification
# NOTE(review): the classifier was trained on "<query> <title>" text, whereas
# only the raw query is classified here — confirm this mismatch is intended.
user_input_tokens_cls = tokenizer_cls(user_input, padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    output_cls = model_cls(**user_input_tokens_cls)
# Take the argmax along the class dimension rather than over the flattened
# logits, so the result is always a class index.
predicted_class = torch.argmax(output_cls.logits, dim=-1).item()

# Display predicted class
print(user_input)
print("Predicted Class:", predicted_class)

Electric Vehicles
Predicted Class: 0
