In [1]:
import os

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW, BertForSequenceClassification, BertModel, BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Tokenizer and 2-label sequence classifier, both loaded from the same
# pre-trained BERT checkpoint.
bert_checkpoint = 'bert-base-uncased'
tokenizer_cls = BertTokenizer.from_pretrained(bert_checkpoint)
model_cls = BertForSequenceClassification.from_pretrained(bert_checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Load the plain pre-trained BERT encoder (no classification head) and its tokenizer
# for the similarity embeddings. BertTokenizer and BertModel come from the notebook's
# top import cell rather than being re-imported mid-notebook.
tokenizer_sim = BertTokenizer.from_pretrained('bert-base-uncased')
model_sim = BertModel.from_pretrained('bert-base-uncased')


In [4]:
# Location of the labeled metadata CSV. The default is the original author's local
# path; set the LABELED_METADATA_CSV environment variable to run the notebook on
# another machine without editing this cell.
LABELED_METADATA_CSV = os.environ.get(
    'LABELED_METADATA_CSV',
    'C:\\Users\\Steffi Grace\\24592774_LLM_ILabResearch\\Notebooks\\Datasets\\Labeled_MetaData.csv',
)
Labeled_Metadata = pd.read_csv(LABELED_METADATA_CSV)

In [5]:
# Preprocess text data
def preprocess_text(text):
    """Return ``text`` lower-cased when it is a string, otherwise an empty string."""
    return text.lower() if isinstance(text, str) else ""

# Normalise every free-text column with preprocess_text, one column at a time.
text_columns = ['query', 'title', 'description', 'summary']
for column in text_columns:
    Labeled_Metadata[column] = Labeled_Metadata[column].map(preprocess_text)

In [6]:
# Re-number the rows with a fresh default index. Rebinding the result (instead of
# inplace=True) avoids mutating the frame in place across cells.
Labeled_Metadata = Labeled_Metadata.reset_index(drop=True)


In [7]:
# Build one space-joined text field per row (query, title, description, summary)
# and hold out 20% of the rows as a test split.
feature_columns = ['query', 'title', 'description', 'summary']
X_cls = Labeled_Metadata[feature_columns[0]]
for column_name in feature_columns[1:]:
    X_cls = X_cls + ' ' + Labeled_Metadata[column_name]
y_cls = Labeled_Metadata['label']
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
    X_cls,
    y_cls,
    test_size=0.2,
    random_state=42,
)


In [8]:
# Tokenize both splits with identical settings: pad, truncate, return PyTorch tensors.
tokenize_kwargs = {'padding': True, 'truncation': True, 'return_tensors': 'pt'}
X_train_tokens_cls = tokenizer_cls(X_train_cls.tolist(), **tokenize_kwargs)
X_test_tokens_cls = tokenizer_cls(X_test_cls.tolist(), **tokenize_kwargs)

In [9]:
# Turn the label Series of each split into a torch tensor.
y_train_cls_tensor = torch.tensor(y_train_cls.to_numpy())
y_test_cls_tensor = torch.tensor(y_test_cls.to_numpy())

In [10]:
# Bundle token ids, attention masks and labels into one TensorDataset per split.
train_dataset_cls = TensorDataset(
    X_train_tokens_cls['input_ids'],
    X_train_tokens_cls['attention_mask'],
    y_train_cls_tensor,
)
test_dataset_cls = TensorDataset(
    X_test_tokens_cls['input_ids'],
    X_test_tokens_cls['attention_mask'],
    y_test_cls_tensor,
)

In [11]:

# Show the two dataset objects (train first, then test).
for dataset_cls in (train_dataset_cls, test_dataset_cls):
    print(dataset_cls)

<torch.utils.data.dataset.TensorDataset object at 0x00000262AC29CAF0>
<torch.utils.data.dataset.TensorDataset object at 0x00000262748A02E0>


In [12]:
# Batched loaders over both datasets; only the training loader shuffles.
batch_size = 8
train_loader_cls = DataLoader(dataset=train_dataset_cls, batch_size=batch_size, shuffle=True)
test_loader_cls = DataLoader(dataset=test_dataset_cls, batch_size=batch_size, shuffle=False)

In [13]:
# Cross-entropy criterion and an AdamW optimizer over all classifier parameters.
learning_rate = 2e-5
criterion_cls = torch.nn.CrossEntropyLoss()
optimizer_cls = AdamW(params=model_cls.parameters(), lr=learning_rate)



In [14]:
# Fine-tune the classifier for a fixed number of epochs, logging the batch loss
# every 10 steps and the mean loss at the end of each epoch.
num_epochs = 3
for epoch in range(num_epochs):
    model_cls.train()  # switch to training mode for this epoch
    total_loss = 0.0
    print(f"Epoch {epoch+1}/{num_epochs}")
    for step, batch in enumerate(train_loader_cls):
        input_ids, attention_mask, labels = batch
        optimizer_cls.zero_grad()
        # Passing labels makes the model compute and return its own loss.
        outputs_cls = model_cls(input_ids, attention_mask=attention_mask, labels=labels)
        loss_cls = outputs_cls.loss
        loss_cls.backward()
        optimizer_cls.step()

        total_loss += loss_cls.item()

        if step % 10 == 0 and step > 0:
            print(f"Step {step}/{len(train_loader_cls)}, Loss: {loss_cls.item()}")

    # total_loss used to be accumulated and then discarded; report the epoch mean so
    # the noisy per-batch values above can be read against a stable number.
    average_loss = total_loss / max(len(train_loader_cls), 1)
    print(f"Epoch {epoch+1}/{num_epochs} average loss: {average_loss}")

Epoch 1/3
Step 10/128, Loss: 0.17760060727596283
Step 20/128, Loss: 0.08405087143182755
Step 30/128, Loss: 0.45002347230911255
Step 40/128, Loss: 0.058092888444662094
Step 50/128, Loss: 0.09061025083065033
Step 60/128, Loss: 0.4177345037460327
Step 70/128, Loss: 0.0660485327243805
Step 80/128, Loss: 0.44476690888404846
Step 90/128, Loss: 0.07182550430297852
Step 100/128, Loss: 0.3900478482246399
Step 110/128, Loss: 0.4117822051048279
Step 120/128, Loss: 0.07616524398326874
Epoch 2/3
Step 10/128, Loss: 0.056320562958717346
Step 20/128, Loss: 0.07138118147850037
Step 30/128, Loss: 1.1121132373809814
Step 40/128, Loss: 0.05707618221640587
Step 50/128, Loss: 0.7856000661849976
Step 60/128, Loss: 0.41835907101631165
Step 70/128, Loss: 0.40329042077064514
Step 80/128, Loss: 0.3917740285396576
Step 90/128, Loss: 0.7029182314872742
Step 100/128, Loss: 0.12035848945379257
Step 110/128, Loss: 0.09176396578550339
Step 120/128, Loss: 0.3963022232055664
Epoch 3/3
Step 10/128, Loss: 0.04481664672493

In [None]:
# Run the classifier over the test loader in eval mode, collecting the predicted
# class and the reference label for every example.
model_cls.eval()
predictions_cls, true_labels_cls = [], []
with torch.no_grad():
    for input_ids, attention_mask, labels in test_loader_cls:
        batch_logits = model_cls(input_ids, attention_mask=attention_mask).logits
        predictions_cls += batch_logits.argmax(dim=1).tolist()
        true_labels_cls += labels.tolist()

In [None]:
# Share of test examples whose predicted label equals the true label.
accuracy_cls = accuracy_score(y_true=true_labels_cls, y_pred=predictions_cls)
print("Classification Accuracy:", accuracy_cls)

Classification Accuracy: 0.9803921568627451


In [None]:
# Embed every metadata row: join title, description and summary, run the text
# through the similarity BERT model, and mean-pool the last hidden state over tokens.
metadata_embeddings = []
for _, record in Labeled_Metadata.iterrows():
    combined_text = ' '.join([record['title'], record['description'], record['summary']])
    encoded = tokenizer_sim(combined_text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        hidden_states = model_sim(**encoded).last_hidden_state
    metadata_embeddings.append(hidden_states.mean(dim=1).squeeze().numpy())

In [None]:
# Classify a free-text query typed by the user and print the predicted class id.
user_input = input("Enter your query: ")

# Make sure the model is in eval mode for inference; do not rely on an earlier
# cell having switched it there.
model_cls.eval()

# Tokenize user input for classification
user_input_tokens_cls = tokenizer_cls(user_input, padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    output_cls = model_cls(**user_input_tokens_cls)
# Take the arg-max over the class dimension explicitly instead of over the
# flattened logits tensor.
predicted_class = torch.argmax(output_cls.logits, dim=-1).item()

# Display predicted class
print(user_input)
print("Predicted Class:", predicted_class)

electric cars
Predicted Class: 0


In [None]:
# imblearn is imported here rather than in the top import cell because it is an
# optional dependency (install with `pip install imbalanced-learn`).
from imblearn.over_sampling import RandomOverSampler, SMOTE

# Random over-sampling of the classification training split. The resampler expects
# a 2-D feature input, so the text Series is wrapped in a one-column DataFrame.
# (The original cell referenced X_train / y_train, which are never defined.)
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train_cls.to_frame(name='text'), y_train_cls)

# SMOTE (Synthetic Minority Over-sampling Technique) is deliberately not applied here:
# it synthesises samples by interpolating numeric feature vectors and cannot be
# fitted on raw text. Apply it only to a numeric representation of the text (for
# example embeddings), and store the result under a different name so it does not
# overwrite the random over-sampling output above.


In [None]:
# Imports for the re-balanced training experiment. The dangling `from` statement
# (a SyntaxError) is completed with the two names this cell uses but did not
# import itself: accuracy_score and the imblearn samplers.
from transformers import BertForSequenceClassification, BertTokenizer, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler, SMOTE
import torch
import numpy as np
import pandas as pd

# Reload the tokenizer and a 2-label classifier from the pre-trained checkpoint.
checkpoint_name = 'bert-base-uncased'
tokenizer_cls = BertTokenizer.from_pretrained(checkpoint_name)
model_cls = BertForSequenceClassification.from_pretrained(checkpoint_name, num_labels=2)

# Work on a copy of the labeled metadata.
sample_data = Labeled_Metadata.copy(deep=True)

# Preprocess text data. `preprocess_text` is already defined earlier in this notebook
# with the same body, so it is reused here instead of being defined a second time.
for column in ['query', 'title', 'description', 'summary']:
    sample_data[column] = sample_data[column].apply(preprocess_text)

# Split the data into features (space-joined text fields) and labels
X_cls = sample_data['query'] + ' ' + sample_data['title'] + ' ' + sample_data['description'] + ' ' + sample_data['summary']
y_cls = sample_data['label']

# Split BEFORE balancing, so the test set keeps the original class distribution and
# never contains duplicates of training rows.
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(X_cls, y_cls, test_size=0.2, random_state=42)

# Balance the training split only. SMOTE interpolates numeric feature vectors and
# cannot be fitted on raw strings, so minority-class rows are duplicated with
# RandomOverSampler instead. It needs a 2-D input, hence the one-column DataFrame.
# imblearn is an optional dependency, so the import is kept next to its only use.
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_resampled_cls, y_resampled_cls = ros.fit_resample(X_train_cls.to_frame(name='text'), y_train_cls)
X_train_cls, y_train_cls = X_resampled_cls['text'], y_resampled_cls

# Tokenize text data for classification
X_train_tokens_cls = tokenizer_cls(X_train_cls.tolist(), padding=True, truncation=True, return_tensors='pt')
X_test_tokens_cls = tokenizer_cls(X_test_cls.tolist(), padding=True, truncation=True, return_tensors='pt')

# Create Tensor datasets. Labels are converted through their underlying arrays:
# the Series carry a shuffled index after the split, and positional conversion of
# such a Series by torch.tensor is not reliable.
train_dataset_cls = TensorDataset(X_train_tokens_cls.input_ids, X_train_tokens_cls.attention_mask, torch.tensor(y_train_cls.values))
test_dataset_cls = TensorDataset(X_test_tokens_cls.input_ids, X_test_tokens_cls.attention_mask, torch.tensor(y_test_cls.values))

# Batched loaders (only the training loader shuffles), then the loss criterion and
# an AdamW optimizer over all classifier parameters.
batch_size = 32  # examples per batch
train_loader_cls = DataLoader(dataset=train_dataset_cls, batch_size=batch_size, shuffle=True)
test_loader_cls = DataLoader(dataset=test_dataset_cls, batch_size=batch_size, shuffle=False)

criterion_cls = torch.nn.CrossEntropyLoss()
optimizer_cls = AdamW(params=model_cls.parameters(), lr=2e-5)  # learning rate

# Fine-tune the model: one optimizer step per batch, using the loss the model
# returns when it is given labels.
num_epochs = 5  # full passes over the training loader
for epoch in range(num_epochs):
    model_cls.train()
    for input_ids, attention_mask, labels in train_loader_cls:
        optimizer_cls.zero_grad()
        loss_cls = model_cls(input_ids, attention_mask=attention_mask, labels=labels).loss
        loss_cls.backward()
        optimizer_cls.step()

# Evaluate the model: predict a class for every test example in eval mode.
model_cls.eval()
predictions_cls, true_labels_cls = [], []
with torch.no_grad():
    for input_ids, attention_mask, labels in test_loader_cls:
        batch_logits = model_cls(input_ids, attention_mask=attention_mask).logits
        predictions_cls += batch_logits.argmax(dim=1).tolist()
        true_labels_cls += labels.tolist()

# Calculate accuracy as the share of predictions that match the true labels.
accuracy_cls = accuracy_score(y_true=true_labels_cls, y_pred=predictions_cls)
print("Classification Accuracy:", accuracy_cls)


ModuleNotFoundError: No module named 'imblearn'