In [None]:
!pip install transformers
!pip install datasets
!pip install scikit-learn pandas

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xx

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.preprocessing import LabelEncoder
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; the dataset CSVs and the saved model
# directory used by later cells live under this mount point.
# Colab-only: `drive` is imported from google.colab.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Sanity check: list the top level of the mounted Drive.
!ls /content/drive/My\ Drive/

ColabNotebooks	list_formatted0.docx  table_formatted0.docx  tempus	    try.png
data		practice	      table_formatted.docx   tempus-sample
LayoutLMv3	sample_output.docx    table.py		     test


In [None]:
# Folder on the mounted Google Drive that holds the Stack Overflow question splits.
# Point data_path at a different Drive folder if the CSVs live elsewhere
# (keep the trailing slash: file names are appended by plain concatenation).
data_path = '/content/drive/My Drive/ColabNotebooks/Cluepoints/'
train_file = data_path + 'stack_overflow_questions_train.csv'
valid_file = data_path + 'stack_overflow_questions_valid.csv'
test_file = data_path + 'stack_overflow_questions_test.csv'

# Read the three splits (train, validation, test - in that order) into DataFrames.
train_df, valid_df, test_df = map(pd.read_csv, (train_file, valid_file, test_file))

In [None]:
# Combine title and body for text processing
def preprocess_text(row):
    """Join a question's title and body into a single string for tokenization.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with 'Title' and
            'Body' entries.

    Returns:
        The title and body separated by one space. A missing value
        (None / NaN) is treated as an empty string instead of raising
        TypeError during concatenation.
    """
    title, body = row['Title'], row['Body']
    # pd.read_csv yields float NaN for empty cells; str + float would raise.
    title = "" if pd.isna(title) else str(title)
    body = "" if pd.isna(body) else str(body)
    return title + " " + body

# Add the combined title+body column to every split (row-wise via preprocess_text).
train_df['text'], valid_df['text'], test_df['text'] = (
    frame.apply(preprocess_text, axis=1) for frame in (train_df, valid_df, test_df)
)

# Encode labels
# The encoder is fitted on the training split only; validation and test reuse
# that same class -> integer mapping.
label_encoder = LabelEncoder()
train_df['label'] = label_encoder.fit_transform(train_df['Y'])
valid_df['label'], test_df['label'] = (
    label_encoder.transform(frame['Y']) for frame in (valid_df, test_df)
)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Create Dataset Class
class StackOverflowDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Each item is a tuple ``(input_ids, attention_mask, label)`` where the
    first two come from calling ``tokenizer`` on the text (truncated and
    padded to ``max_length``) and ``label`` is a ``torch.long`` tensor.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        """Keep references to the inputs; nothing is tokenized up front.

        Args:
            texts: Indexable collection of strings.
            labels: Indexable collection of labels, aligned with ``texts``.
            tokenizer: Callable invoked as
                ``tokenizer(text, truncation=..., padding=..., max_length=..., return_tensors=...)``
                and returning a mapping with 'input_ids' and 'attention_mask'.
            max_length: Length every sequence is truncated / padded to.
        """
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Dataset size, taken from the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text and return it together with its label."""
        tokenized = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        # return_tensors='pt' adds a leading batch axis; drop it so that the
        # DataLoader can stack individual items into a batch itself.
        input_ids = tokenized['input_ids'].squeeze(0)
        attention_mask = tokenized['attention_mask'].squeeze(0)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return input_ids, attention_mask, label

# Prepare datasets and loaders
# Each split's text/label columns are handed over as plain Python lists.
train_dataset, valid_dataset, test_dataset = (
    StackOverflowDataset(frame['text'].tolist(), frame['label'].tolist(), tokenizer)
    for frame in (train_df, valid_df, test_df)
)

BATCH_SIZE = 16
# Only the training loader shuffles; the evaluation loaders keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader, test_loader = (
    DataLoader(split, batch_size=BATCH_SIZE) for split in (valid_dataset, test_dataset)
)

# Load Pretrained BERT Model
# One output unit per class known to the fitted label encoder.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_encoder.classes_),
)
model.to(device)

# NOTE(review): criterion is not referenced by the training loop visible in this
# notebook, which reads the loss from the model output instead - confirm before removing.
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:


def _count_correct(model, loader, device):
    """Run `model` over `loader` without gradients and tally argmax hits.

    Args:
        model: Sequence classifier whose forward output exposes `.logits`.
        loader: Iterable yielding (input_ids, attention_mask, labels) batches.
        device: Device every batch is moved to before the forward pass.

    Returns:
        (correct, total): number of correct predictions and number of
        examples seen.
    """
    # Switch to eval mode here so each evaluation is self-contained rather
    # than depending on an earlier model.eval() call.
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for input_ids, attention_mask, labels in loader:
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)
    return correct, total


# Training Loop
EPOCHS = 5
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for input_ids, attention_mask, labels in train_loader:
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        # Passing labels makes the model compute the classification loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean per-batch loss over the epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")

# Validation Loop
correct, total = _count_correct(model, valid_loader, device)
print(f"Validation Accuracy: {correct / total:.4f}")

# Testing Loop
correct, total = _count_correct(model, test_loader, device)
print(f"Test Accuracy: {correct / total:.4f}")

Epoch 1, Loss: 0.2566
Epoch 2, Loss: 0.1578
Epoch 3, Loss: 0.1043
Epoch 4, Loss: 0.0619
Epoch 5, Loss: 0.0363
Validation Accuracy: 0.9265
Test Accuracy: 0.9265


In [None]:
# Persist the fine-tuned model and the tokenizer into the same folder under data_path;
# a later cell restores both from this folder with from_pretrained().
model.save_pretrained(data_path + "bert_finetuned_stackoverflow")
tokenizer.save_pretrained(data_path + "bert_finetuned_stackoverflow")

('/content/drive/My Drive/ColabNotebooks/Cluepoints/bert_finetuned_stackoverflow/tokenizer_config.json',
 '/content/drive/My Drive/ColabNotebooks/Cluepoints/bert_finetuned_stackoverflow/special_tokens_map.json',
 '/content/drive/My Drive/ColabNotebooks/Cluepoints/bert_finetuned_stackoverflow/vocab.txt',
 '/content/drive/My Drive/ColabNotebooks/Cluepoints/bert_finetuned_stackoverflow/added_tokens.json')

In [None]:
# Baseline: the stock bert-base-uncased checkpoint without any fine-tuning.
# Its classification head is newly initialized (see the warning this cell emits),
# so the score serves as an untrained reference for the fine-tuned model.
model_old = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_encoder.classes_))
model_old.to(device)
model_old.eval()

test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Reset the counters. Without this they carry over from the previous evaluation
# loop and the earlier model's hits get blended into the baseline accuracy.
correct = 0
total = 0
with torch.no_grad():
    for input_ids, attention_mask, labels in test_loader:
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = model_old(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)
        correct += (predictions == labels).sum().item()
        total += labels.size(0)

print(f"Test Pretrained Accuracy: {correct / total:.4f}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test Pretrained Accuracy: 0.6223


In [None]:
# Load the finetuned model
# Both the weights and the tokenizer are restored from the folder written by the save cell.
model = BertForSequenceClassification.from_pretrained(data_path + "bert_finetuned_stackoverflow")
tokenizer = BertTokenizer.from_pretrained(data_path + "bert_finetuned_stackoverflow")
model.to(device)
model.eval()

# Testing Loop with Example Outputs
MAX_EXAMPLES_PER_GROUP = 3  # examples to keep per group (correct / incorrect)
correct = 0
total = 0
positive_sample_outputs = []  # correctly classified examples
negative_sample_outputs = []  # misclassified examples

test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

with torch.no_grad():
    for input_ids, attention_mask, labels in test_loader:
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)
        correct += (predictions == labels).sum().item()
        total += labels.size(0)

        # Store sample outputs
        # Once both groups are full there is nothing left to collect; keep
        # iterating only to finish the accuracy count.
        if (len(positive_sample_outputs) >= MAX_EXAMPLES_PER_GROUP
                and len(negative_sample_outputs) >= MAX_EXAMPLES_PER_GROUP):
            continue

        # Pull predictions/labels to Python ints once per batch instead of
        # comparing device tensors element by element.
        for token_ids, predicted, actual in zip(input_ids, predictions.tolist(), labels.tolist()):
            bucket = positive_sample_outputs if predicted == actual else negative_sample_outputs
            if len(bucket) < MAX_EXAMPLES_PER_GROUP:
                bucket.append({
                    # Decoding the (truncated, padded) ids gives the text as the model saw it.
                    'Text': tokenizer.decode(token_ids, skip_special_tokens=True),
                    'Predicted': label_encoder.inverse_transform([predicted])[0],
                    'Actual': label_encoder.inverse_transform([actual])[0],
                })

print(f"Test Finetuned Accuracy: {correct / total:.4f}")

# Show example outputs
print("\nPositive Sample Outputs:")
for example in positive_sample_outputs:
    print(f"\nText: {example['Text']}\nPredicted: {example['Predicted']}\nActual: {example['Actual']}")

print("\nNegative Sample Outputs:")
for example in negative_sample_outputs:
    print(f"\nText: {example['Text']}\nPredicted: {example['Predicted']}\nActual: {example['Actual']}")

Test Finetuned Accuracy: 0.9265

Positive Sample Outputs:

Text: how to get all the child records from different tables based on given parent id in sql server i am having 4 different tables like select * from system select * from set select * from item select * from versions now for each system id there will be * * n no. of sets * *, and foe * * each set * * there qill be * * n no. of items * * and for * * each item * * there will be * * n no. of versions * *. * * each system has < br / > n no of set < br / > each set has < br / > n
Predicted: LQ_EDIT
Actual: LQ_EDIT

Text: retrieve all except some data of the another table i have two table m _ master and tbl _ appointment [ this is tbl _ appointment table ] [ 1 ] [ this is m _ master table ] [ 2 ] [ 1 ] : http : / / i. stack. imgur. com / c7f7y. png [ 2 ] : http : / / i. stack. imgur. com / hd6qf. png
Predicted: LQ_EDIT
Actual: LQ_EDIT

Text: pandas : read _ html < p > i'm trying to extract us states from wiki url, and for which i'm u