In [352]:
import pandas as pd
import torch
import transformers
from mpl_toolkits.mplot3d.proj3d import transform
from transformers import BertTokenizer, BertModel
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

In [353]:
print(transformers.__version__)

4.46.2


In [354]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"Using : {torch.cuda.get_device_name(0)}")


Using device: cuda
Using : NVIDIA GeForce RTX 3050 Laptop GPU


In [355]:
file_path = 'final_questions.csv'  # Replace with your CSV file path
df = pd.read_csv(file_path)

In [356]:
questions = df['Question Text'].tolist()
labels = df['Chapter_name'].tolist()


In [357]:
len(questions)

1532

In [358]:
label_map = {label: idx for idx, label in enumerate(set(labels))}
map_to_label ={idx:label for idx, label in enumerate(set(labels))}
labels = [label_map[label] for label in labels]
num_classes = len(label_map)

# Load BERT tokenizer and define constants
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 64
batch_size = 8
# 1e-5 =55%
#3e-5 = 65%
#8e-5 =67%
learning_rate = 4e-5
num_epochs = 13

In [359]:
class QuestionDataset(Dataset):
    def __init__(self, questions, labels, tokenizer, max_length):
        self.questions = questions
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        label = self.labels[idx]
        
        encoding = (self.tokenizer.encode_plus(
            question,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        ))
        
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(label, dtype=torch.long)
        }


In [360]:
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.pooler_output
        return self.fc(cls_output)

In [361]:
model = BERTClassifier('bert-base-uncased', num_classes).to(device)

In [362]:
dataset = QuestionDataset(questions, labels, tokenizer, max_length)
train_size = int(0.8* len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [363]:
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [364]:
print(transformers.__version__)

4.46.2


In [365]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {avg_train_loss}")

Epoch 1/13, Training Loss: 3.0810184850321187
Epoch 2/13, Training Loss: 2.3808476181773397
Epoch 3/13, Training Loss: 1.7897256914671364
Epoch 4/13, Training Loss: 1.1909023055008479
Epoch 5/13, Training Loss: 0.6818510725513681
Epoch 6/13, Training Loss: 0.35050326130993953
Epoch 7/13, Training Loss: 0.18312316801544135
Epoch 8/13, Training Loss: 0.10658546390810184
Epoch 9/13, Training Loss: 0.060723251354723395
Epoch 10/13, Training Loss: 0.03868163166591873
Epoch 11/13, Training Loss: 0.027698989425386702
Epoch 12/13, Training Loss: 0.022375932785791235
Epoch 13/13, Training Loss: 0.0186339944976007


In [366]:
def predict_question(question, model, tokenizer, device, max_length=32):
    # Preprocess the question (tokenize)
    encoding = tokenizer.encode_plus(
        question,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    model.eval()

    with torch.no_grad():  
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, predicted_class = torch.max(outputs, 1)

    return predicted_class.item()

In [367]:
label_map

{'Copy Control': 0,
 'Streams and IO Library': 1,
 'Multifile Programs': 2,
 'Variable and Basic types': 3,
 'Expressions': 4,
 'Generic Algorithms and STL': 5,
 'Functions': 6,
 'Generic Algorithms': 7,
 'Templates ': 8,
 'Getting started': 9,
 'Statements': 10,
 'Pointers and Dynamic Memory': 11,
 'Tools for Large Programs': 12,
 'Objects and Classes': 13,
 'Virtual Functions': 14,
 'Strings, Vectors, and Arrays': 15,
 'Structures': 16,
 'C++ Programming Basics': 17,
 'Templates': 18,
 'Object-Oriented Programming': 19,
 'String,Vectors, and Arrays': 20,
 'Loops and Decisions': 21,
 'Specialised Tools and Techniques': 22,
 'Associative Containers': 23,
 'Specialised Library Facilities': 24,
 'Inheritance': 25,
 'Pointers': 26,
 'Operator Overloading': 27,
 'Sequential Containers': 28}

In [368]:
question = "How can you implement a template?"

In [369]:

predicted_class = predict_question(question, model, tokenizer, device)
print(f"Predicted class: {predicted_class}")
print(f'{map_to_label[predicted_class]}')

Predicted class: 18
Templates


In [370]:
def calculate_accuracy(model, dataloader, device):
    model.eval()  # Set the model to evaluation mode
    total_samples = 0
    correct_predictions = 0

    with torch.no_grad():  # Disable gradient computation
        for batch in dataloader:
            # Move inputs and labels to the device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            # Get model predictions
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, predictions = torch.max(outputs, 1)  # Get the index of the highest logit
            
            # Count correct predictions
            correct_predictions += (predictions == labels).sum().item()
            total_samples += labels.size(0)

    # Calculate accuracy
    accuracy = correct_predictions / total_samples
    return accuracy


In [371]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

val_accuracy = calculate_accuracy(model, val_loader, device)
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")

Validation Accuracy: 67.43%
