In [92]:
import pandas as pd
import torch
import transformers
from mpl_toolkits.mplot3d.proj3d import transform
from transformers import BertTokenizer, BertModel
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

In [93]:
print(transformers.__version__)

4.46.2


In [94]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


Using device: cuda


In [95]:
file_path = 'questions.csv'  # Replace with your CSV file path
df = pd.read_csv(file_path)

In [96]:
questions = df['Question Text'].tolist()
labels = df['Chapter'].tolist()


In [97]:
label_map = {label: idx for idx, label in enumerate(set(labels))}
map_to_label ={idx:label for idx, label in enumerate(set(labels))}
labels = [label_map[label] for label in labels]
num_classes = len(label_map)

# Load BERT tokenizer and define constants
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 64
batch_size = 8
learning_rate = 1e-5
num_epochs = 10

In [98]:
class QuestionDataset(Dataset):
    def __init__(self, questions, labels, tokenizer, max_length):
        self.questions = questions
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        label = self.labels[idx]
        
        encoding = (self.tokenizer.encode_plus(
            question,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        ))
        
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(label, dtype=torch.long)
        }


In [99]:
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.pooler_output
        return self.fc(cls_output)

In [100]:
model = BERTClassifier('bert-base-uncased', num_classes).to(device)

In [101]:
dataset = QuestionDataset(questions, labels, tokenizer, max_length)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [102]:
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [103]:
print(transformers.__version__)

4.46.2


In [104]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        # Move batch data to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {avg_train_loss}")

Epoch 1/10, Training Loss: 2.776620243109909
Epoch 2/10, Training Loss: 2.6627495709587548
Epoch 3/10, Training Loss: 2.5324043713363946
Epoch 4/10, Training Loss: 2.3613462868858788
Epoch 5/10, Training Loss: 2.173934487735524
Epoch 6/10, Training Loss: 1.962233216154809
Epoch 7/10, Training Loss: 1.7435327791700177
Epoch 8/10, Training Loss: 1.5568605895135916
Epoch 9/10, Training Loss: 1.326779702130486
Epoch 10/10, Training Loss: 1.143190967101677


In [105]:
def predict_question(question, model, tokenizer, device, max_length=32):
    # Preprocess the question (tokenize)
    encoding = tokenizer.encode_plus(
        question,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    # Put model in evaluation mode
    model.eval()

    with torch.no_grad():  # No gradient computation for inference
        # Get model output (logits)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        
        # Get the predicted class by finding the index with the maximum value
        _, predicted_class = torch.max(outputs, 1)

    return predicted_class.item()

In [106]:
label_map

{'Object-Oriented Software Development': 0,
 'Operator Overloading': 1,
 'Virtual Functions': 2,
 'Objects and Classes': 3,
 'The Big Picture': 4,
 'Inheritance': 5,
 'Streams and Files': 6,
 'Multifile Programs': 7,
 'Templates and Exceptions': 8,
 'Pointers': 9,
 'Arrays and Strings': 10,
 'Functions': 11,
 'Loops and Decisions': 12,
 'Structures': 13,
 'C++ Programming Basics': 14,
 'The Standard Template Library': 15}

In [119]:
question = "what is a friend function?"

In [120]:

predicted_class = predict_question(question, model, tokenizer, device)
print(f"Predicted class: {predicted_class}")
print(f'{map_to_label[predicted_class]}')

Predicted class: 2
Virtual Functions
