In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
import torch.nn as nn   
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

In [15]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"Using : {device}")


Using device: cuda:0
Using : cuda:0


In [16]:
file_path = 'final_questions.csv' 
df = pd.read_csv(file_path)

In [17]:
print(df.head())
print(f"The length of dataframe is :{df.shape[0]}")

                                       Question Text     Chapter_name
0                     1. Pascal, BASIC, and C are p.  Getting started
1  2. A widget is to the blueprint for a widget a...  Getting started
2       3. The two major components of an object are  Getting started
3  4. In C++, a function contained within a class...  Getting started
4  5. Protecting data from access by unauthorized...  Getting started
The length of dataframe is :1532


In [18]:
questions = df['Question Text'].tolist()
labels = df['Chapter_name'].tolist()


In [19]:
label_map = {label: idx for idx, label in enumerate(set(labels))}
map_to_label ={idx:label for idx, label in enumerate(set(labels))}
labels = [label_map[label] for label in labels]
num_classes = len(label_map)
# Load BERT tokenizer and define constants
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [20]:
#Super Parameters
max_length = 64
batch_size = 8
learning_rate = 4e-5 # (1e-5 =55%) (3e-5 = 65%) (8e-5=67%)
num_epochs = 13

In [21]:
#Intializing a question_dataset 
class QuestionDataset(Dataset):
    def __init__(self, questions, labels, tokenizer, max_length):
        self.questions = questions
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        label = self.labels[idx]
        
        encoding = (self.tokenizer.encode_plus(
            question,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        ))
        
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(label, dtype=torch.long)
        }


In [22]:
# BERTClassifier  bert model to connected to a linear neural network to predict class
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.pooler_output
        return self.fc(cls_output)

In [23]:
model = BERTClassifier('bert-base-uncased', num_classes).to(device)

In [24]:
dataset = QuestionDataset(questions, labels, tokenizer, max_length)
train_size = int(0.8* len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [25]:
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)# why we used adamW
#why we used cross entropy loss
criterion = nn.CrossEntropyLoss()

In [26]:
#Training the model
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        optimizer.zero_grad()#what is zero_grad
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {avg_train_loss}")

Epoch 1/13, Training Loss: 3.178834300536614
Epoch 2/13, Training Loss: 2.338138468853839
Epoch 3/13, Training Loss: 1.521136522438232
Epoch 4/13, Training Loss: 0.9190841750665144
Epoch 5/13, Training Loss: 0.46990196150425195
Epoch 6/13, Training Loss: 0.2335159226574681
Epoch 7/13, Training Loss: 0.1308491174373534
Epoch 8/13, Training Loss: 0.08726094286985599
Epoch 9/13, Training Loss: 0.0994282330555672
Epoch 10/13, Training Loss: 0.04530196473631379
Epoch 11/13, Training Loss: 0.03714738613260644
Epoch 12/13, Training Loss: 0.03187484273614435
Epoch 13/13, Training Loss: 0.015257569223448827


In [27]:
!nvidia-smi

Tue May 20 02:13:05 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 572.83                 Driver Version: 572.83         CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4050 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   63C    P0             23W /  137W |    2461MiB /   6141MiB |     94%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
pip install playsound

In [43]:
# from threading import Thread
# from playsound import playsound
# import time
# audio_file = 'alarm.mp3'
# def play_sound():
#     while True:
#         playsound(audio_file)
# sound_thread = Thread(target=play_sound, daemon=True)
# sound_thread.start()
# time.sleep(10)

# print("Finished playing sound for 10 seconds!")


In [29]:
torch.save(model, 'model_complete.pth') 

In [30]:
def predict_question(question, model, tokenizer, device, max_length=32):
    # Preprocess the question (tokenize)
    encoding = tokenizer.encode_plus(
        question,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    model.eval()

    with torch.no_grad():  
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, predicted_class = torch.max(outputs, 1)

    return predicted_class.item()

In [31]:
label_map

{'Operator Overloading': 0,
 'Pointers and Dynamic Memory': 1,
 'Templates ': 2,
 'Loops and Decisions': 3,
 'Pointers': 4,
 'Strings, Vectors, and Arrays': 5,
 'C++ Programming Basics': 6,
 'Virtual Functions': 7,
 'Inheritance': 8,
 'Generic Algorithms and STL': 9,
 'Getting started': 10,
 'Structures': 11,
 'Specialised Library Facilities': 12,
 'Multifile Programs': 13,
 'Functions': 14,
 'Templates': 15,
 'Generic Algorithms': 16,
 'Specialised Tools and Techniques': 17,
 'Objects and Classes': 18,
 'Object-Oriented Programming': 19,
 'Tools for Large Programs': 20,
 'Copy Control': 21,
 'Variable and Basic types': 22,
 'Streams and IO Library': 23,
 'Expressions': 24,
 'Associative Containers': 25,
 'String,Vectors, and Arrays': 26,
 'Statements': 27,
 'Sequential Containers': 28}

In [32]:
question = "How can you implement a template?"

In [33]:

predicted_class = predict_question(question, model, tokenizer, device)
print(f"Predicted class: {predicted_class}")
print(f'{map_to_label[predicted_class]}')

Predicted class: 15
Templates


In [34]:
def calculate_accuracy(model, dataloader, device):
    model.eval()  # Set the model to evaluation mode
    total_samples = 0
    correct_predictions = 0

    with torch.no_grad():  # Disable gradient computation
        for batch in dataloader:
            # Move inputs and labels to the device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            # Get model predictions
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, predictions = torch.max(outputs, 1)  # Get the index of the highest logit
            
            # Count correct predictions
            correct_predictions += (predictions == labels).sum().item()
            total_samples += labels.size(0)

    # Calculate accuracy
    accuracy = correct_predictions / total_samples
    return accuracy


In [35]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

val_accuracy = calculate_accuracy(model, val_loader, device)
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")

Validation Accuracy: 66.78%


In [45]:
file_path = 'final_questions.csv' 
df = pd.read_csv(file_path)
def analyze_chapter_weightage(df):
    # Get chapter-wise question count
    chapter_counts = df['Chapter_name'].value_counts()
    
    # Calculate percentages
    total_questions = len(df)
    chapter_weightage = (chapter_counts / total_questions * 100).round(2)
    
    # Create a DataFrame with the analysis
    analysis_df = pd.DataFrame({
        'Chapter': chapter_counts.index,
        'Number_of_Questions': chapter_counts.values,
        'Weightage_Percentage': chapter_weightage.values
    })
    
    # Sort by weightage in descending order
    analysis_df = analysis_df.sort_values('Weightage_Percentage', ascending=False)
    
    # Save to CSV
    output_file = 'chapter_weightage_analysis.csv'
    analysis_df.to_csv(output_file, index=False)
    
    # Display the analysis
    print("Chapter-wise Question Distribution:")
    print("=" * 60)
    print(analysis_df)
    print("\nAnalysis has been saved to:", output_file)
    
    return analysis_df

# Run the analysis
weightage_analysis = analyze_chapter_weightage(df)

Chapter-wise Question Distribution:
                             Chapter  Number_of_Questions  \
0                          Functions                  148   
1                Objects and Classes                  135   
2               Operator Overloading                   85   
3                        Expressions                   76   
4       Strings, Vectors, and Arrays                   75   
5              Sequential Containers                   74   
6        Object-Oriented Programming                   70   
7                          Templates                   67   
8                       Copy Control                   57   
9             Streams and IO Library                   57   
10                        Statements                   50   
11                 Virtual Functions                   43   
12                          Pointers                   43   
13                Generic Algorithms                   42   
14          Variable and Basic types             