# Final Year Project Topic Classifier with Unknown Topic Detection
This notebook demonstrates a hybrid approach combining supervised classification for known topics and open-set detection for unknown ones.

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import TextClassificationPipeline


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load and prepare data
df = pd.read_csv('data\\deepseek_sentences.csv', quotechar='"', encoding='utf-8', on_bad_lines='warn')
df['text'] = df['sentence']
df['label'] = df['positive_topic']

# Encode labels
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

# Train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['label_encoded'].tolist(), test_size=0.2, random_state=42
)

In [4]:
# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

train_dataset = Dataset(train_encodings, train_labels)
val_dataset = Dataset(val_encodings, val_labels)

In [5]:
# Load model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy='no'
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train
trainer.train()

Epoch,Training Loss,Validation Loss
1,3.7758,2.84706
2,2.6526,2.672204
3,2.1907,2.781902
4,2.7961,2.856775
5,2.3404,2.81443
6,2.4025,2.992105
7,1.9508,2.926044
8,1.9589,2.912133
9,1.9549,2.948062
10,1.7698,2.957731


TrainOutput(global_step=290, training_loss=2.4429030911675813, metrics={'train_runtime': 49.7135, 'train_samples_per_second': 46.667, 'train_steps_per_second': 5.833, 'total_flos': 26262559625280.0, 'train_loss': 2.4429030911675813, 'epoch': 10.0})

In [7]:
# Classification pipeline with confidence threshold for unknowns
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True, device=0 if torch.cuda.is_available() else -1)

def classify_with_unknown(text, threshold=0.5):
    result = pipe(text)[0]
    top = max(result, key=lambda x: x['score'])
    if top['score'] < threshold:
        return 'unknown'
    return label_encoder.inverse_transform([int(top['label'].split('_')[-1])])[0]

# Example usage
examples = [
    "I love working on Deep Learning projects related to computer vision.",
    "Quantum computing is fascinating, though it's not in our topic list.",
    "Machine Learning and NLP are where my passion lies.",
    "I'm excited about applying AI in healthcare for disease prediction.",
    "I'm really passionate about machine learning and natural language processing because they're at the forefront of AI innovation. I'd love to work on a chatbot project or develop a sentiment analysis tool for social media data. However, I want to steer clear of legacy system maintenance and basic CRUD applications since they don't offer much learning potential."
]
for ex in examples:
    print(f"Input: {ex}\nPredicted Topic: {classify_with_unknown(ex)}\n")

Device set to use cuda:0


Input: I love working on Deep Learning projects related to computer vision.
Predicted Topic: unknown

Input: Quantum computing is fascinating, though it's not in our topic list.
Predicted Topic: nan

Input: Machine Learning and NLP are where my passion lies.
Predicted Topic: unknown

Input: I'm excited about applying AI in healthcare for disease prediction.
Predicted Topic: unknown

Input: I'm really passionate about machine learning and natural language processing because they're at the forefront of AI innovation. I'd love to work on a chatbot project or develop a sentiment analysis tool for social media data. However, I want to steer clear of legacy system maintenance and basic CRUD applications since they don't offer much learning potential.
Predicted Topic: nan



