# Final Year Project Topic Classifier with Unknown Topic Detection
This notebook demonstrates a hybrid approach combining supervised classification for known topics and open-set detection for unknown ones.

In [17]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import TextClassificationPipeline


In [18]:
# Load and prepare data
df = pd.read_csv('data\\deepseek_sentences.csv', quotechar='"', encoding='utf-8', on_bad_lines='warn')
df['text'] = df['sentence']
df['label'] = df['positive_topic']

# Encode labels
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

# Train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['label_encoded'].tolist(), test_size=0.2, random_state=42
)

In [19]:
# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

train_dataset = Dataset(train_encodings, train_labels)
val_dataset = Dataset(val_encodings, val_labels)

In [20]:
# Load model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy='no'
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train
trainer.train()



Epoch,Training Loss,Validation Loss
1,3.0603,3.069764
2,1.8529,2.160371
3,1.2268,1.65041
4,0.8261,1.350386
5,0.5821,1.164469
6,0.6295,1.077422
7,0.4657,1.056893
8,0.4146,1.030629
9,0.3902,1.015885
10,0.2678,1.011382


TrainOutput(global_step=410, training_loss=1.0807020135042145, metrics={'train_runtime': 592.0794, 'train_samples_per_second': 5.455, 'train_steps_per_second': 0.692, 'total_flos': 33211522279200.0, 'train_loss': 1.0807020135042145, 'epoch': 10.0})

In [24]:
# Classification pipeline with confidence threshold for unknowns
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True, device=0 if torch.cuda.is_available() else -1)

def classify_with_unknown(text, threshold=0.5):
    result = pipe(text)[0]
    top = max(result, key=lambda x: x['score'])
    if top['score'] < threshold:
        return 'unknown'
    return label_encoder.inverse_transform([int(top['label'].split('_')[-1])])[0]

# Example usage
examples = [
    "I love working on Deep Learning projects related to computer vision.",
    "Quantum computing is fascinating, though it's not in our topic list.",
    "Machine Learning and NLP are where my passion lies.",
    "I'm excited about applying AI in healthcare for disease prediction."
]
for ex in examples:
    print(f"Input: {ex}\nPredicted Topic: {classify_with_unknown(ex)}\n")

Device set to use cpu


Input: I love working on Deep Learning projects related to computer vision.
Predicted Topic: Computer vision and image processing

Input: Quantum computing is fascinating, though it's not in our topic list.
Predicted Topic: Quantum Computing

Input: Machine Learning and NLP are where my passion lies.
Predicted Topic: Natural Language Processing

Input: I'm excited about applying AI in healthcare for disease prediction.
Predicted Topic: unknown



