# Import Libraries


In [1]:
import pandas as pd
import re

## Load the Raw Dataset


In [2]:

df = pd.read_csv("semiLabelledData.csv")

print("ðŸ”¹ Original shape:", df.shape)
print("ðŸ”¹ Columns available:", df.columns.tolist())

df.head()


ðŸ”¹ Original shape: (10658, 11)
ðŸ”¹ Columns available: ['Unnamed: 0.1', 'Unnamed: 0', 'req1', 'req1_id', 'req2', 'req2_id', 'similarity', 'cosine', 'BinaryClass', 'MultiClass', 'Id']


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,req1,req1_id,req2,req2_id,similarity,cosine,BinaryClass,MultiClass,Id
0,9999,9999,the current operational status shall be indica...,PURE_B001,the dmi shall indicate the result of the self-...,PURE_B004,9999,9999,1,9999,9999
1,9999,9999,isolation of the etcs trainborne equipment sha...,PURE_B002,the etcs trainborne equipment shall transmit i...,PURE_B029,9999,9999,0,9999,9999
2,9999,9999,the driver shall know the distance to the next...,PURE_B003,when the traction unit has come to a standstil...,PURE_B068,9999,9999,0,9999,9999
3,9999,9999,the driver shall know the distance to the next...,PURE_B003,the driver shall be able to use the reversing ...,PURE_B055,9999,9999,0,9999,9999
4,9999,9999,the dmi shall indicate the result of the self-...,PURE_B004,operation of the train trip shall be indicated...,PURE_B099,9999,9999,1,9999,9999


## Select Useful Columns

In [3]:
df = df[["req1", "req2", "BinaryClass"]].copy()

## Remove Missing / Invalid Data

In [4]:
df.dropna(subset=["req1", "req2", "BinaryClass"], inplace=True)
df = df[df["req1"].apply(lambda x: isinstance(x, str))]
df = df[df["req2"].apply(lambda x: isinstance(x, str))]

## Clean Text

In [5]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)      # remove URLs
    text = re.sub(r"[^a-z\s]", "", text)     # remove punctuation/numbers
    text = re.sub(r"\s+", " ", text).strip() # remove extra spaces
    return text

df["req1"] = df["req1"].apply(clean_text)
df["req2"] = df["req2"].apply(clean_text)

## Remove Duplicates & Invalid Labels


In [6]:
df = df[(df["req1"].str.len() > 3) & (df["req2"].str.len() > 3)]
df.drop_duplicates(subset=["req1", "req2"], inplace=True)
df.rename(columns={"BinaryClass": "label"}, inplace=True)
df = df[df["label"].isin([0, 1])]

## Save Cleaned Dataset

In [7]:
output_file = "cleaned_requirements.csv"
df.to_csv(output_file, index=False)

print(f"âœ… Cleaned dataset saved as '{output_file}'")
print("âœ… Total rows after cleaning:", len(df))
df.head()

âœ… Cleaned dataset saved as 'cleaned_requirements.csv'
âœ… Total rows after cleaning: 10418


Unnamed: 0,req1,req2,label
0,the current operational status shall be indica...,the dmi shall indicate the result of the selftest,1
1,isolation of the etcs trainborne equipment sha...,the etcs trainborne equipment shall transmit i...,0
2,the driver shall know the distance to the next...,when the traction unit has come to a standstil...,0
3,the driver shall know the distance to the next...,the driver shall be able to use the reversing ...,0
4,the dmi shall indicate the result of the selftest,operation of the train trip shall be indicated...,1


## Install & Import Libraries

In [8]:
# Install missing package(s) in the notebook environment
%pip install -q datasets



Note: you may need to restart the kernel to use updated packages.


In [9]:
%pip install -q transformers

Note: you may need to restart the kernel to use updated packages.


In [10]:
%pip install -q torch

Note: you may need to restart the kernel to use updated packages.


In [11]:
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score