# **1. Data Loading**

In [None]:
!pip install simpletransformers

import pandas as pd
import torch
from sklearn.model_selection import train_test_split


Collecting simpletransformers
  Downloading simpletransformers-0.70.1-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m41.0/42.4 kB[0m [31m21.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m817.0 kB/s[0m eta [36m0:00:00[0m
Collecting datasets (from simpletransformers)
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboardx (from simpletransformers)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting streamlit 

In [None]:
# Load the labelled dataset; the CSV is expected in the working directory.
data = pd.read_csv('NLP_Dataset_Extended.csv')

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() emits a stray "None" after the summary.
data.info()
print(data['Prediction'].value_counts())

# Hold out 20% of the rows for validation. Stratifying on the label keeps the
# class proportions of the two splits equal, and random_state makes the split
# repeatable across runs.
train_data, val_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data['Prediction'],
)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Input       2000 non-null   object
 1   Prediction  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB
None
Prediction
low-risk     1000
high-risk    1000
Name: count, dtype: int64


In [None]:
# Reshape each split into the two-column layout handed to SimpleTransformers:
# 'text' (the model input) and 'labels' (the high-risk / low-risk target).
def _to_text_label_frame(split):
    """Return a new frame exposing the split's Input/Prediction columns as text/labels."""
    return pd.DataFrame({'text': split['Input'], 'labels': split['Prediction']})


train_df = _to_text_label_frame(train_data)
val_df = _to_text_label_frame(val_data)

# Snapshot of the raw string labels before they are converted to integers.
print("Before mapping - train_df columns:", train_df.columns)
print("Before mapping - val_df columns:", val_df.columns)
print("Before mapping - unique labels in train_df:", train_df['labels'].unique())
print("Before mapping - unique labels in val_df:", val_df['labels'].unique())

# String label -> integer class id. Any label missing from this mapping
# becomes NaN, which is reported below and then dropped.
label_mapping = {"low-risk": 0, "high-risk": 1}

train_df = train_df.assign(labels=train_df['labels'].map(label_mapping))
val_df = val_df.assign(labels=val_df['labels'].map(label_mapping))

print("After mapping - unique labels in train_df:", train_df['labels'].unique())
print("After mapping - unique labels in val_df:", val_df['labels'].unique())

print("Any NaN in train_df labels after mapping:", train_df['labels'].isna().any())
print("Any NaN in val_df labels after mapping:", val_df['labels'].isna().any())

# Discard rows whose label could not be mapped, then store labels as integers.
train_df = train_df.dropna(subset=['labels']).astype({'labels': int})
val_df = val_df.dropna(subset=['labels']).astype({'labels': int})

print("Sample of training data:")
print(train_df.head())
print("\nSample of validation data:")
print(val_df.head())

Before mapping - train_df columns: Index(['text', 'labels'], dtype='object')
Before mapping - val_df columns: Index(['text', 'labels'], dtype='object')
Before mapping - unique labels in train_df: ['high-risk' 'low-risk']
Before mapping - unique labels in val_df: ['high-risk' 'low-risk']
After mapping - unique labels in train_df: [1 0]
After mapping - unique labels in val_df: [1 0]
Any NaN in train_df labels after mapping: False
Any NaN in val_df labels after mapping: False
Sample of training data:
                                                  text  labels
968  Patient is a 37-year-old female with total cho...       1
240  Patient is a 80-year-old female with total cho...       1
819  Patient is a 55-year-old male with total chole...       0
692  Patient is a 45-year-old male with total chole...       0
420  Patient is a 20-year-old male with total chole...       0

Sample of validation data:
                                                   text  labels
1860  Patient is a 21-year-

# **2. Text Preprocessing**

In [None]:
import re

def clean_health_text(text):
    """Normalize a free-text patient description.

    The text is lowercased, every character that is not a word character,
    whitespace, '/', '.' or '-' is removed, runs of whitespace are collapsed
    to a single space, and leading/trailing whitespace is trimmed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    lowered = text.lower()
    # Slashes, dots and hyphens are kept so values such as "140/90" and
    # "45-year-old" survive; other punctuation is dropped.
    filtered = re.sub(r'[^\w\s\d/.-]', '', lowered)
    return re.sub(r'\s+', ' ', filtered).strip()

# Run the cleaner over the text column of both splits so training and
# validation inputs share the same normalization.
train_df = train_df.assign(text=lambda frame: frame['text'].apply(clean_health_text))
val_df = val_df.assign(text=lambda frame: frame['text'].apply(clean_health_text))

print("Sample of cleaned training data:")
print(train_df.head())

Sample of cleaned training data:
                                                  text  labels
968  patient is a 62-year-old female with total cho...       1
240  patient is a 74-year-old female with total cho...       1
819  patient is a 52-year-old female with total cho...       0
692  patient is a 41-year-old female with total cho...       0
420  patient is a 42-year-old male with total chole...       1


# **3. Fine-Tuning DistilBERT and RoBERTa Classifiers**

**Training the DistilBERT model**

In [None]:
from simpletransformers.classification import ClassificationModel

# Use the GPU only when one is actually present. With a hard-coded
# use_cuda=True the model constructor fails on CPU-only runtimes.
use_cuda = torch.cuda.is_available()

# Settings shared by the classifiers trained in this notebook.
training_args = {
    "num_train_epochs": 5,          # More epochs for better convergence
    "train_batch_size": 64,         # Larger batch size for stable gradients
    "eval_batch_size": 64,          # Match for evaluation
    "max_seq_length": 256,          # Longer sequences to capture full context
    "learning_rate": 2e-5,          # Slightly lower LR for finer updates
    "fp16": use_cuda,               # Mixed precision only when CUDA is available
    "overwrite_output_dir": True,   # Overwrite previous outputs
    "evaluate_during_training": True,  # Monitor validation performance
    "use_early_stopping": True,     # Stop if no improvement
    # Apply the early-stopping check to the end-of-epoch evaluations as well.
    # With only 25 steps per epoch, the end-of-epoch evaluations are the only
    # ones this run is expected to reach, so without this flag the patience
    # setting below would have no effect.
    "early_stopping_consider_epochs": True,
    "early_stopping_patience": 2,   # Evaluations without improvement tolerated before stopping
    "early_stopping_delta": 0.01,   # Minimum improvement to count
    "adam_epsilon": 1e-8,           # Default optimizer stability
    # NOTE(review): the run logs 125 optimizer steps in total (25 per epoch),
    # so a 100-step warmup spans four of the five epochs -- confirm intended.
    "warmup_steps": 100,            # Gradual LR warmup for stability
    "weight_decay": 0.01,           # Regularization to prevent overfitting
    "manual_seed": 42,              # Fixed seed so training runs are repeatable
    "output_dir": "outputs/",       # Temporary local directory
    "best_model_dir": "outputs/best_model/"  # Save best model
}

print("Training DistilBERT model...")
bert_model = ClassificationModel(
    "distilbert",
    "distilbert-base-uncased",
    num_labels=2,
    args=training_args,
    use_cuda=use_cuda,
)

# Fine-tune on the training split and evaluate on the validation split while
# training; the returned (global_step, training details) is shown as output.
bert_model.train_model(train_df, eval_df=val_df)

Training DistilBERT model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 5:   0%|          | 0/25 [00:00<?, ?it/s]

  with amp.autocast():


0it [00:00, ?it/s]

  with amp.autocast():


Running Epoch 2 of 5:   0%|          | 0/25 [00:00<?, ?it/s]

  with amp.autocast():


0it [00:00, ?it/s]

  with amp.autocast():


Running Epoch 3 of 5:   0%|          | 0/25 [00:00<?, ?it/s]

  with amp.autocast():


0it [00:00, ?it/s]

  with amp.autocast():


Running Epoch 4 of 5:   0%|          | 0/25 [00:00<?, ?it/s]

  with amp.autocast():


0it [00:00, ?it/s]

  with amp.autocast():


Running Epoch 5 of 5:   0%|          | 0/25 [00:00<?, ?it/s]

  with amp.autocast():


0it [00:00, ?it/s]

  with amp.autocast():


(125,
 defaultdict(list,
             {'global_step': [25, 50, 75, 100, 125],
              'train_loss': [0.5951766967773438,
               0.5443115234375,
               0.5090799331665039,
               0.39609575271606445,
               0.34929609298706055],
              'mcc': [0.0,
               0.0,
               np.float64(0.5097696163531571),
               np.float64(0.5704340097763294),
               np.float64(0.5815924178018351)],
              'accuracy': [0.785, 0.785, 0.8525, 0.855, 0.855],
              'f1_score': [0.43977591036414565,
               0.43977591036414565,
               0.7371205792258424,
               0.7852170048881647,
               0.7904548574731747],
              'tp': [np.int64(314),
               np.int64(314),
               np.int64(303),
               np.int64(285),
               np.int64(282)],
              'tn': [np.int64(0),
               np.int64(0),
               np.int64(38),
               np.int64(57),
             

**Training the RoBERTa Model**

In [None]:
print("Training RoBERTa model...")

# Use the GPU only when one is actually present, instead of assuming CUDA.
roberta_use_cuda = torch.cuda.is_available()

# Start from the shared settings but give RoBERTa its own output directories.
# The shared args point at "outputs/" with overwrite_output_dir=True, so
# reusing them unchanged would overwrite the DistilBERT checkpoints and its
# best-model directory.
roberta_args = {
    **training_args,
    "fp16": bool(training_args.get("fp16")) and roberta_use_cuda,
    "output_dir": "outputs/roberta/",
    "best_model_dir": "outputs/roberta/best_model/",
}

roberta_model = ClassificationModel(
    "roberta",
    "roberta-base",
    num_labels=2,
    args=roberta_args,
    use_cuda=roberta_use_cuda,
)

# Fine-tune on the training split and evaluate on the validation split while
# training; the returned (global_step, training details) is shown as output.
roberta_model.train_model(train_df, eval_df=val_df)

Training RoBERTa model...


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 5:   0%|          | 0/25 [00:00<?, ?it/s]

  with amp.autocast():


0it [00:00, ?it/s]

  with amp.autocast():


Running Epoch 2 of 5:   0%|          | 0/25 [00:00<?, ?it/s]

  with amp.autocast():


0it [00:00, ?it/s]

  with amp.autocast():


Running Epoch 3 of 5:   0%|          | 0/25 [00:00<?, ?it/s]

  with amp.autocast():


0it [00:00, ?it/s]

  with amp.autocast():


Running Epoch 4 of 5:   0%|          | 0/25 [00:00<?, ?it/s]

  with amp.autocast():


0it [00:00, ?it/s]

  with amp.autocast():


Running Epoch 5 of 5:   0%|          | 0/25 [00:00<?, ?it/s]

  with amp.autocast():


0it [00:00, ?it/s]

  with amp.autocast():


(125,
 defaultdict(list,
             {'global_step': [25, 50, 75, 100, 125],
              'train_loss': [0.6189594268798828,
               0.5299205780029297,
               0.29204559326171875,
               0.34792256355285645,
               0.20083045959472656],
              'mcc': [0.0,
               0.0,
               0.0,
               np.float64(0.5685835876620333),
               np.float64(0.6051127010779447)],
              'accuracy': [0.785, 0.785, 0.785, 0.8425, 0.8625],
              'f1_score': [0.43977591036414565,
               0.43977591036414565,
               0.43977591036414565,
               0.7817785051827605,
               0.8020853012351676],
              'tp': [np.int64(314),
               np.int64(314),
               np.int64(314),
               np.int64(274),
               np.int64(283)],
              'tn': [np.int64(0),
               np.int64(0),
               np.int64(0),
               np.int64(63),
               np.int64(62)],
     

# **5. Evaluate on Validation Set**

**Evaluate the DistilBERT model**

In [None]:
# Score the fine-tuned DistilBERT classifier on the validation frame. The
# first returned value is the metrics dict printed below; the other two are
# kept under the names the notebook gives them.
print("Evaluating DistilBERT model...")
(result_bert,
 model_outputs_bert,
 wrong_predictions_bert) = bert_model.eval_model(val_df)

print("DistilBERT Evaluation Results:", result_bert, sep="\n")

Evaluating DistilBERT model...


0it [00:00, ?it/s]

Running Evaluation:   0%|          | 0/7 [00:00<?, ?it/s]

  with amp.autocast():


DistilBERT Evaluation Results:
{'mcc': np.float64(0.5815924178018351), 'accuracy': 0.855, 'f1_score': 0.7904548574731747, 'tp': np.int64(282), 'tn': np.int64(60), 'fp': np.int64(26), 'fn': np.int64(32), 'auroc': np.float64(0.916419789660791), 'auprc': np.float64(0.9755742873028195), 'eval_loss': 0.304013763155256}


**Evaluate the RoBERTa model**

In [None]:
# Score the fine-tuned RoBERTa classifier on the validation frame. The first
# returned value is the metrics dict printed below; the other two are kept
# under the names the notebook gives them.
print("Evaluating RoBERTa model...")
(result_roberta,
 model_outputs_roberta,
 wrong_predictions_roberta) = roberta_model.eval_model(val_df)

print("RoBERTa Evaluation Results:", result_roberta, sep="\n")

Evaluating RoBERTa model...


0it [00:00, ?it/s]

Running Evaluation:   0%|          | 0/7 [00:00<?, ?it/s]

  with amp.autocast():


RoBERTa Evaluation Results:
{'mcc': np.float64(0.6051127010779447), 'accuracy': 0.8625, 'f1_score': 0.8020853012351676, 'tp': np.int64(283), 'tn': np.int64(62), 'fp': np.int64(24), 'fn': np.int64(31), 'auroc': np.float64(0.9234187527773664), 'auprc': np.float64(0.978326274207754), 'eval_loss': 0.28796838436807903}


# **6. Saving the Best Model**

In [None]:
# Pass the model object explicitly: save_model appears to write weights and
# tokenizer files only when `model` is given, otherwise it just creates the
# directory -- NOTE(review): verify against the installed simpletransformers.
# This stores the weights currently in memory (the state at the end of
# training), not necessarily the best-scoring checkpoint.
bert_model.save_model('bert_best_model', model=bert_model.model)

In [None]:
# Pass the model object explicitly: save_model appears to write weights and
# tokenizer files only when `model` is given, otherwise it just creates the
# directory -- NOTE(review): verify against the installed simpletransformers.
# This stores the weights currently in memory (the state at the end of
# training), not necessarily the best-scoring checkpoint.
roberta_model.save_model('roberta_best_model', model=roberta_model.model)

# **7. Prediction on Real-World Input**

In [None]:
# Free-text patient descriptions used as a smoke test for the trained models.
real_world_text = [
    "Patient is a 45-year-old male with total cholesterol 220 mg/dl, hdl cholesterol 40 mg/dl, blood pressure 140/90, smoker, no diabetes, and family history of heart disease. has a desk job and rarely exercises.",
    "Patient is a 30-year-old female with total cholesterol 160 mg/dl, hdl cholesterol 60 mg/dl, blood pressure 110/70, non-smoker, no diabetes, and no family history. exercises regularly and eats a balanced diet.",
    "Patient is a 28-year-old male with total cholesterol 150 mg/dl, hdl cholesterol 55 mg/dl, blood pressure 115/75, non-smoker, no diabetes, and no family history of heart disease. exercises 5 days a week and follows a balanced diet."
]

# Predict the class with DistilBERT.
# The model was trained on text passed through clean_health_text, so the same
# cleaning is applied here; feeding raw text would give the model inputs that
# differ from what it saw during training (casing, punctuation).
predictions_bert, _ = bert_model.predict(
    [clean_health_text(sample) for sample in real_world_text]
)
print(f"DistilBERT Predictions: {predictions_bert} (0 = low-risk, 1 = high-risk)")


0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

DistilBERT Predictions: [1 0 1] (0 = low-risk, 1 = high-risk)


In [None]:
# Predict the class with RoBERTa.
# The model was trained on text passed through clean_health_text, so the same
# cleaning is applied here; feeding raw text would give the model inputs that
# differ from what it saw during training (casing, punctuation).
predictions_roberta, _ = roberta_model.predict(
    [clean_health_text(sample) for sample in real_world_text]
)
print(f"RoBERTa Predictions: {predictions_roberta} (0 = low-risk, 1 = high-risk)")

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

RoBERTa Predictions: [1 0 0] (0 = low-risk, 1 = high-risk)
