In [None]:
import nltk
nltk.download('punkt', download_dir='nltk_data')
nltk.download('stopwords', download_dir='nltk_data')
nltk.data.path.append('nltk_data')  # Add to path if needed


[nltk_data] Downloading package punkt to nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import pandas as pd
import re
import nltk
import gensim
import spacy
import string
import matplotlib.pyplot as plt
import scipy.sparse as sp

from sklearn.feature_extraction.text import TfidfVectorizer
from docx import Document
from collections import Counter
from sklearn.decomposition import LatentDirichletAllocation
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from rake_nltk import Rake
from wordcloud import WordCloud
import torch
from transformers import RobertaForSequenceClassification, RobertaTokenizer


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Importing Data**

In [None]:
nlp = spacy.load('en_core_web_sm')

def read_docx(df_cseg, i):
    doc = df_cseg.Segment[i]
    data = {'Person': [], 'Text': [], 'Time': [],
            'Code': df_cseg['Code'][i].split('>')[-1].strip(),
            'Area': df_cseg['Area'][i],
            'Coverage': df_cseg['Coverage %'][i]}
    for paragraph in doc.split('\n'):
        # Use regular expression to find text after "\t[hh:mm:ss]\t"
        match = re.search(r'\t(\d{2}:\d{2}:\d{2})\t', paragraph)
        if match:
            start_index = match.start()
            end_index = match.end()
            time = match.group(1)
            part_before_time = paragraph[:start_index].strip()
            text_after_time = paragraph[end_index:]
            data['Person'].append(part_before_time)
            data['Text'].append(text_after_time)
            data['Time'].append(time)
    return pd.DataFrame(data)

In [None]:
import os
from tqdm import tqdm

num_coded_segs = []
num_codes = []

folder_path = '/content/drive/MyDrive/Colab Notebooks/NLP coded = parsed = interview Q coded .xlsx & .mx24 files'
df_combo = pd.DataFrame()

file_paths = []

for filename in os.listdir(folder_path):
    full_path = os.path.join(folder_path, filename)
    if os.path.isfile(full_path) and filename.endswith('.xlsx'):
        file_paths.append(folder_path + "/" + filename)

print("Number of files in queue - ", len(file_paths))

for item in tqdm(range(len(file_paths))):

    try:
        file_path = file_paths[item]

        df_csys = pd.read_excel(file_path, sheet_name='Code System').fillna({'Code System': '', 'Unnamed: 1': '', 'Unnamed: 2': '', 'Unnamed: 3': ''})
        df_csys['Code System'] = df_csys['Code System'].astype(str) + df_csys['Unnamed: 1'].astype(str) + df_csys['Unnamed: 2'].astype(str) + df_csys['Unnamed: 3'].astype(str)
        df_csys = df_csys[['Code System', 'Frequency']]

        num_coded_segs.append(df_csys['Frequency'][0])
        df_csys = df_csys.drop(index=0).reset_index(drop=True)
        num_codes.append(len(df_csys))

        df = pd.DataFrame()
        df_cseg = pd.read_excel(file_path, sheet_name='Coded Segments')
        for i in range(len(df_cseg)):
            df = pd.concat([df, read_docx(df_cseg, i)], ignore_index=True)

        df_combo = pd.concat([df_combo, df], ignore_index=True)

    except:
        print("File failed - ", file_path)
        pass

Number of files in queue -  164


 96%|█████████▋| 158/164 [00:26<00:00,  6.09it/s]

File failed -  /content/drive/MyDrive/Colab Notebooks/NLP coded = parsed = interview Q coded .xlsx & .mx24 files/NLP KumarKuldeep.mx24.xlsx


100%|██████████| 164/164 [00:27<00:00,  6.06it/s]


In [None]:
df_combo

Unnamed: 0,Person,Text,Time,Code,Area,Coverage
0,Interviewer,"All right. So, could you start by telling me w...",00:00:03,Question 2-b,287,0.656285
1,Interviewee,"Oh, I am a psychiatrist and I am a professor o...",00:00:06,Question 2-b,287,0.656285
2,Interviewer,How long have you been a psychiatrist?,00:00:19,Question 2-b,287,0.656285
3,Interviewee,"Uh, since 2002.",00:00:22,Question 2-b,287,0.656285
4,Interviewer,Okay. And I didn't get a chance to see where y...,00:00:29,Question 2-d,483,1.104480
...,...,...,...,...,...,...
32276,Interviewee,And some of these conferences that I'm talking...,00:45:25,Question 5-d,1148,2.414809
32277,Interviewer,Hm.,00:45:35,Question 5-d,1148,2.414809
32278,Interviewee,"Internationally when, the journal that I talke...",00:45:36,Question 5-d,1148,2.414809
32279,Interviewer,Hm.,00:45:41,Question 5-d,1148,2.414809


In [None]:
import re

# Define the conceptual codes list
conceptual_codes_list = [
    "Scholarly Positioning and Motivation",
    "National or International Context",
    "Discipline and Knowledge Production",
    "Personal Turning Points",
    "Politics, Ethics, and Morals",
    "Research Impact on the Real World",
    "Research Impact on Academia"
]

# Function to populate NLP codes and Conceptual codes
def split_codes(code):
    if code in conceptual_codes_list:  # If the code is a conceptual code
        return "", code  # NLPCode is blank, ConceptualCode is populated
    else:  # Otherwise, it's an NLP code
        standardized_code = re.sub(r'[-a-zA-Z]+$', '', code)  # Standardize NLP codes
        return standardized_code, ""  # NLPCode populated, ConceptualCode is blank

# Apply the function and split into two columns
df_combo[['NLPCodes', 'ConceptualCodes']] = df_combo['Code'].apply(
    lambda x: pd.Series(split_codes(x))
)


In [None]:
df_combo = df_combo[df_combo['Text'].notna() & df_combo['Text'].str.strip().ne('')]



In [None]:
df_combo.drop(columns=['Code', "Person", "Time", "Area", "Coverage"], inplace=True)




**Final Data for modeling**

In [None]:
df_combo

Unnamed: 0,Text,NLPCodes,ConceptualCodes
0,"All right. So, could you start by telling me w...",Question 2,
1,"Oh, I am a psychiatrist and I am a professor o...",Question 2,
2,How long have you been a psychiatrist?,Question 2,
3,"Uh, since 2002.",Question 2,
4,Okay. And I didn't get a chance to see where y...,Question 2,
...,...,...,...
32276,And some of these conferences that I'm talking...,Question 5,
32277,Hm.,Question 5,
32278,"Internationally when, the journal that I talke...",Question 5,
32279,Hm.,Question 5,


basic data cleaning

In [None]:
filler_words = {"hmm", "hmmm", "Mm, hm.", "Yes.", "Hmm.", "Mm.", "Hm.", "Mhm."}

# Remove rows where the 'Text' column contains only the filler words
df_combo = df_combo[~df_combo['Text'].str.strip().str.lower().isin(filler_words)]

# Reset the index after filtering
df_combo.reset_index(drop=True, inplace=True)

print(df_combo)


                                                    Text    NLPCodes  \
0      All right. So, could you start by telling me w...  Question 2   
1      Oh, I am a psychiatrist and I am a professor o...  Question 2   
2                 How long have you been a psychiatrist?  Question 2   
3                                        Uh, since 2002.  Question 2   
4      Okay. And I didn't get a chance to see where y...  Question 2   
...                                                  ...         ...   
32271  And some of these conferences that I'm talking...  Question 5   
32272                                                Hm.  Question 5   
32273  Internationally when, the journal that I talke...  Question 5   
32274                                                Hm.  Question 5   
32275  So it's not like a US thing. Of course, there ...  Question 5   

      ConceptualCodes  
0                      
1                      
2                      
3                      
4              

In [None]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

**Data Preprocessing**

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from rake_nltk import Rake


# Define custom stopwords
custom_stopwords = set([
    "um", "uh", "like", "yeah", "right", "well", "oh", "nt", "aint", "gonna",
    "gotta", "wanna", "dont", "cant", "wont", "im", "hes", "shes", "theyre",
    "youre", "ive", "didnt", "isnt", "arent", "aint", "hmm", "mm", "uhh", "blah"
])
all_stopwords = set(stopwords.words('english')).union(custom_stopwords)

# Step 1: Clean Text and Apply RAKE
def preprocess_and_extract_keywords(text):
    """
    Preprocess text and extract RAKE keywords:
    - Lowercasing
    - Removing punctuation
    - Removing stopwords (including custom stopwords)
    - Extracting keywords using RAKE
    """
    # Lowercase and clean text
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\d+", "", text)      # Remove numbers

    # Extract RAKE keywords
    rake = Rake(stopwords=all_stopwords)
    rake.extract_keywords_from_text(text)
    keywords = rake.get_ranked_phrases()  # Extract ranked keywords
    keywords_text = " ".join(keywords)    # Combine keywords into a single string

    return keywords_text

# Apply preprocessing and keyword extraction
df = df_combo.copy()  # Make a copy of the DataFrame
df['Preprocessed_Text'] = df['Text'].apply(preprocess_and_extract_keywords)

# Display the processed DataFrame
print(df.head())


                                                Text    NLPCodes  \
0  All right. So, could you start by telling me w...  Question 2   
1  Oh, I am a psychiatrist and I am a professor o...  Question 2   
2             How long have you been a psychiatrist?  Question 2   
3                                    Uh, since 2002.  Question 2   
4  Okay. And I didn't get a chance to see where y...  Question 2   

  ConceptualCodes                         Preprocessed_Text  
0                       telling start field currently could  
1                  psychiatry psychiatrist professor brazil  
2                                         psychiatrist long  
3                                                     since  
4                               see okay located get chance  


**Label Encoding**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Initialize two LabelEncoders
nlp_label_encoder = LabelEncoder()
conceptual_label_encoder = LabelEncoder()

# Handle missing values (if any) by filling NaN with a placeholder
df['NLPCodes'] = df['NLPCodes'].fillna('Unknown')
df['ConceptualCodes'] = df['ConceptualCodes'].fillna('Unknown')

# Encode the NLP Codes column
df['NLPCodes_Encoded'] = nlp_label_encoder.fit_transform(df['NLPCodes'])

# Encode the Conceptual Codes column
df['ConceptualCodes_Encoded'] = conceptual_label_encoder.fit_transform(df['ConceptualCodes'])

# Check the unique encoded values
print("NLP Codes Encoding Mapping:")
print(dict(zip(nlp_label_encoder.classes_, nlp_label_encoder.transform(nlp_label_encoder.classes_))))

print("\nConceptual Codes Encoding Mapping:")
print(dict(zip(conceptual_label_encoder.classes_, conceptual_label_encoder.transform(conceptual_label_encoder.classes_))))



NLP Codes Encoding Mapping:
{'': 0, '02 ': 1, '04 Research ': 2, '05 ': 3, '06 State of the ': 4, '07 Career Mobility and Time ': 5, '10 International Collaboration and ': 6, 'Quesiton 12': 7, 'Quesiton 13': 8, 'Quesiton 2': 9, 'Quesiton 3': 10, 'Quesiton 8': 11, 'Quesiton 9': 12, 'Question  11': 13, 'Question 10': 14, 'Question 11': 15, 'Question 12': 16, 'Question 13': 17, 'Question 2': 18, 'Question 3': 19, 'Question 4': 20, 'Question 5': 21, 'Question 6': 22, 'Question 7': 23, 'Question 8': 24, 'Question 9': 25}

Conceptual Codes Encoding Mapping:
{'': 0, 'Discipline and Knowledge Production': 1, 'National or International Context': 2, 'Personal Turning Points': 3, 'Politics, Ethics, and Morals': 4, 'Research Impact on Academia': 5, 'Research Impact on the Real World': 6, 'Scholarly Positioning and Motivation': 7}


**Tokenization using custom dataset class**

In [None]:
from transformers import RobertaTokenizer, DebertaTokenizer, RobertaForSequenceClassification, DebertaForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
import pandas as pd

# Initialize tokenizers
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
deberta_tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")

# Custom Dataset Class
class TextDataset(Dataset):
    def __init__(self, texts, nlp_labels, conceptual_labels, tokenizer, max_len):
        self.texts = texts
        self.nlp_labels = nlp_labels
        self.conceptual_labels = conceptual_labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        nlp_label = self.nlp_labels[idx]
        conceptual_label = self.conceptual_labels[idx]

        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(nlp_label, dtype=torch.long),  # NLP Codes
            "conceptual_label": torch.tensor(conceptual_label, dtype=torch.long),
            "Preprocessed_Text": text
        }




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

Prepare Datasets

In [None]:
# Split dataset into training, validation, and test sets
train_val_df = df.sample(frac=0.8, random_state=42)  # 80% for training and validation
test_df = df.drop(train_val_df.index)  # Remaining 20% for testing

# Further split train_val_df into training (70%) and validation (30%) sets
train_df = train_val_df.sample(frac=0.7, random_state=42)  # 56% of original dataset
val_df = train_val_df.drop(train_df.index)  # Remaining 24% for validation


# Prepare datasets
train_dataset = TextDataset(
    texts=train_df["Preprocessed_Text"].tolist(),
    nlp_labels=train_df["NLPCodes_Encoded"].tolist(),
    conceptual_labels=train_df["ConceptualCodes_Encoded"].tolist(),
    tokenizer=roberta_tokenizer,
    max_len=128,
)
val_dataset = TextDataset(
    texts=val_df["Preprocessed_Text"].tolist(),
    nlp_labels=val_df["NLPCodes_Encoded"].tolist(),
    conceptual_labels=val_df["ConceptualCodes_Encoded"].tolist(),
    tokenizer=roberta_tokenizer,
    max_len=128,
)
test_dataset = TextDataset(
    texts=test_df["Preprocessed_Text"].tolist(),
    nlp_labels=test_df["NLPCodes_Encoded"].tolist(),
    conceptual_labels=test_df["ConceptualCodes_Encoded"].tolist(),
    tokenizer=roberta_tokenizer,
    max_len=128,
)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)


Custom Roberta Deberta Models

In [None]:
from torch import nn

class CustomRobertaModel(nn.Module):
    def __init__(self, roberta_model, num_nlp_labels, num_conceptual_labels):
        super(CustomRobertaModel, self).__init__()
        self.roberta = roberta_model
        self.classifier_nlp = nn.Linear(roberta_model.config.hidden_size, num_nlp_labels)
        self.classifier_conceptual = nn.Linear(roberta_model.config.hidden_size, num_conceptual_labels)

    def forward(self, input_ids, attention_mask, labels=None, conceptual_labels=None):
        # Get the hidden states from the RoBERTa model
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)

        # Specifically, take the hidden state corresponding to the [CLS] token .
        hidden_state = outputs.hidden_states[-1][:, 0, :] #outputs[0][:, 0, :]

        # Pass the hidden states to the NLP and Conceptual Code classifiers
        logits_nlp = self.classifier_nlp(hidden_state)
        logits_conceptual = self.classifier_conceptual(hidden_state)

        loss = None


        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss_nlp = loss_fct(logits_nlp.view(-1, self.classifier_nlp.out_features), labels.view(-1))

            loss_conceptual = loss_fct(logits_conceptual.view(-1, self.classifier_conceptual.out_features), conceptual_labels.view(-1))
            loss = loss_nlp + loss_conceptual # Combine the losses

        return {"logits_nlp": logits_nlp, "logits_conceptual": logits_conceptual, "loss": loss}

In [None]:
from transformers import RobertaForSequenceClassification, DebertaForSequenceClassification
import torch.optim as optim
from transformers import get_scheduler
from torch.nn import CrossEntropyLoss

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize models
num_nlp_labels = len(df["NLPCodes_Encoded"].unique())
num_conceptual_labels = len(df["ConceptualCodes_Encoded"].unique())

# Initialize RoBERTa model with `output_hidden_states=True`
roberta_model = RobertaForSequenceClassification.from_pretrained("roberta-base", output_hidden_states=True)
custom_model = CustomRobertaModel(roberta_model, num_nlp_labels, num_conceptual_labels).to(device)

#Commenting Deberta for future use
# DeBERTa
#deberta_model = DebertaForSequenceClassification.from_pretrained(
 #   "microsoft/deberta-base", num_labels=num_labels).to(device)


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Fine tuning**

In [None]:
# Optimizer and Scheduler
learning_rate = 2e-5
num_epochs = 5

# RoBERTa Optimizer
roberta_optimizer = optim.AdamW(roberta_model.parameters(), lr=learning_rate)

# DeBERTa Optimizer
#deberta_optimizer = optim.AdamW(deberta_model.parameters(), lr=learning_rate)

# Learning rate scheduler
rscheduler = get_scheduler(
    name="linear",
    optimizer=roberta_optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * num_epochs,
)

#dscheduler = get_scheduler(
 #   name="linear",
  #  optimizer=deberta_optimizer,
  #  num_warmup_steps=0,num_training_steps=len(train_loader) * num_epochs,)


In [None]:
import torch

if torch.cuda.is_available():
    print(f"GPU is available: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU found.")


GPU is available: NVIDIA L4


In [None]:
# Training Loop
def train_model(model, optimizer, train_loader, val_loader, scheduler, num_epochs):
    model.train()
    criterion = CrossEntropyLoss()

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        epoch_loss = 0

        for batch in train_loader:
            optimizer.zero_grad()

            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)  # NLP labels
            conceptual_labels = batch["conceptual_label"].to(device)  # Conceptual labels

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
                conceptual_labels=conceptual_labels
            )

            loss = outputs["loss"]
            loss.backward()

            optimizer.step()
            scheduler.step()

            epoch_loss += loss.item()

        print(f"Training Loss: {epoch_loss / len(train_loader):.4f}")

        # Validation
        evaluate_model(model, val_loader)


# Evaluation Function
def evaluate_model(model, val_loader):
    model.eval()
    correct_nlp = 0
    correct_conceptual = 0
    total = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)  # NLP labels
            conceptual_labels = batch["conceptual_label"].to(device)  # Conceptual labels

            outputs = model(input_ids, attention_mask=attention_mask)

            # NLP Predictions
            predictions_nlp = torch.argmax(outputs["logits_nlp"], dim=1)
            correct_nlp += (predictions_nlp == labels).sum().item()

            # Conceptual Predictions
            predictions_conceptual = torch.argmax(outputs["logits_conceptual"], dim=1)
            correct_conceptual += (predictions_conceptual == conceptual_labels).sum().item()

            total += labels.size(0)

    accuracy_nlp = correct_nlp / total
    accuracy_conceptual = correct_conceptual / total
    print(f"Validation Accuracy - NLP Codes: {accuracy_nlp * 100:.2f}%")
    print(f"Validation Accuracy - Conceptual Codes: {accuracy_conceptual * 100:.2f}%")



**Training RoBERTa and DeBERTa**

In [24]:
# Train RoBERTa
print("Training RoBERTa...")
train_model(
    model=custom_model,
    optimizer=roberta_optimizer,
    train_loader=train_loader,
    val_loader=val_loader,
    scheduler=rscheduler,
    num_epochs=num_epochs,
)

# Train DeBERTa
#print("Training DeBERTa...")
#train_model(
  #  model=deberta_model,
   # optimizer=deberta_optimizer,
   # train_loader=train_loader,
   # val_loader=val_loader,
    #scheduler=dscheduler,
    #num_epochs=num_epochs,)


Training RoBERTa...
Epoch 1/5
Training Loss: 2.6449
Validation Accuracy - NLP Codes: 42.33%
Validation Accuracy - Conceptual Codes: 91.27%
Epoch 2/5
Training Loss: 2.1220
Validation Accuracy - NLP Codes: 45.37%
Validation Accuracy - Conceptual Codes: 91.21%
Epoch 3/5
Training Loss: 1.7533
Validation Accuracy - NLP Codes: 45.84%
Validation Accuracy - Conceptual Codes: 91.22%
Epoch 4/5
Training Loss: 1.3920
Validation Accuracy - NLP Codes: 46.85%
Validation Accuracy - Conceptual Codes: 90.10%
Epoch 5/5
Training Loss: 1.1676
Validation Accuracy - NLP Codes: 47.04%
Validation Accuracy - Conceptual Codes: 89.81%


**Predict Conceptual codes/NLP codes**

In [26]:
from torch.nn.functional import softmax

def predict_conceptual_codes(model, data_loader, label_encoder_nlp, label_encoder_conceptual):
    """
    Predicts NLP codes and corresponding Conceptual codes using the trained model.
    Args:
        model: CustomRobertaModel with two classifiers.
        data_loader: DataLoader for test/validation data.
        label_encoder_nlp: LabelEncoder for NLP codes.
        label_encoder_conceptual: LabelEncoder for Conceptual codes.
    Returns:
        A DataFrame with input text, predicted NLP codes, and predicted Conceptual codes.
    """
    model.eval()
    texts = []
    predicted_nlp_codes = []
    predicted_conceptual_codes = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            # Forward pass through the model
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # NLP Code Predictions
            logits_nlp = outputs["logits_nlp"]
            probabilities_nlp = softmax(logits_nlp, dim=1)
            predicted_nlp_indices = torch.argmax(probabilities_nlp, dim=1).cpu().numpy()
            predicted_nlp_labels = label_encoder_nlp.inverse_transform(predicted_nlp_indices)

            # Conceptual Code Predictions
            logits_conceptual = outputs["logits_conceptual"]
            probabilities_conceptual = softmax(logits_conceptual, dim=1)
            predicted_conceptual_indices = torch.argmax(probabilities_conceptual, dim=1).cpu().numpy()
            predicted_conceptual_labels = label_encoder_conceptual.inverse_transform(predicted_conceptual_indices)

            # Collect results and convert tensors to strings
            texts.extend([t for t in batch["Preprocessed_Text"]])
            predicted_nlp_codes.extend(predicted_nlp_labels)
            predicted_conceptual_codes.extend(predicted_conceptual_labels)

    # Combine results into a DataFrame
    results_df = pd.DataFrame({
        "Text": texts,
        "Predicted NLP Codes": predicted_nlp_codes,
        "Predicted Conceptual Codes": predicted_conceptual_codes
    })

    return results_df

**Results**

In [30]:
results_df = predict_conceptual_codes(
    model=custom_model,
    data_loader=test_loader,
    label_encoder_nlp=nlp_label_encoder,
    label_encoder_conceptual=conceptual_label_encoder
)

In [31]:
results_df

Unnamed: 0,Text,Predicted NLP Codes,Predicted Conceptual Codes
0,whose name medical school anxiety disorders re...,Question 3,
1,,Question 13,
2,one case owner harvard university inaudible st...,Question 3,
3,cognitive behavioral therapy cbt,,
4,one session one session great finding depressi...,,
...,...,...,...
6450,hm,Question 3,
6451,start thinking ok new contribution might need ...,Question 5,
6452,international machine learning conference real...,Question 5,
6453,hm,Question 3,


Calculate Metrics

In [29]:
from sklearn.metrics import f1_score, precision_score, accuracy_score
from torch.nn.functional import softmax

def evaluate_model_metrics(model, data_loader, label_encoder_nlp, label_encoder_conceptual):
    """
    Predicts NLP codes and Conceptual codes and computes evaluation metrics.

    Args:
        model: CustomRobertaModel with two classifiers.
        data_loader: DataLoader for test/validation data.
        label_encoder_nlp: LabelEncoder for NLP codes.
        label_encoder_conceptual: LabelEncoder for Conceptual codes.

    Returns:
        A dictionary containing accuracy, F1-score, and precision for NLP and Conceptual codes.
    """
    model.eval()
    true_nlp_labels = []
    predicted_nlp_labels = []
    true_conceptual_labels = []
    predicted_conceptual_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            # Forward pass through the model
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # NLP Code Predictions
            logits_nlp = outputs["logits_nlp"]
            probabilities_nlp = softmax(logits_nlp, dim=1)
            predicted_nlp_indices = torch.argmax(probabilities_nlp, dim=1).cpu().numpy()
            true_nlp_labels.extend(batch["label"].cpu().numpy())
            predicted_nlp_labels.extend(predicted_nlp_indices)

            # Conceptual Code Predictions
            logits_conceptual = outputs["logits_conceptual"]
            probabilities_conceptual = softmax(logits_conceptual, dim=1)
            predicted_conceptual_indices = torch.argmax(probabilities_conceptual, dim=1).cpu().numpy()
            true_conceptual_labels.extend(batch["conceptual_label"].cpu().numpy())
            predicted_conceptual_labels.extend(predicted_conceptual_indices)

    # Decode labels for metrics computation
    decoded_true_nlp = label_encoder_nlp.inverse_transform(true_nlp_labels)
    decoded_predicted_nlp = label_encoder_nlp.inverse_transform(predicted_nlp_labels)
    decoded_true_conceptual = label_encoder_conceptual.inverse_transform(true_conceptual_labels)
    decoded_predicted_conceptual = label_encoder_conceptual.inverse_transform(predicted_conceptual_labels)

    # Compute Metrics for NLP Codes
    nlp_accuracy = accuracy_score(decoded_true_nlp, decoded_predicted_nlp)
    nlp_f1 = f1_score(decoded_true_nlp, decoded_predicted_nlp, average="weighted")
    nlp_precision = precision_score(decoded_true_nlp, decoded_predicted_nlp, average="weighted")

    # Compute Metrics for Conceptual Codes
    conceptual_accuracy = accuracy_score(decoded_true_conceptual, decoded_predicted_conceptual)
    conceptual_f1 = f1_score(decoded_true_conceptual, decoded_predicted_conceptual, average="weighted")
    conceptual_precision = precision_score(decoded_true_conceptual, decoded_predicted_conceptual, average="weighted")

    return {
        "NLP Codes": {
            "Accuracy": nlp_accuracy,
            "F1-Score": nlp_f1,
            "Precision": nlp_precision,
        },
        "Conceptual Codes": {
            "Accuracy": conceptual_accuracy,
            "F1-Score": conceptual_f1,
            "Precision": conceptual_precision,
        },
    }


In [32]:
metrics = evaluate_model_metrics(
    model=custom_model,
    data_loader=test_loader,
    label_encoder_nlp=nlp_label_encoder,
    label_encoder_conceptual=conceptual_label_encoder
)

print("Evaluation Metrics:")
print("NLP Codes:")
print(metrics["NLP Codes"])
print("Conceptual Codes:")
print(metrics["Conceptual Codes"])


Evaluation Metrics:
NLP Codes:
{'Accuracy': 0.4757552285050349, 'F1-Score': 0.46943313952204724, 'Precision': 0.49051686399310174}
Conceptual Codes:
{'Accuracy': 0.8977536793183578, 'F1-Score': 0.8674249381874762, 'Precision': 0.839207808842279}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
