In [None]:
import nltk
nltk.download('punkt', download_dir='nltk_data')
nltk.download('stopwords', download_dir='nltk_data')
nltk.data.path.append('nltk_data')  # Add to path if needed


[nltk_data] Downloading package punkt to nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import pandas as pd
import re
import nltk
import gensim
import spacy
import string
import matplotlib.pyplot as plt
import scipy.sparse as sp

from sklearn.feature_extraction.text import TfidfVectorizer
from docx import Document
from collections import Counter
from sklearn.decomposition import LatentDirichletAllocation
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from rake_nltk import Rake
from wordcloud import WordCloud
import torch
from transformers import RobertaForSequenceClassification, RobertaTokenizer


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
nlp = spacy.load('en_core_web_sm')

def read_docx(df_cseg, i):
    doc = df_cseg.Segment[i]
    data = {'Person': [], 'Text': [], 'Time': [],
            'Code': df_cseg['Code'][i].split('>')[-1].strip(),
            'Area': df_cseg['Area'][i],
            'Coverage': df_cseg['Coverage %'][i]}
    for paragraph in doc.split('\n'):
        # Use regular expression to find text after "\t[hh:mm:ss]\t"
        match = re.search(r'\t(\d{2}:\d{2}:\d{2})\t', paragraph)
        if match:
            start_index = match.start()
            end_index = match.end()
            time = match.group(1)
            part_before_time = paragraph[:start_index].strip()
            text_after_time = paragraph[end_index:]
            data['Person'].append(part_before_time)
            data['Text'].append(text_after_time)
            data['Time'].append(time)
    return pd.DataFrame(data)

In [None]:
import os
from tqdm import tqdm

num_coded_segs = []
num_codes = []

folder_path = '/content/drive/MyDrive/Colab Notebooks/NLP coded = parsed = interview Q coded .xlsx & .mx24 files'
df_combo = pd.DataFrame()

file_paths = []

for filename in os.listdir(folder_path):
    full_path = os.path.join(folder_path, filename)
    if os.path.isfile(full_path) and filename.endswith('.xlsx'):
        file_paths.append(folder_path + "/" + filename)

print("Number of files in queue - ", len(file_paths))

for item in tqdm(range(len(file_paths))):

    try:
        file_path = file_paths[item]

        df_csys = pd.read_excel(file_path, sheet_name='Code System').fillna({'Code System': '', 'Unnamed: 1': '', 'Unnamed: 2': '', 'Unnamed: 3': ''})
        df_csys['Code System'] = df_csys['Code System'].astype(str) + df_csys['Unnamed: 1'].astype(str) + df_csys['Unnamed: 2'].astype(str) + df_csys['Unnamed: 3'].astype(str)
        df_csys = df_csys[['Code System', 'Frequency']]

        num_coded_segs.append(df_csys['Frequency'][0])
        df_csys = df_csys.drop(index=0).reset_index(drop=True)
        num_codes.append(len(df_csys))

        df = pd.DataFrame()
        df_cseg = pd.read_excel(file_path, sheet_name='Coded Segments')
        for i in range(len(df_cseg)):
            df = pd.concat([df, read_docx(df_cseg, i)], ignore_index=True)

        df_combo = pd.concat([df_combo, df], ignore_index=True)

    except:
        print("File failed - ", file_path)
        pass

Number of files in queue -  153


 97%|█████████▋| 149/153 [00:19<00:00, 10.76it/s]

File failed -  /content/drive/MyDrive/Colab Notebooks/NLP coded = parsed = interview Q coded .xlsx & .mx24 files/NLP KumarKuldeep.mx24.xlsx


100%|██████████| 153/153 [00:20<00:00,  7.61it/s]


In [None]:
def standardize_code(code):
    # Use regex to remove the alphabetic suffix, e.g., '11a' -> '11'
    return re.sub(r'[-a-zA-Z]+$', '', code)

# Apply the standardization to the 'Code' column in df_combo
df_combo['Code'] = df_combo['Code'].apply(standardize_code)

# Display the updated DataFrame
print("Updated df_combo:")
print(df_combo.head())

Updated df_combo:
        Person                                               Text      Time  \
0  Interviewer  Okay. So I'd like to begin by asking a few que...  00:00:25   
1  Interviewee  I know I have always wanted to be a teacher. S...  00:00:39   
2  Interviewer  Hm. I see. Well, uh, this is a little bit out ...  00:01:51   
3  Interviewee                                My peers? My peers?  00:02:05   
4  Interviewer                                              Yeah.  00:02:09   

         Code  Area  Coverage  
0  Question 3  1084  2.276259  
1  Question 3  1084  2.276259  
2  Quesiton 3   727  1.526605  
3  Quesiton 3   727  1.526605  
4  Quesiton 3   727  1.526605  


In [None]:
df_combo

Unnamed: 0,Person,Text,Time,Code,Area,Coverage
0,Interviewer,Okay. So I'd like to begin by asking a few que...,00:00:25,Question 3,1084,2.276259
1,Interviewee,I know I have always wanted to be a teacher. S...,00:00:39,Question 3,1084,2.276259
2,Interviewer,"Hm. I see. Well, uh, this is a little bit out ...",00:01:51,Quesiton 3,727,1.526605
3,Interviewee,My peers? My peers?,00:02:05,Quesiton 3,727,1.526605
4,Interviewer,Yeah.,00:02:09,Quesiton 3,727,1.526605
...,...,...,...,...,...,...
27204,Interviewer,Mm. How should scientists behave in relation t...,00:57:32,Question 13,2828,5.740966
27205,Interviewee,"Well, I think that over time, science has real...",00:57:41,Question 13,2828,5.740966
27206,Interviewer,"Hm. Yeah, that's something I want to ask you t...",00:58:53,Question 13,2828,5.740966
27207,Interviewee,"Well, first of all, we shall consider that emp...",00:59:13,Question 13,2828,5.740966


In [None]:
filler_words = {"hmm", "hmmm", "Mm, hm.", "Yes.", "Hmm.", "Mm.", "Hm.", "Mhm."}

# Remove rows where the 'Text' column contains only the filler words
df_combo = df_combo[~df_combo['Text'].str.strip().str.lower().isin(filler_words)]

# Reset the index after filtering
df_combo.reset_index(drop=True, inplace=True)

print(df_combo)


            Person                                               Text  \
0      Interviewer  Okay. So I'd like to begin by asking a few que...   
1      Interviewee  I know I have always wanted to be a teacher. S...   
2      Interviewer  Hm. I see. Well, uh, this is a little bit out ...   
3      Interviewee                                My peers? My peers?   
4      Interviewer                                              Yeah.   
...            ...                                                ...   
27199  Interviewer  Mm. How should scientists behave in relation t...   
27200  Interviewee  Well, I think that over time, science has real...   
27201  Interviewer  Hm. Yeah, that's something I want to ask you t...   
27202  Interviewee  Well, first of all, we shall consider that emp...   
27203  Interviewer  Uh just to show you an example, this film that...   

           Time         Code  Area  Coverage  
0      00:00:25   Question 3  1084  2.276259  
1      00:00:39   Question 3 

In [None]:
df_combo.to_csv("df_combo.csv", index=False)


In [None]:
df_combo = df_combo[['Code','Text']]

In [None]:
df_combo

Unnamed: 0,Code,Text
0,Question 3,Okay. So I'd like to begin by asking a few que...
1,Question 3,I know I have always wanted to be a teacher. S...
2,Quesiton 3,"Hm. I see. Well, uh, this is a little bit out ..."
3,Quesiton 3,My peers? My peers?
4,Quesiton 3,Yeah.
...,...,...
27199,Question 13,Mm. How should scientists behave in relation t...
27200,Question 13,"Well, I think that over time, science has real..."
27201,Question 13,"Hm. Yeah, that's something I want to ask you t..."
27202,Question 13,"Well, first of all, we shall consider that emp..."




In [None]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Preprocessing

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from rake_nltk import Rake


# Define custom stopwords
custom_stopwords = set([
    "um", "uh", "like", "yeah", "right", "well", "oh", "nt", "aint", "gonna",
    "gotta", "wanna", "dont", "cant", "wont", "im", "hes", "shes", "theyre",
    "youre", "ive", "didnt", "isnt", "arent", "aint", "hmm", "mm", "uhh", "blah"
])
all_stopwords = set(stopwords.words('english')).union(custom_stopwords)

# Step 1: Clean Text and Apply RAKE
def preprocess_and_extract_keywords(text):
    """
    Preprocess text and extract RAKE keywords:
    - Lowercasing
    - Removing punctuation
    - Removing stopwords (including custom stopwords)
    - Extracting keywords using RAKE
    """
    # Lowercase and clean text
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\d+", "", text)      # Remove numbers

    # Extract RAKE keywords
    rake = Rake(stopwords=all_stopwords)
    rake.extract_keywords_from_text(text)
    keywords = rake.get_ranked_phrases()  # Extract ranked keywords
    keywords_text = " ".join(keywords)    # Combine keywords into a single string

    return keywords_text

# Apply preprocessing and keyword extraction
df = df_combo.copy()  # Make a copy of the DataFrame
df['Preprocessed_Text'] = df['Text'].apply(preprocess_and_extract_keywords)

# Display the processed DataFrame
print(df.head())


         Code                                               Text  \
0  Question 3  Okay. So I'd like to begin by asking a few que...   
1  Question 3  I know I have always wanted to be a teacher. S...   
2  Quesiton 3  Hm. I see. Well, uh, this is a little bit out ...   
3  Quesiton 3                                My peers? My peers?   
4  Quesiton 3                                              Yeah.   

                                   Preprocessed_Text  
0  first place become interested research researc...  
1  inaudible new track job little bit hard middle...  
2  little bit current stage also look want topic ...  
3                                        peers peers  
4                                                     


label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode labels (Code column)
label_encoder = LabelEncoder()
df['Code_Encoded'] = label_encoder.fit_transform(df['Code'])

# Check the unique labels and their encoded values
print(label_encoder.classes_)  # This shows the mapping of labels to integers


['02 ' '04 Research ' '05 ' '06 State of the '
 '07 Career Mobility and Time ' '10 International Collaboration and '
 'Quesiton 12' 'Quesiton 13' 'Quesiton 2' 'Quesiton 3' 'Quesiton 8'
 'Quesiton 9' 'Question  11' 'Question 10' 'Question 11' 'Question 12'
 'Question 13' 'Question 2' 'Question 3' 'Question 4' 'Question 5'
 'Question 6' 'Question 7' 'Question 8' 'Question 9']


In [None]:
df

Unnamed: 0,Code,Text,Preprocessed_Text,Code_Encoded
0,Question 3,Okay. So I'd like to begin by asking a few que...,first place become interested research researc...,18
1,Question 3,I know I have always wanted to be a teacher. S...,inaudible new track job little bit hard middle...,18
2,Quesiton 3,"Hm. I see. Well, uh, this is a little bit out ...",little bit current stage also look want topic ...,9
3,Quesiton 3,My peers? My peers?,peers peers,9
4,Quesiton 3,Yeah.,,9
...,...,...,...,...
27199,Question 13,Mm. How should scientists behave in relation t...,scientists behave oppressive governments relat...,16
27200,Question 13,"Well, I think that over time, science has real...",differences different branches time science ri...,16
27201,Question 13,"Hm. Yeah, that's something I want to ask you t...",thats something little bit little bit knowledg...,16
27202,Question 13,"Well, first of all, we shall consider that emp...",human perspective therefore traditionally prot...,16


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


Split into train test and val

In [None]:
from transformers import RobertaTokenizer, DebertaTokenizer, RobertaForSequenceClassification, DebertaForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
import pandas as pd

# Initialize tokenizers
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
deberta_tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")

# Custom Dataset Class
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

# Split dataset into training, validation, and test sets
train_val_df = df.sample(frac=0.8, random_state=42)  # 80% for training and validation
test_df = df.drop(train_val_df.index)  # Remaining 20% for testing

# Further split train_val_df into training (70%) and validation (30%) sets
train_df = train_val_df.sample(frac=0.7, random_state=42)  # 56% of original dataset
val_df = train_val_df.drop(train_df.index)  # Remaining 24% for validation


# Prepare datasets
train_dataset = TextDataset(
    texts=train_df["Preprocessed_Text"].tolist(),
    labels=train_df["Code_Encoded"].tolist(),
    tokenizer=roberta_tokenizer,  # Change to deberta_tokenizer for DeBERTa
    max_len=128,
)
val_dataset = TextDataset(
    texts=val_df["Preprocessed_Text"].tolist(),
    labels=val_df["Code_Encoded"].tolist(),
    tokenizer=roberta_tokenizer,  # Change to deberta_tokenizer for DeBERTa
    max_len=128,
)



# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

In [None]:
test_dataset = TextDataset(
    texts=test_df["Text"].tolist(),
    labels=test_df["Code_Encoded"].tolist(),
    tokenizer=roberta_tokenizer,  # Use RoBERTa or DeBERTa tokenizer as needed
    max_len=128,
)
test_loader = DataLoader(test_dataset, batch_size=8)

Building Models


In [None]:
from transformers import RobertaForSequenceClassification, DebertaForSequenceClassification
import torch.optim as optim
from transformers import get_scheduler
from torch.nn import CrossEntropyLoss

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize models
num_labels = len(df["Code_Encoded"].unique())  # Number of unique codes

# RoBERTa
roberta_model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base", num_labels=num_labels
).to(device)

# DeBERTa
deberta_model = DebertaForSequenceClassification.from_pretrained(
    "microsoft/deberta-base", num_labels=num_labels
).to(device)


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fine tuning

In [None]:
# Optimizer and Scheduler
learning_rate = 2e-5
num_epochs = 5

# RoBERTa Optimizer
roberta_optimizer = optim.AdamW(roberta_model.parameters(), lr=learning_rate)

# DeBERTa Optimizer
deberta_optimizer = optim.AdamW(deberta_model.parameters(), lr=learning_rate)

# Learning rate scheduler
rscheduler = get_scheduler(
    name="linear",
    optimizer=roberta_optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * num_epochs,
)

dscheduler = get_scheduler(
    name="linear",
    optimizer=deberta_optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * num_epochs,
)


In [None]:
import torch

if torch.cuda.is_available():
    print(f"GPU is available: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU found.")


GPU is available: NVIDIA L4


In [None]:
# Training Loop
def train_model(model, optimizer, train_loader, val_loader, scheduler, num_epochs):
    model.train()
    criterion = CrossEntropyLoss()

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        epoch_loss = 0

        for batch in train_loader:
            optimizer.zero_grad()

            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            scheduler.step()

            epoch_loss += loss.item()

        print(f"Training Loss: {epoch_loss / len(train_loader):.4f}")

        # Validation
        evaluate_model(model, val_loader)

# Evaluation Function
def evaluate_model(model, val_loader):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    print(f"Validation Accuracy: {accuracy * 100:.2f}%")


#### training RoBERTa and DeBERTa

In [None]:
# Train RoBERTa
print("Training RoBERTa...")
train_model(
    model=roberta_model,
    optimizer=roberta_optimizer,
    train_loader=train_loader,
    val_loader=val_loader,
    scheduler=rscheduler,
    num_epochs=num_epochs,
)

# Train DeBERTa
print("Training DeBERTa...")
train_model(
    model=deberta_model,
    optimizer=deberta_optimizer,
    train_loader=train_loader,
    val_loader=val_loader,
    scheduler=dscheduler,
    num_epochs=num_epochs,
)


Training RoBERTa...
Epoch 1/5
Training Loss: 2.0760
Validation Accuracy: 48.55%
Epoch 2/5
Training Loss: 1.6024
Validation Accuracy: 49.67%
Epoch 3/5
Training Loss: 1.3076
Validation Accuracy: 50.83%
Epoch 4/5
Training Loss: 1.0083
Validation Accuracy: 50.42%
Epoch 5/5
Training Loss: 0.8108
Validation Accuracy: 50.28%
Training DeBERTa...
Epoch 1/5
Training Loss: 2.2367
Validation Accuracy: 45.63%
Epoch 2/5
Training Loss: 1.6458
Validation Accuracy: 50.04%
Epoch 3/5
Training Loss: 1.3288
Validation Accuracy: 50.67%
Epoch 4/5
Training Loss: 1.0115
Validation Accuracy: 51.17%
Epoch 5/5
Training Loss: 0.8010
Validation Accuracy: 51.25%


#### Prediction and Top-3 Extraction

In [None]:
def get_top_3_predictions(model, data_loader, label_encoder):
    model.eval()
    top_3_predictions = []
    actual_labels = []
    probabilities_list = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
            top_3 = torch.topk(probabilities, k=3, dim=1)

            top_3_predictions.extend(top_3.indices.cpu().numpy())
            probabilities_list.extend(top_3.values.cpu().numpy())
            actual_labels.extend(labels.cpu().numpy())

    # Decode top 3 predictions
    decoded_predictions = [
        label_encoder.inverse_transform(predictions) for predictions in top_3_predictions
    ]

    return decoded_predictions, actual_labels, probabilities_list


# Get predictions for RoBERTa
roberta_predictions, roberta_actual_labels, roberta_probabilities = get_top_3_predictions(
    roberta_model, test_loader, label_encoder
)

# Get predictions for DeBERTa
deberta_predictions, deberta_actual_labels, deberta_probabilities = get_top_3_predictions(
    deberta_model, test_loader, label_encoder
)


Getting predictions for RoBERTa...
Getting predictions for DeBERTa...


In [None]:
# Combine predictions into a DataFrame
roberta_results_df = pd.DataFrame({
    "Text": test_df["Preprocessed_Text"].tolist(),  # Get text values as a list
    "Actual Code": label_encoder.inverse_transform(roberta_actual_labels),
    "Top-3 Predicted Codes": roberta_predictions,
    "Top-3 Probabilities": [list(map(float, probs)) for probs in roberta_probabilities]
})

deberta_results_df = pd.DataFrame({
    "Text": test_df["Preprocessed_Text"].tolist(),
    "Actual Code": label_encoder.inverse_transform(deberta_actual_labels),
    "Top-3 Predicted Codes": deberta_predictions,
    "Top-3 Probabilities": [list(map(float, probs)) for probs in deberta_probabilities]
})

# Display the results
print("RoBERTa Results")
print(roberta_results_df.head())

print("\nDeBERTa Results")
print(deberta_results_df.head())

RoBERTa Results
                                                Text   Actual Code  \
0  atypical postdoc experience maybe someday some...  Question 3-c   
1  current point want research okay maybe future ...  Question 3-d   
2  always want wants mentors always try gain enou...  Question 3-d   
3  field something super super complicated introd...  Question 4-a   
4  goal doe already guess everything doe doe gues...  Question 4-d   

                         Top-3 Predicted Codes  \
0   [Question 6-c, Question 3-d, Question 4-a]   
1   [Question 3-d, Question 6-c, Question 4-b]   
2   [Question 7-a, Question 7-b, Question 3-a]   
3   [Question 4-a, Question 4-b, Question 5-a]   
4  [Question 4-d, Question 3-d, Question 13-b]   

                                 Top-3 Probabilities  
0  [0.19936630129814148, 0.12929052114486694, 0.1...  
1  [0.9162039160728455, 0.011854534968733788, 0.0...  
2  [0.624843418598175, 0.056328725069761276, 0.05...  
3  [0.8188257813453674, 0.0650848746299743

In [None]:
# Export RoBERTa results to a CSV file
roberta_results_df.to_csv("roberta_predictions.csv", index=False)
print("RoBERTa predictions exported to 'roberta_predictions.csv'.")

# Export DeBERTa results to a CSV file
deberta_results_df.to_csv("deberta_predictions.csv", index=False)
print("DeBERTa predictions exported to 'deberta_predictions.csv'.")


RoBERTa predictions exported to 'roberta_predictions.csv'.
DeBERTa predictions exported to 'deberta_predictions.csv'.
