# Response_5: LLM-based prediction

### Table of Contents
- [1. Load Data](#1-load-data)  

- [2. Help function](#2-help-function)  

- [3. Preprocessing data](#3-preprocessing-data)

- [4. Word embedding](#3-word-embedding)

- [5. Latent structure transforming](#3-latent-structure-transforming)

- [6. Classification on the transformed latent structure](#3-classification-on-the-transformed-latent-structure)


In [116]:
import pandas as pd
import numpy as np

from tqdm import tqdm

In [117]:
from transformers import BertTokenizer, BertModel, TrainingArguments, Trainer

from datasets import Dataset

import torch

import torch.nn as nn
import torch.optim as optim

In [118]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### 1. Load Data

In [119]:
df = pd.read_csv('datasets/feature_extraction.csv')

In [120]:
df.head()

Unnamed: 0.1,Unnamed: 0,response,score,response_length,grammer_error,response_corrected,readability,type_token_ratio,mean_sentence_length,coherence,sentiment,gpt_scores,Relevance,Completeness,Depth of Reflection,Response Quality,Linguistic Quality
0,0,"During my journey, I faced a big health proble...",3,110,0,"During my journey, I faced a big health proble...",78.28,0.647541,20.333333,0.487087,0.999572,"{'Relevance': 'Yes', 'Completeness': 2, 'Depth...",Yes,2,Moderate,No,No
1,1,i once had a big problem when i wasn't ready f...,3,70,8,I once had a big problem when I wasn't ready f...,91.11,0.765432,16.2,0.321876,-0.986382,"{'Relevance': 'Yes', 'Completeness': 2, 'Depth...",Yes,2,Moderate,No,No
2,2,During my junior year I faced a significant ch...,3,135,2,During my junior year I faced a significant ch...,-65.56,0.678832,137.0,0.0,0.995565,"{'Relevance': 'Yes', 'Completeness': 3, 'Depth...",Yes,3,Moderate,Yes,No
3,3,"Once, I encountered a situation that was quite...",2,109,0,"Once, I encountered a situation that was quite...",47.18,0.684211,19.0,0.308484,0.99491,"{'Relevance': 'Yes', 'Completeness': 2, 'Depth...",Yes,2,Moderate,No,No
4,4,"during a trip, i suffered a serious accident t...",3,53,6,"During a trip, I suffered a serious accident t...",49.52,0.721311,15.25,0.434561,-0.989042,"{'Relevance': 'Yes', 'Completeness': 2, 'Depth...",Yes,2,Moderate,No,No


### 2. Method 1

**Method 1: BERT-Based Text Classification**  

1. Utilize a **pretrained BERT model** to generate embeddings for the response text, capturing contextual and semantic information.  
2. Pass the embeddings through a **four-layer neural network** for classification, mapping the text representations to their corresponding scores.  
3. Train the model using a supervised learning approach, optimizing for classification performance.  

In [121]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')



In [122]:
def tokenize_text(text, tokenizer):
    # Tokenize the input text and get the input ids and attention masks
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    return inputs

In [123]:
def get_bert_embeddings(text, tokenizer, model):
    """
    This function generates the numerical representation of peice of text, which captures its meaning, using the BERT model
    """
    inputs = tokenize_text(text, tokenizer)
    
    # Pass the tokenized input through BERT
    with torch.no_grad():  # Disable gradient calculation to save memory
        outputs = model(**inputs)
        
    # We use the embeddings from the [CLS] token (first token) as the document representation
    embeddings = outputs.last_hidden_state[:, 0, :].squeeze()
    
    return embeddings.numpy()

In [125]:
embeddings = []
for response in tqdm(df['response_corrected'], desc="Embedding the response"):
    embeddings.append(get_bert_embeddings(response, tokenizer, bert_model))

df['embeddings'] = embeddings


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

In [126]:
X = np.array(df['embeddings'].tolist())  # Convert list of arrays into a 2D array
y = np.array(df['score'])

# Split data into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

In [127]:
X_train.shape

(800, 768)

In [128]:
class ScorePredictor(nn.Module):
    def __init__(self, input_dim, num_classes=5):
        super(ScorePredictor, self).__init__()
        self.fc1 = nn.Linear(input_dim, 384)
        self.fc2 = nn.Linear(384, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, num_classes)  # Output 5 logits (no activation here)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = self.fc4(x)  # Raw logits for classification
        return x

# Initialize the model
input_dim = X_train.shape[1]  
model = ScorePredictor(input_dim)
    

In [129]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)

In [130]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)

# Define loss function and optimizer
criterion = nn.MSELoss()  # Mean Squared Error loss for regression task
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 10000
for epoch in range(epochs):
    model.train()  # Set the model to training mode
    
    # Zero the gradients
    optimizer.zero_grad()

    # Forward pass
    outputs = model(X_train_tensor)
    
    # Compute the loss
    loss = criterion(outputs, y_train_tensor)  # Targets should be integers (0-4)
    
    # Backward pass (compute gradients)
    loss.backward()

    # Update weights
    optimizer.step()

    # Print loss every 10 epochs
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.8f}')

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/10000], Loss: 1.74980521
Epoch [20/10000], Loss: 1.62838209
Epoch [30/10000], Loss: 1.06118965
Epoch [40/10000], Loss: 0.83923447
Epoch [50/10000], Loss: 0.70706666
Epoch [60/10000], Loss: 0.64164537
Epoch [70/10000], Loss: 0.60364807
Epoch [80/10000], Loss: 0.56753349
Epoch [90/10000], Loss: 0.53432864
Epoch [100/10000], Loss: 0.50285804
Epoch [110/10000], Loss: 0.47485527
Epoch [120/10000], Loss: 0.44956717
Epoch [130/10000], Loss: 0.42660120
Epoch [140/10000], Loss: 0.40567324
Epoch [150/10000], Loss: 0.38630584
Epoch [160/10000], Loss: 0.36809438
Epoch [170/10000], Loss: 0.35099956
Epoch [180/10000], Loss: 0.33530530
Epoch [190/10000], Loss: 0.32066971
Epoch [200/10000], Loss: 0.30683622
Epoch [210/10000], Loss: 0.29398561
Epoch [220/10000], Loss: 0.28197607
Epoch [230/10000], Loss: 0.27023998
Epoch [240/10000], Loss: 0.25906238
Epoch [250/10000], Loss: 0.24851002
Epoch [260/10000], Loss: 0.23828527
Epoch [270/10000], Loss: 0.22908299
Epoch [280/10000], Loss: 0.22060549
E

In [131]:
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

In [132]:
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    # Forward pass to get logits
    logits = model(X_test_tensor)

    # Convert logits to probabilities using softmax
    probabilities = torch.softmax(logits, dim=1)  # Apply softmax across the classes (dim=1)

    # Get predicted labels by finding the index of the max probability
    predicted_labels = torch.argmax(probabilities, dim=1)  # Predicted class indices

    # Optionally, you can also get the class probabilities for the predicted labels
    predicted_probabilities = probabilities.gather(1, predicted_labels.view(-1, 1))


In [133]:
predicted_labels

tensor([1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1,
        0, 1, 1, 1, 1, 0, 1, 1, 0, 4, 0, 0, 4, 1, 1, 1, 1, 0, 0, 2, 1, 0, 0, 1,
        1, 4, 1, 0, 1, 1, 1, 1, 1, 1, 1, 4, 0, 1, 4, 1, 1, 4, 1, 4, 1, 0, 4, 1,
        1, 0, 1, 1, 1, 4, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
        4, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 0, 1, 1, 1, 0, 4, 0, 2, 0, 1, 1, 1, 1,
        4, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 2, 1, 1, 1, 0, 1, 1, 1, 4, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
        1, 1, 1, 0, 1, 1, 1, 1])

In [134]:
pd.DataFrame(probabilities.numpy())

Unnamed: 0,0,1,2,3,4
0,0.200570,0.200594,0.200214,0.199589,0.199033
1,0.201040,0.200768,0.199899,0.198942,0.199351
2,0.200288,0.200370,0.199775,0.199260,0.200306
3,0.199981,0.200982,0.199938,0.199331,0.199768
4,0.199896,0.200698,0.199727,0.199623,0.200055
...,...,...,...,...,...
195,0.200890,0.200384,0.200131,0.199308,0.199288
196,0.200054,0.201394,0.199923,0.198954,0.199675
197,0.199214,0.201213,0.199721,0.199541,0.200311
198,0.199994,0.200424,0.200221,0.199693,0.199668


In [135]:
y_test

array([2, 4, 4, 2, 4, 4, 3, 3, 4, 3, 2, 5, 4, 4, 4, 4, 5, 5, 3, 3, 3, 5,
       2, 4, 3, 4, 3, 4, 4, 3, 3, 3, 2, 3, 3, 3, 3, 5, 3, 4, 4, 3, 3, 3,
       4, 2, 2, 3, 3, 4, 3, 5, 2, 5, 5, 5, 5, 5, 3, 4, 3, 3, 4, 4, 2, 5,
       3, 3, 4, 3, 5, 5, 5, 2, 2, 4, 3, 3, 5, 2, 2, 5, 5, 4, 3, 4, 5, 5,
       3, 2, 2, 3, 5, 3, 5, 5, 3, 2, 2, 2, 2, 3, 4, 3, 5, 2, 1, 2, 3, 4,
       3, 5, 3, 3, 2, 3, 5, 5, 4, 4, 2, 3, 3, 4, 4, 4, 5, 3, 2, 4, 3, 2,
       5, 3, 1, 2, 4, 3, 2, 4, 3, 3, 3, 3, 3, 3, 3, 3, 5, 4, 2, 3, 4, 3,
       2, 4, 2, 3, 4, 2, 3, 3, 2, 4, 2, 4, 3, 4, 3, 5, 3, 2, 2, 4, 3, 5,
       4, 3, 3, 4, 4, 4, 3, 2, 3, 2, 3, 3, 5, 5, 3, 5, 3, 4, 3, 1, 4, 5,
       4, 3])

In [136]:
accuracy_score(predicted_labels, y_test)

0.035

### 3. Method 2

**Method 2: Fine-Tuning BERT for Text Classification**  

1. **Initialize a pretrained BERT model** using a sequence classification head to directly map text responses to their corresponding scores.  
2. **Apply Low-Rank Adaptation (LoRA)** to efficiently fine-tune the model, reducing the number of trainable parameters while preserving BERT’s generalization capabilities.  
3. **Train the model** on labeled response data, leveraging fine-tuning to optimize classification performance while maintaining computational efficiency.  

#### 3.1 Building the datasets for huggingface trainer object

In [137]:
ft_df = df[['response_corrected', 'score']]
ft_df.head()

Unnamed: 0,response_corrected,score
0,"During my journey, I faced a big health proble...",3
1,I once had a big problem when I wasn't ready f...,3
2,During my junior year I faced a significant ch...,3
3,"Once, I encountered a situation that was quite...",2
4,"During a trip, I suffered a serious accident t...",3


In [138]:
ft_df['score'] = ft_df['score']-1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ft_df['score'] = ft_df['score']-1


In [140]:
dataset = Dataset.from_pandas(ft_df)
dataset

Dataset({
    features: ['response_corrected', 'score'],
    num_rows: 1000
})

In [141]:
def tokenize_function_response(examples):
    return tokenizer(
        examples['response_corrected'],
        padding='max_length',
        truncation=True
        )

In [142]:
tokenized_dataset = dataset.map(tokenize_function_response, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [143]:
tokenized_dataset

Dataset({
    features: ['response_corrected', 'score', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1000
})

In [144]:
tokenized_dataset = tokenized_dataset.rename_column('score', 'labels')

In [145]:
train_dataset = tokenized_dataset.shuffle(seed=21).select(range(int(0.8 * len(tokenized_dataset))))
test_dataset = tokenized_dataset.shuffle(seed=21).select(range(int(0.8 * len(tokenized_dataset)), len(tokenized_dataset)))

In [146]:
train_dataset

Dataset({
    features: ['response_corrected', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 800
})

In [147]:
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [148]:
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

#### 3.2 Loading the BERT model (sequenceclassifier)

In [149]:
from peft import LoraConfig, get_peft_model, TaskType 

In [150]:
from transformers import AutoModelForSequenceClassification
model_path = "google-bert/bert-base-uncased"
model_bert_classifier = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=5 # because from 1 to 5, we have 5 categories.
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### 3.3 Setup a lora configure

In [151]:
lora_config = LoraConfig(
    r=8,               # Low-rank dimension
    lora_alpha=32,      # Scaling factor
    lora_dropout=0.1,   # Dropout rate
    target_modules=["query", "key"],  # Target attention weights
    bias="none",        # Bias type ("none", "all", or "lora_only")
    task_type="SEQ_CLS" # Task type (e.g., "SEQ_CLS" for sequence classification)
)

lora_model = get_peft_model(model_bert_classifier, lora_config)

In [152]:
lora_model.print_trainable_parameters()

trainable params: 298,757 || all params: 109,784,842 || trainable%: 0.2721


In [153]:
training_args_lora_ft = TrainingArguments(
    output_dir="lora-fine-tuned-bert",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-4,  # Slightly higher learning rate for LoRA
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    #save_total_limit=2,
    load_best_model_at_end=True,
    #report_to="none",
    metric_for_best_model="accuracy",
)

In [154]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [155]:
trainer_lora_ft = Trainer(
    model=lora_model,
    args=training_args_lora_ft,
    train_dataset=train_dataset,              # Training dataset
    eval_dataset=test_dataset,                # Evaluation dataset
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  
)

In [156]:
trainer_lora_ft.train()

  0%|          | 0/150 [00:00<?, ?it/s]

{'loss': 1.6379, 'grad_norm': 3.951148271560669, 'learning_rate': 0.0001866666666666667, 'epoch': 0.2}
{'loss': 1.4587, 'grad_norm': 3.3521759510040283, 'learning_rate': 0.00017333333333333334, 'epoch': 0.4}
{'loss': 1.3868, 'grad_norm': 3.232008218765259, 'learning_rate': 0.00016, 'epoch': 0.6}
{'loss': 1.4262, 'grad_norm': 7.572861671447754, 'learning_rate': 0.00014666666666666666, 'epoch': 0.8}
{'loss': 1.3526, 'grad_norm': 2.1482417583465576, 'learning_rate': 0.00013333333333333334, 'epoch': 1.0}


  0%|          | 0/13 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.4103049039840698, 'eval_accuracy': 0.33, 'eval_f1': 0.1637593984962406, 'eval_precision': 0.10890000000000001, 'eval_recall': 0.33, 'eval_runtime': 245.8323, 'eval_samples_per_second': 0.814, 'eval_steps_per_second': 0.053, 'epoch': 1.0}
{'loss': 1.3892, 'grad_norm': 3.005897283554077, 'learning_rate': 0.00012, 'epoch': 1.2}
{'loss': 1.3955, 'grad_norm': 4.158362865447998, 'learning_rate': 0.00010666666666666667, 'epoch': 1.4}
{'loss': 1.3298, 'grad_norm': 3.062124490737915, 'learning_rate': 9.333333333333334e-05, 'epoch': 1.6}
{'loss': 1.4138, 'grad_norm': 5.912414073944092, 'learning_rate': 8e-05, 'epoch': 1.8}
{'loss': 1.3497, 'grad_norm': 2.7183940410614014, 'learning_rate': 6.666666666666667e-05, 'epoch': 2.0}


  0%|          | 0/13 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.3777624368667603, 'eval_accuracy': 0.33, 'eval_f1': 0.1637593984962406, 'eval_precision': 0.10890000000000001, 'eval_recall': 0.33, 'eval_runtime': 3518.8656, 'eval_samples_per_second': 0.057, 'eval_steps_per_second': 0.004, 'epoch': 2.0}


KeyboardInterrupt: 

In [157]:
# Predict on the test dataset
predictions = trainer_lora_ft.predict(test_dataset)

# The predictions will contain the predicted labels, logits, and labels
predicted_labels = predictions.predictions.argmax(-1)

# If you want to see the predictions alongside the actual labels
actual_labels = predictions.label_ids

# You can print the first few predictions and the corresponding true labels
for i in range(10):
    print(f"Prediction: {predicted_labels[i]}, Actual: {actual_labels[i]}")


  0%|          | 0/13 [00:00<?, ?it/s]

Prediction: 2, Actual: 4
Prediction: 2, Actual: 1
Prediction: 2, Actual: 3
Prediction: 2, Actual: 4
Prediction: 2, Actual: 2
Prediction: 2, Actual: 3
Prediction: 2, Actual: 3
Prediction: 2, Actual: 2
Prediction: 2, Actual: 4
Prediction: 2, Actual: 3


  _warn_prf(average, modifier, msg_start, len(result))
