In [None]:
import sys
if 'google.colab' in sys.modules:  # Google Colab environment
    # Install the packages this notebook relies on (IPython shell escape; only transformers is version-pinned)
    !pip install datasets transformers==4.37.2 evaluate accelerate optimum auto-gptq

    # Mount Google Drive so that data files stored there can be read
    from google.colab import drive
    drive.mount('/content/drive')

# Location of the input CSV inside the mounted Google Drive; adjust it to where your copy lives.
# NOTE(review): this is assigned even outside Colab, where /content/drive is presumably not mounted — confirm.
file_path = '/content/drive/My Drive/VSM_BRIMS_03_02.csv'

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

In [None]:
#load data
# Read the CSV from the location configured in `file_path` (set in the first cell)
# instead of repeating the literal path, so adjusting `file_path` actually takes effect.
df = pd.read_csv(file_path, header=0)

# Replace the header read from the file with the nine column names used throughout
# the notebook (this raises if the file does not have exactly nine columns).
df.columns = ['task', 'participant', 'trial', 'decision_type', 'choice', 'OEE1', 'OEE2', 'CT1', 'CT2']

print(df.head())
# Combine choice and decision_type into a single class id for the multiclass model
df['multiclass_target'] = df['choice'] * 3 + df['decision_type']
print(df.head())

In [None]:
# Sanity check: report whether the renamed frame exposes a 'task' column
if 'task' not in df.columns:
    print("Task column is missing. Available columns:", df.columns)
else:
    print("Task column is present.")

In [None]:
# iterative prompts
import pandas as pd

# question template
# NOTE(review): there is no space between "headcount costs." and "There are two options"
# in the rendered prompt; left untouched because editing the prompt changes the model input.
question_template = (
    "Our manufacturing line has two sections with potential defect sources: pre-assembly (0) and assembly (1). "
    "Pre-assembly takes {CT1} seconds with an Overall Equipment Effectiveness(OEE) rate of {OEE1}%, while assembly takes {CT2} seconds with an OEE rate of {OEE2}%. "
    "To reduce total assembly time by 4 seconds, we need to identify which section can be shortened with minimal defect increase. "
    "It's important to note that reducing cycle time will also lead to an increase in headcount costs."
    "There are two options: reduce pre-assembly time (0) or reduce assembly time (1).\nQ: Which section do you choose to optimize? A: "
)

# Text used for rows whose task has no usable data (e.g. a missing task id)
missing_task_text = "Data not available for this task."

# Build exactly one prompt per task from the first row of that task.
prompt_by_task = {}
for task in df['task'].unique():
    df_task = df[df['task'] == task]
    print(task)
    if df_task.empty:
        # Happens for a missing (NaN) task id, which never compares equal to itself
        continue
    # Read each value from its own column so the column dtype (and therefore the
    # way the number is rendered in the prompt) is preserved.
    prompt_by_task[task] = question_template.format(
        OEE1=df_task['OEE1'].iloc[0],
        OEE2=df_task['OEE2'].iloc[0],
        CT1=df_task['CT1'].iloc[0],
        CT2=df_task['CT2'].iloc[0],
    )

# Look the prompt up row by row so every row receives the prompt of ITS task.
# Appending prompts task-by-task and assigning the list positionally only lines up
# when the frame happens to be sorted by task.
text = [prompt_by_task.get(task, missing_task_text) for task in df['task']]
assert len(text) == len(df), "expected exactly one prompt per row"

# Add the generated text as a new column in the DataFrame
df['text'] = text



In [None]:
# Wrap the prepared DataFrame in a Hugging Face Dataset and display its summary
dat = Dataset.from_pandas(df)
dat

In [None]:
# Inspect the first record of the dataset
dat[0]

In [None]:
# Importing the necessary class from transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Load the causal language model and its tokenizer from the same checkpoint id
model_ckpt = 'TheBloke/LLama-2-13B-GPTQ'
model = AutoModelForCausalLM.from_pretrained(
    model_ckpt,
    device_map="auto",  # device placement is delegated to the library
    revision="main"  # revision (branch) of the checkpoint repository to load
)
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=True)

# Print the model configuration as JSON for reference
print (model.config.to_json_string())

In [None]:
# Install PEFT (IPython shell escape; no version pin).
# NOTE(review): presumably required by the model.load_adapter call later in the notebook — confirm.
!pip install peft

In [None]:
# Use the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token
# Tokenize the 'text' column of a batch with padding and truncation enabled
batch_tokenizer = lambda batch: tokenizer(batch['text'], padding=True, truncation=True)

#  Tokenizing the dataset
# NOTE(review): with padding=True the stored sequences may contain padding tokens;
# which side they are on depends on the tokenizer's padding_side — confirm before
# pooling by position.
dat = dat.map(batch_tokenizer, batched=True)
dat[0]

In [None]:
# Inspect the first record again, now including the tokenizer outputs
dat[0]

In [None]:
# Request torch formatting for the two model-input columns, then display the dataset summary
dat.set_format('torch', columns=['input_ids', 'attention_mask'])
dat

In [None]:
import torch
torch.manual_seed(42) # For reproducibility
from transformers import AutoModel

In [None]:
# Choose the compute device: NVIDIA CUDA first, then Apple Metal Performance
# Shaders (mps), otherwise fall back to the CPU.
device = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)

device

# Feature extraction for decision-making prediction

In [None]:
#feature extraction
def extract_features(batch):
    """Return the final-layer hidden state of the last real token of each sequence.

    Args:
        batch: Mapping of column name to tensor. Only the entries whose key is
            listed in ``tokenizer.model_input_names`` are forwarded to the model.

    Returns:
        dict with a single key ``"hidden_state"`` holding a NumPy array with one
        vector per sequence in the batch.
    """
    inputs = {
        k: v.to(device) for k, v in batch.items() if k in tokenizer.model_input_names
    }
    with torch.no_grad():
        last_hidden_state = model(
            **inputs,
            output_hidden_states=True
        ).hidden_states[-1]

    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        # No mask available: fall back to the final position
        pooled = last_hidden_state[:, -1]
    else:
        # Position -1 is a padding token whenever a sequence is right-padded, so
        # pick the last position the mask marks as attended. Weighting the mask by
        # 1..seq_len makes argmax return that position for left- and right-padding.
        mask = attention_mask.to(last_hidden_state.device).long()
        positions = torch.arange(1, mask.shape[1] + 1, device=mask.device)
        last_token = (mask * positions).argmax(dim=1)
        rows = torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device)
        pooled = last_hidden_state[rows, last_token]

    return {
        "hidden_state": pooled.cpu().numpy()
    }

In [None]:
# Apply extract_features to the whole dataset in batches of one example, then show the shape of the new column
dat = dat.map(extract_features, batched=True, batch_size=1)
dat['hidden_state'].shape

In [None]:
# Collect the extracted hidden states into a DataFrame
features = pd.DataFrame(dat['hidden_state'])
features

# NOTE(review): this writes 'features.csv' relative to the working directory, whereas the
# prediction cell reads '/content/drive/My Drive/features.csv' — confirm both refer to the same file.
features.to_csv('features.csv', index=False)  # Saving as CSV without the index

## Prediction of decision-making results and strategy

In [None]:
# Multiclass target with 10 cross-validation folds
# NOTE(review): the features are re-read from Google Drive here; the extraction cell saves
# 'features.csv' to the working directory — confirm the Drive copy is the current one.
file_path2 = '/content/drive/My Drive/features.csv'
features = pd.read_csv(file_path2)

from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, log_loss
import pandas as pd

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(features, dat['multiclass_target'], test_size=0.2, random_state=42)
print(f'Train size: {len(X_train)}, test size: {len(X_test)}')

# Standardizing the features (statistics are fitted on the training split only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Setting up the LogisticRegressionCV model for multiclass classification with 10-fold cross-validation
clf = LogisticRegressionCV(cv=10, multi_class='multinomial', max_iter=1000)

# Fitting the model
clf.fit(X_train, y_train)

# Predicting the test set results
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)  # Get the probability estimates for the test set

# Evaluating performance using different metrics
accuracy = accuracy_score(y_test, y_pred)
# The columns of y_proba follow clf.classes_. Passing them as `labels` keeps the
# columns aligned with the right classes even when the (non-stratified) test
# split happens to miss one of the classes seen during training.
nll = log_loss(y_test, y_proba, labels=clf.classes_)  # Calculate Negative Log-Likelihood

# Printing out the results
print(f'Accuracy = {accuracy}')

print(f'Negative Log-Likelihood = {nll}')


## Prediction of the decision-making result only

In [None]:
# single target with 10 validation folds
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, log_loss
import pandas as pd
import numpy as np


# Hold out 20% of the rows; the target here is the 'choice' column only
X_train, X_test, y_train, y_test = train_test_split(features, dat['choice'], test_size=0.2, random_state=42)
print(f'Train size: {len(X_train)}, test size: {len(X_test)}')

# Standardizing the features (statistics are fitted on the training split only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Setting up the LogisticRegressionCV model with 10-fold cross-validation
clf = LogisticRegressionCV(cv=10, max_iter=1000, random_state=42, multi_class='auto')

# Fitting the model
clf.fit(X_train, y_train)

# Predicting the test set results
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)  # Get the probability estimates for the test set

# Evaluating performance
accuracy = accuracy_score(y_test, y_pred)

# Calculate Negative Log-Likelihood. The columns of y_proba follow clf.classes_;
# passing them as `labels` keeps the metric defined and correctly aligned even if
# the test split contains only one of the classes.
nll = log_loss(y_test, y_proba, labels=clf.classes_)

# Printing out the results
print(f'Accuracy = {accuracy}')
print(f'Negative Log-Likelihood = {nll}')

# Fine-tuning for predicting decision-making behavior

In [None]:
# Count the entries of the 'choice' column as a quick size check before splitting
num_rows = len(dat['choice'])
print("Number of rows in 'choice' column:", num_rows)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate

In [None]:
# Splitting the data into train and test sets (20% test, fixed seed).
# NOTE: `dat` is rebound to the result of the split, so the un-split dataset is no longer
# reachable under this name and this cell is presumably not safe to run twice — confirm.
dat = dat.train_test_split(test_size=.2, seed=42)
dat

In [None]:
# Show the type of the training split
type(dat['train'])

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Identifier of the adapter that is loaded into the already-loaded model.
# NOTE(review): the id names an OPT-350m LoRA adapter while model_ckpt is a Llama-2-13B
# checkpoint — confirm the adapter is actually compatible with this base model.
peft_model_id = "ybelkada/opt-350m-lora"

model.load_adapter(peft_model_id)

In [None]:
#format datasets
from datasets import load_dataset, DatasetDict

# Assuming `dat` is a DatasetDict containing 'train' and 'test' Datasets
# (it is the result of the train_test_split call in an earlier cell)
train_dataset = dat['train']
test_dataset = dat['test']

 #Modify the datasets to match the expected input format for the Trainer
def format_dataset(example):
    """Expose the recorded choice under the ``labels`` key.

    The incoming mapping is modified in place: ``labels`` is set to the value
    stored under ``choice``. The same mapping object is returned.
    """
    chosen = example['choice']
    example['labels'] = chosen
    return example

# Apply format_dataset to every example (one at a time) of both splits
train_dataset = train_dataset.map(format_dataset, batched=False)
test_dataset = test_dataset.map(format_dataset, batched=False)

In [None]:
#build LlaMA for classification
import torch
import torch.nn as nn

class LLaMAForBinaryClassification(nn.Module):
    """Two-way classifier on top of a language-model backbone.

    The backbone is always run under ``torch.no_grad()``, so only the dropout +
    linear head defined here receives gradients.

    Args:
        base_model: Model that accepts ``input_ids`` / ``attention_mask`` /
            ``output_hidden_states`` and returns an object exposing
            ``hidden_states``; ``base_model.config.hidden_size`` sets the width
            of the classification head.
        dropout_prob: Dropout probability applied to the pooled features.
    """

    def __init__(self, base_model, dropout_prob=0.5):
        super().__init__()
        self.base_model = base_model
        self.dropout = nn.Dropout(dropout_prob)  # Regularizes the pooled features
        self.classifier = nn.Linear(base_model.config.hidden_size, 2)

    @staticmethod
    def _last_token_state(hidden, attention_mask):
        """Select, per sequence, the hidden state of the last attended token."""
        if attention_mask is None:
            return hidden[:, -1, :]
        # Position -1 is a padding token for right-padded sequences. Weighting the
        # mask by 1..seq_len makes argmax return the last attended position for
        # both left- and right-padded batches.
        mask = attention_mask.to(hidden.device).long()
        positions = torch.arange(1, mask.shape[1] + 1, device=mask.device)
        last_token = (mask * positions).argmax(dim=1)
        rows = torch.arange(hidden.shape[0], device=hidden.device)
        return hidden[rows, last_token]

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Classify each sequence from its last-token hidden state.

        Args:
            input_ids: Token ids, one row per sequence.
            attention_mask: Optional mask marking real (1) versus padding (0)
                tokens. When omitted, the final position is pooled.
            labels: Accepted for interface compatibility; not used here.

        Returns:
            Tuple ``(logits, probabilities, features)`` where ``features`` are the
            pooled hidden states after dropout.
        """
        inputs = {"input_ids": input_ids}
        if attention_mask is not None:
            # Only forward the mask when one was supplied (it defaults to None)
            inputs["attention_mask"] = attention_mask

        # Pass the inputs through the backbone without tracking gradients
        with torch.no_grad():
            outputs = self.base_model(
                **inputs,
                output_hidden_states=True
            )
            last_hidden_state = outputs.hidden_states[-1]

        features = self._last_token_state(last_hidden_state, attention_mask)

        # Apply dropout to the features
        features = self.dropout(features)

        # The head works in float32 regardless of the backbone's dtype
        logits = self.classifier(features.float())

        if logits.dim() == 1:
            logits = logits.unsqueeze(0)  # Ensure logits is at least 2D

        # Apply softmax to convert logits to probabilities
        probabilities = torch.softmax(logits, dim=-1)

        return logits, probabilities, features

# Initialize the model with dropout.
# NOTE: `model` is rebound from the base model to the wrapper, so running this cell a
# second time would wrap the wrapper.
model = LLaMAForBinaryClassification(model, dropout_prob=0.5)

In [None]:
#build trainer
class CustomTrainer(Trainer):
    """Trainer that scores the classifier's logits with cross-entropy."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the cross-entropy loss for one batch.

        Args:
            model: Module called as ``model(**inputs)``; a tuple result is taken
                to carry the logits in element 0.
            inputs: Batch dict. ``labels`` is popped from it before the forward pass.
            return_outputs: When true, also return the model outputs.
            num_items_in_batch: Accepted (and ignored) so the override also works
                with Trainer versions that pass this argument.

        Returns:
            The loss, or ``(loss, outputs)`` where ``outputs`` is a tuple that
            starts with the loss followed by the model outputs.

        Raises:
            ValueError: If ``inputs`` has no ``labels`` entry.
        """
        if 'labels' not in inputs:
            raise ValueError("Labels key missing in inputs during training.")
        labels = inputs.pop('labels').long()
        outputs = model(**inputs)
        logits = outputs[0] if isinstance(outputs, tuple) else outputs
        # Cross-entropy over the two-class logits (mean over the batch)
        loss = torch.nn.functional.cross_entropy(logits, labels)
        if not return_outputs:
            return loss
        # During evaluation Trainer.prediction_step treats element 0 of a tuple
        # output as the loss and keeps only outputs[1:] as predictions. Prepending
        # the loss keeps the logits as the first prediction instead of dropping them.
        model_outputs = outputs if isinstance(outputs, tuple) else (outputs,)
        return loss, (loss,) + model_outputs


In [None]:
# build custom evalate metrics
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

def compute_metrics(pred):
    """Compute accuracy and mean negative log-likelihood for an evaluation pass.

    Args:
        pred: Object with ``predictions`` (an array of logits, or a tuple whose
            first element is that array) and ``label_ids`` (integer class labels).

    Returns:
        dict with ``accuracy`` and ``nll_loss``. A classification report is
        printed as a side effect.
    """
    labels = pred.label_ids
    if isinstance(pred.predictions, tuple):
        logits = pred.predictions[0]
    else:
        logits = pred.predictions
    preds = logits.argmax(-1)

    accuracy = accuracy_score(labels, preds)

    # Mean negative log-likelihood. cross_entropy applies log-softmax internally,
    # which stays finite where log(softmax(x)) would hit log(0) = -inf. The
    # float32 / long casts keep this working for half-precision logits and for
    # labels that are not already int64.
    logits_tensor = torch.as_tensor(logits, dtype=torch.float32)
    labels_tensor = torch.as_tensor(labels, dtype=torch.long)
    nll_loss = torch.nn.functional.cross_entropy(logits_tensor, labels_tensor).item()

    print(classification_report(labels, preds))

    return {
        'accuracy': accuracy,
        'nll_loss': nll_loss  # Add NLL to the metrics
    }


In [None]:
# Run a garbage-collection pass before training starts
import gc
gc.collect()

In [None]:
#huggingface trainer, to train the model
from transformers import Trainer, TrainingArguments
# Move the wrapped model to the selected device
model.to(device)


# Output directory is derived from the checkpoint id; since model_ckpt contains a '/',
# this is a nested path.
model_name = f"{model_ckpt}-finetuned"
batch_size = 5
training_args = TrainingArguments(
    output_dir= model_name,
    save_safetensors = False,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy='epoch',  # evaluate once per epoch
    logging_strategy='epoch',  # log once per epoch
    learning_rate=1e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    gradient_accumulation_steps=2,  # gradients are accumulated over two batches per optimizer step
    max_grad_norm=1.0,
    optim='adamw_torch'
)

# Trainer subclass with the custom loss; the test split doubles as the evaluation set
trainer = CustomTrainer(
    model= model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics

)

trainer.train()

In [None]:
# Dump the wrapper's attribute dictionary for inspection
print (model.__dict__)