In [1]:
import sys
if 'google.colab' in sys.modules:  # If in Google Colab environment
    # Installing requisite packages
    !pip install datasets transformers==4.37.2 evaluate accelerate optimum auto-gptq

    # Mount google drive to enable access to data files
    from google.colab import drive
    drive.mount('/content/drive')

file_path = '/content/drive/My Drive/VSM_BRIMS_03_02.csv'

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.37.2
  Downloading transformers-4.37.2-py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m94.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Collecting optimum
  Downloading optimum-1.21.2-py3-none-any.whl (424 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m424.7/424.7 kB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting auto-gptq
  Downloading auto_gptq-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (23.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [2]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

In [3]:
# Reading in the .csv data
df = pd.read_csv('/content/drive/My Drive/VSM_BRIMS_03_02.csv', header=0)

# Manually specify column names
df.columns = ['task', 'participant', 'trial', 'decision_type', 'choice', 'OEE1', 'OEE2', 'CT1', 'CT2']

print(df.head())
df['multiclass_target'] = df['choice'] * 3 + df['decision_type']
print(df.head())

   task  participant  trial  decision_type  choice  OEE1  OEE2  CT1  CT2
0     0            0      0              0       1    88    86   46   48
1     0            0      1              1       1    88    86   46   48
2     0            0      2              0       1    88    86   46   48
3     0            0      3              0       1    88    86   46   48
4     0            0      4              0       1    88    86   46   48
   task  participant  trial  decision_type  choice  OEE1  OEE2  CT1  CT2  \
0     0            0      0              0       1    88    86   46   48   
1     0            0      1              1       1    88    86   46   48   
2     0            0      2              0       1    88    86   46   48   
3     0            0      3              0       1    88    86   46   48   
4     0            0      4              0       1    88    86   46   48   

   multiclass_target  
0                  3  
1                  4  
2                  3  
3            

In [4]:
if 'task' in df.columns:
    print("Task column is present.")
else:
    print("Task column is missing. Available columns:", df.columns)

Task column is present.


In [5]:
import pandas as pd

# Prepare the question template
question_template = (
      "Our manufacturing line has two sections with potential defect sources: pre-assembly (0) and assembly (1). "
    "Pre-assembly takes {CT1} seconds with an Overall Equipment Effectiveness(OEE) rate of {OEE1}%, while assembly takes {CT2} seconds with an OEE rate of {OEE2}%. "
    "To reduce total assembly time by 4 seconds, we need to identify which section can be shortened with minimal defect increase. "
    "It's important to note that reducing cycle time will also lead to an increase in headcount costs."
    "There are two options: reduce pre-assembly time (0) or reduce assembly time (1).\nQ: Which section do you choose to optimize? A: "
)

text = []

# Iterate over each task
for task in df['task'].unique():
    df_task = df[df['task'] == task]
    print(task)
    if not df_task.empty:
        OEE1 = df_task['OEE1'].iloc[0]
        OEE2 = df_task['OEE2'].iloc[0]
        CT1 = df_task['CT1'].iloc[0]
        CT2 = df_task['CT2'].iloc[0]
        prompt = question_template.format(OEE1=OEE1, OEE2=OEE2, CT1=CT1, CT2=CT2)

        # Apply the prompt to each row in the task
        for index, row in df_task.iterrows():
            text.append(prompt)
    else:
        # Assuming each task should at least have one entry in 'text'
        num_trials_expected = 15  # Default number of trials if it's a fixed number per participant
        text.extend(["Data not available for this task."] * num_trials_expected)

# Adjust the list size to match the DataFrame in case of any mismatches
if len(text) < len(df):
    # Add placeholder texts if 'text' is shorter
    text.extend(["Data missing due to processing error."] * (len(df) - len(text)))
elif len(text) > len(df):
    # Trim 'text' if it's longer
    text = text[:len(df)]

# Add the generated text as a new column in the DataFrame
df['text'] = text


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31


In [6]:
dat = Dataset.from_pandas(df)
dat

Dataset({
    features: ['task', 'participant', 'trial', 'decision_type', 'choice', 'OEE1', 'OEE2', 'CT1', 'CT2', 'multiclass_target', 'text'],
    num_rows: 2012
})

In [7]:
dat[0]

{'task': 0,
 'participant': 0,
 'trial': 0,
 'decision_type': 0,
 'choice': 1,
 'OEE1': 88,
 'OEE2': 86,
 'CT1': 46,
 'CT2': 48,
 'multiclass_target': 3,
 'text': "Our manufacturing line has two sections with potential defect sources: pre-assembly (0) and assembly (1). Pre-assembly takes 46 seconds with an Overall Equipment Effectiveness(OEE) rate of 88%, while assembly takes 48 seconds with an OEE rate of 86%. To reduce total assembly time by 4 seconds, we need to identify which section can be shortened with minimal defect increase. It's important to note that reducing cycle time will also lead to an increase in headcount costs.There are two options: reduce pre-assembly time (0) or reduce assembly time (1).\nQ: Which section do you choose to optimize? A: "}

In [8]:
# Importing the necessary class from transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Load model
model_ckpt = 'TheBloke/LLama-2-13B-GPTQ'
model = AutoModelForCausalLM.from_pretrained(
    model_ckpt,
    device_map="auto",
    revision="main"
)
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=True)

print (model.config.to_json_string())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/913 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/7.26G [00:00<?, ?B/s]

Some weights of the model checkpoint at TheBloke/LLama-2-13B-GPTQ were not used when initializing LlamaForCausalLM: ['model.layers.0.mlp.down_proj.bias', 'model.layers.0.mlp.gate_proj.bias', 'model.layers.0.mlp.up_proj.bias', 'model.layers.0.self_attn.k_proj.bias', 'model.layers.0.self_attn.o_proj.bias', 'model.layers.0.self_attn.q_proj.bias', 'model.layers.0.self_attn.v_proj.bias', 'model.layers.1.mlp.down_proj.bias', 'model.layers.1.mlp.gate_proj.bias', 'model.layers.1.mlp.up_proj.bias', 'model.layers.1.self_attn.k_proj.bias', 'model.layers.1.self_attn.o_proj.bias', 'model.layers.1.self_attn.q_proj.bias', 'model.layers.1.self_attn.v_proj.bias', 'model.layers.10.mlp.down_proj.bias', 'model.layers.10.mlp.gate_proj.bias', 'model.layers.10.mlp.up_proj.bias', 'model.layers.10.self_attn.k_proj.bias', 'model.layers.10.self_attn.o_proj.bias', 'model.layers.10.self_attn.q_proj.bias', 'model.layers.10.self_attn.v_proj.bias', 'model.layers.11.mlp.down_proj.bias', 'model.layers.11.mlp.gate_proj.

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

{
  "_name_or_path": "TheBloke/LLama-2-13B-GPTQ",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 5120,
  "initializer_range": 0.02,
  "intermediate_size": 13824,
  "max_length": 4096,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 40,
  "num_hidden_layers": 40,
  "num_key_value_heads": 40,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "quantization_config": {
    "batch_size": 1,
    "bits": 4,
    "block_name_to_quantize": null,
    "cache_block_outputs": true,
    "damp_percent": 0.01,
    "dataset": null,
    "desc_act": false,
    "exllama_config": {
      "version": 1
    },
    "group_size": 128,
    "max_input_length": null,
    "model_seqlen": null,
    "module_name_preceding_first_block": null,
    "modules_in_block_to_quantize": null,
    "pad_token_id": null,
    "quant_method": "gptq",
    "sym": tr

In [9]:
tokenizer.pad_token = tokenizer.eos_token
batch_tokenizer = lambda batch: tokenizer(batch['text'], padding=True, truncation=True)

#  Tokenizing the dataset
dat = dat.map(batch_tokenizer, batched=True)
dat[0]

Map:   0%|          | 0/2012 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'task': 0,
 'participant': 0,
 'trial': 0,
 'decision_type': 0,
 'choice': 1,
 'OEE1': 88,
 'OEE2': 86,
 'CT1': 46,
 'CT2': 48,
 'multiclass_target': 3,
 'text': "Our manufacturing line has two sections with potential defect sources: pre-assembly (0) and assembly (1). Pre-assembly takes 46 seconds with an Overall Equipment Effectiveness(OEE) rate of 88%, while assembly takes 48 seconds with an OEE rate of 86%. To reduce total assembly time by 4 seconds, we need to identify which section can be shortened with minimal defect increase. It's important to note that reducing cycle time will also lead to an increase in headcount costs.There are two options: reduce pre-assembly time (0) or reduce assembly time (1).\nQ: Which section do you choose to optimize? A: ",
 'input_ids': [1,
  8680,
  12012,
  3864,
  1196,
  756,
  1023,
  13926,
  411,
  7037,
  23503,
  8974,
  29901,
  758,
  29899,
  26936,
  313,
  29900,
  29897,
  322,
  11470,
  313,
  29896,
  467,
  4721,
  29899,
  26936,


In [10]:
import torch
import torch.nn as nn

class LLaMAForBinaryClassification(nn.Module):
    def __init__(self, base_model, num_classes=2):
        super(LLaMAForBinaryClassification, self).__init__()
        self.base_model = base_model
        self.classifier = nn.Linear(base_model.config.hidden_size, num_classes)  # Output 2 classes

    def forward(self, input_ids, attention_mask=None):
        device = input_ids.device
        inputs = {
            "input_ids": input_ids.to(device),
            "attention_mask": attention_mask.to(device)
        }

        # Pass input_ids and attention_mask through the base model to get features
        with torch.no_grad():
            outputs = self.base_model(
                **inputs,
                output_hidden_states=True
            )
            last_hidden_state = outputs.hidden_states[-1]

        features = last_hidden_state[:, -1, :]  # Shape: (batch_size, hidden_size)
        logits = self.classifier(features.float())

        return logits



model = LLaMAForBinaryClassification(model)

In [11]:
# Loading the model and moving it to the GPU if available
if torch.cuda.is_available():  # for nvidia GPUs
    device = torch.device('cuda')
elif torch.backends.mps.is_available(): # for Apple Metal Performance Sharder (mps) GPUs
    device = torch.device('mps')
else:
    device = torch.device('cpu')

device

device(type='cuda')

In [12]:
model.to (device)

LLaMAForBinaryClassification(
  (base_model): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(32000, 5120, padding_idx=0)
      (layers): ModuleList(
        (0-39): 40 x LlamaDecoderLayer(
          (self_attn): LlamaSdpaAttention(
            (rotary_emb): LlamaRotaryEmbedding()
            (k_proj): QuantLinear()
            (o_proj): QuantLinear()
            (q_proj): QuantLinear()
            (v_proj): QuantLinear()
          )
          (mlp): LlamaMLP(
            (act_fn): SiLU()
            (down_proj): QuantLinear()
            (gate_proj): QuantLinear()
            (up_proj): QuantLinear()
          )
          (input_layernorm): LlamaRMSNorm()
          (post_attention_layernorm): LlamaRMSNorm()
        )
      )
      (norm): LlamaRMSNorm()
    )
    (lm_head): Linear(in_features=5120, out_features=32000, bias=False)
  )
  (classifier): Linear(in_features=5120, out_features=2, bias=True)
)

In [13]:
from torch.utils.data import DataLoader, Dataset

texts = dat['text']
labels = dat['choice']

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Create dataset and DataLoader, set shuffle to false to ensure the sequence of data is correct each time.
dataset = TextDataset(texts, labels, tokenizer, max_length=128)
data_loader = DataLoader(dataset, batch_size=16, shuffle=False)


In [14]:
#add NLL
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn as nn

model.eval()
all_preds = []
all_labels = []
all_logits = []  # To store logits for NLL calculation
with torch.no_grad():
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        all_logits.append(logits.cpu())
        all_preds.append(torch.argmax(logits, dim=1).cpu())  # Convert logits to predicted class
        all_labels.append(labels.cpu())

# Concatenate all predictions, logits, and labels
all_preds = torch.cat(all_preds, dim=0)
all_labels = torch.cat(all_labels, dim=0)
all_logits = torch.cat(all_logits, dim=0)

# Calculate accuracy, precision, recall, and F1 score
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds, average='weighted')
recall = recall_score(all_labels, all_preds, average='weighted')
f1 = f1_score(all_labels, all_preds, average='weighted')

# Calculate Negative Log-Likelihood (NLL)
criterion = nn.NLLLoss()
log_probs = torch.log_softmax(all_logits, dim=-1)  # Apply log softmax to logits to get log probabilities
nll_loss = criterion(log_probs, all_labels).item()

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"NLL: {nll_loss:.4f}")


Accuracy: 0.3564
Precision: 0.1270
Recall: 0.3564
F1 Score: 0.1873
NLL: 0.7623


  _warn_prf(average, modifier, msg_start, len(result))
