In [1]:
import sys
if 'google.colab' in sys.modules:  # If in Google Colab environment
    # Installing requisite packages
    !pip install datasets transformers==4.37.2 evaluate accelerate optimum auto-gptq

    # Mount google drive to enable access to data files
    from google.colab import drive
    drive.mount('/content/drive')

# Adjust the path below according to the actual location of your file within Google Drive
file_path = '/content/drive/My Drive/VSM_BRIMS_03_02.csv'

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers==4.37.2
  Downloading transformers-4.37.2-py3-none-any.whl.metadata (129 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 129.4/129.4 kB 3.2 MB/s eta 0:00:00
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting optimum
  Downloading optimum-1.21.2-py3-none-any.whl.metadata (19 kB)
Collecting auto-gptq
  Downloading auto_gptq-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers==4.37.2)
  Downloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadat

In [2]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

In [3]:
# Read the .csv data.
# header=0 makes pandas consume the first row of the file as the header; those
# names are then replaced by the explicit list below.
# file_path is defined once in the setup cell, so changing the data location
# there is picked up here instead of being silently ignored.
df = pd.read_csv(file_path, header=0)

# Replace the header names with the column names used throughout the notebook.
# pandas requires this list to have exactly one entry per column in the file.
df.columns = ['task', 'participant', 'trial', 'decision_type', 'choice', 'OEE1', 'OEE2', 'CT1', 'CT2']

print(df.head())
# Fold the two columns into a single integer target: choice * 3 + decision_type.
# NOTE(review): the factor 3 presumably reflects three decision_type values (0-2) — confirm against the data.
df['multiclass_target'] = df['choice'] * 3 + df['decision_type']
print(df.head())

   task  participant  trial  decision_type  choice  OEE1  OEE2  CT1  CT2
0     0            0      0              0       1    88    86   46   48
1     0            0      1              1       1    88    86   46   48
2     0            0      2              0       1    88    86   46   48
3     0            0      3              0       1    88    86   46   48
4     0            0      4              0       1    88    86   46   48
   task  participant  trial  decision_type  choice  OEE1  OEE2  CT1  CT2  \
0     0            0      0              0       1    88    86   46   48   
1     0            0      1              1       1    88    86   46   48   
2     0            0      2              0       1    88    86   46   48   
3     0            0      3              0       1    88    86   46   48   
4     0            0      4              0       1    88    86   46   48   

   multiclass_target  
0                  3  
1                  4  
2                  3  
3            

In [4]:
# Wrap the pandas DataFrame in a Hugging Face `datasets.Dataset`.
dat = Dataset.from_pandas(df)
# Bare expression as the last line so the notebook displays the dataset summary.
dat

Dataset({
    features: ['task', 'participant', 'trial', 'decision_type', 'choice', 'OEE1', 'OEE2', 'CT1', 'CT2', 'multiclass_target'],
    num_rows: 2012
})

In [None]:
# Sanity check: report whether the DataFrame has a 'task' column,
# listing the available columns when it does not.
task_column_missing = 'task' not in df.columns
if task_column_missing:
    print("Task column is missing. Available columns:", df.columns)
else:
    print("Task column is present.")

Task column is present.


In [5]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch
import torch.nn as nn

def random_classifier(labels, num_classes, seed=5):
    """Draw one uniformly random class index per label.

    Args:
        labels: Sized collection of true labels; only its length is used.
        num_classes: Number of classes; predictions are integers in
            ``[0, num_classes)``.
        seed: Seed for the private random generator. Defaults to 5, the value
            that was previously hard-coded, so existing calls give the same
            predictions as before.

    Returns:
        NumPy integer array of length ``len(labels)`` holding the predictions.
    """
    # A private RandomState produces the same draws as calling
    # np.random.seed(seed) followed by np.random.randint, but it leaves the
    # global NumPy RNG untouched, so calling this function no longer resets
    # the random stream used by the rest of the notebook.
    rng = np.random.RandomState(seed)
    return rng.randint(0, num_classes, size=len(labels))

def compute_nll(labels, num_classes):
    """Return the mean negative log-likelihood of a uniform predictor.

    Every sample is assigned probability ``1 / num_classes`` for each class,
    and the NLL of the true labels under those probabilities is averaged.

    Args:
        labels: Integer class indices, one per sample.
        num_classes: Number of classes the uniform distribution spans.

    Returns:
        The mean NLL as a Python float.
    """
    n_samples = len(labels)
    # Build the (n_samples, num_classes) probability table in NumPy, then take
    # its log as a float32 tensor.
    uniform = np.full((n_samples, num_classes), 1.0) / num_classes
    log_uniform = torch.from_numpy(uniform).to(torch.float).log()
    targets = torch.tensor(labels, dtype=torch.long)
    # Functional form of nn.NLLLoss() with its default mean reduction.
    return nn.functional.nll_loss(log_uniform, targets).item()

def evaluate_random_classifier(labels, num_classes):
    """Score the random baseline against the given labels.

    Args:
        labels: True class labels.
        num_classes: Number of classes passed on to the baseline helpers.

    Returns:
        Dict with two entries: ``'accuracy'`` (accuracy of the random
        predictions from ``random_classifier``) and ``'nll'`` (value returned
        by ``compute_nll``).
    """
    random_preds = random_classifier(labels, num_classes)
    return dict(
        accuracy=accuracy_score(labels, random_preds),
        nll=compute_nll(labels, num_classes),
    )

# Baseline target: the 'choice' column of the dataset `dat` built earlier.
labels = np.array(dat['choice'])
# The class count is the number of distinct label values observed.
num_classes = np.unique(labels).size

# Score the random baseline on those labels.
random_classifier_results = evaluate_random_classifier(labels, num_classes)

# Report both metrics to four decimal places.
print(f"Random Classifier - Accuracy: {random_classifier_results['accuracy']:.4f}")
print(f"Random Classifier - NLL: {random_classifier_results['nll']:.4f}")



Random Classifier - Accuracy: 0.4826
Random Classifier - NLL: 0.6931
