# Imports

In [2]:
from sklearn.linear_model import LogisticRegression
import torch
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np

# Constants

In [18]:
DATASET = "fantasy_reasoning"
MODEL_NAME = "gemma-2-2b"
DATA_DIR = f"./experimental_data/{MODEL_NAME}/{DATASET}/"
WEIGHTS_DIR = f"./weights/linear_analysis/{MODEL_NAME}/{DATASET}/"
TRAIN_SIZE = 0.8
TOP_K = 10  # Set to -1 to see every element
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data

Load data

In [20]:
acts_exp_resid = torch.load(DATA_DIR + "acts_exp_resid.pt", map_location=device)
acts_resid = torch.load(DATA_DIR + "acts_resid.pt", map_location=device)

  acts_exp_resid = torch.load(DATA_DIR + "acts_exp_resid.pt", map_location=device)
  acts_resid = torch.load(DATA_DIR + "acts_resid.pt", map_location=device)


In [21]:
print(acts_exp_resid.shape)

torch.Size([201, 26, 2304])


Split data

In [22]:
X_train_index = int(TRAIN_SIZE * acts_exp_resid.shape[0])
X_train_exp_resid = acts_exp_resid[:X_train_index, :, :]
X_train_resid = acts_resid[:X_train_index, :, :]
X_test_exp_resid = acts_exp_resid[X_train_index:, :, :]
X_test_resid = acts_resid[X_train_index:, :, :]

In [23]:
print(X_train_resid.shape, X_test_resid.shape)

torch.Size([160, 26, 2304]) torch.Size([41, 26, 2304])


1 indicates CoT, 0 is Non-CoT

In [24]:
y_train_exp_resid = torch.ones(X_train_exp_resid.shape[0])
y_train_resid = torch.zeros(X_train_resid.shape[0])
y_test_exp_resid = torch.ones(X_test_exp_resid.shape[0])
y_test_resid = torch.zeros(X_test_resid.shape[0])

In [25]:
print(y_train_resid.shape, y_test_resid.shape)

torch.Size([160]) torch.Size([41])


Concatenate the data

In [26]:
X_train = torch.cat((X_train_exp_resid, X_train_resid), dim=0)
X_test = torch.cat((X_test_exp_resid, X_test_resid), dim=0)
y_train = torch.cat((y_train_exp_resid, y_train_resid), dim=0)
y_test = torch.cat((y_test_exp_resid, y_test_resid), dim=0)

In [27]:
print(X_train.shape, y_train.shape)

torch.Size([320, 26, 2304]) torch.Size([320])


# Initialize and train classifiers

In [28]:
classifiers = [LogisticRegression(fit_intercept=False)] * acts_resid.shape[1]

In [29]:
X_train, y_train, X_test, y_test = X_train.cpu().numpy(), y_train.cpu().numpy(), X_test.cpu().numpy(), y_test.cpu().numpy()
for i, classifier in enumerate(classifiers):
    classifier.fit(X_train[:, i, :], y_train)

# Evaluate classifiers

In [30]:
weights_list = []
for i, classifier in enumerate(classifiers):
    y_pred = classifier.predict(X_test[:, i, :])
    print(f"layer {i}")
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", conf_matrix)
    
    class_report = classification_report(y_test, y_pred)
    print("Classification Report:\n", class_report)

    weights = classifier.coef_
    print(f"Weights: {weights}")
    weights_list.append(weights)

    top_k = np.argsort(weights[0, :])[::-1][:TOP_K]
    print(f"Top k indeces: {top_k}")

    print("\n")

layer 0
Accuracy: 1.0
Confusion Matrix:
 [[41  0]
 [ 0 41]]
Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        41
         1.0       1.00      1.00      1.00        41

    accuracy                           1.00        82
   macro avg       1.00      1.00      1.00        82
weighted avg       1.00      1.00      1.00        82

Weights: [[-0.00040262 -0.00112977 -0.0011051  ... -0.00107047  0.00148265
  -0.00164308]]
Top k indeces: [1393  689 1570 1824 2269 2251 1610  564 1413 2030]


layer 1
Accuracy: 1.0
Confusion Matrix:
 [[41  0]
 [ 0 41]]
Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        41
         1.0       1.00      1.00      1.00        41

    accuracy                           1.00        82
   macro avg       1.00      1.00      1.00        82
weighted avg       1.00      1.00      1.00        82

Weights: [[-0.0004

Starting from layer 7 we are able to obtain perfect prediction. However, it seems like certain neurons are always activated.

# Save weights

In [15]:
for i, weights in enumerate(weights_list):
    weights = torch.tensor(weights)
    torch.save(weights, WEIGHTS_DIR + f"layer_{i}")

RuntimeError: Parent directory ./weights/linear_analysis/gemma-2-2b/com2sense does not exist.