|<h2>Book:</h2>|<h1><a href="https://open.substack.com/pub/mikexcohen/p/llm-breakdown-16-tokenization-words" target="_blank">50 ML projects to understand LLMs</a></h1>|
|-|:-:|
|<h2>Project:</h2>|<h1><b>[30] Sentiment analysis with decision trees</b></h1>|
|<h2>Author:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the book may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torch

from tqdm import tqdm
from scipy.stats import ttest_ind

from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from transformers import BertTokenizer, BertModel
from datasets import load_dataset

In [None]:
### matplotlib configuration (the commented-out assignments switch on a dark theme)

# render inline figures as SVG (higher resolution than the default)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# dark-mode colors: uncomment these assignments to activate them
# plt.rcParams['figure.facecolor'] = '#282a2c'
# plt.rcParams['figure.edgecolor'] = '#282a2c'
# plt.rcParams['axes.facecolor']   = '#282a2c'
# plt.rcParams['axes.edgecolor']   = '#DDE2F4'
# plt.rcParams['axes.labelcolor']  = '#DDE2F4'
# plt.rcParams['xtick.color']      = '#DDE2F4'
# plt.rcParams['ytick.color']      = '#DDE2F4'
# plt.rcParams['text.color']       = '#DDE2F4'

# hide the top and right axis spines
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top'] = False

# bold axis titles and labels
plt.rcParams['axes.titleweight'] = 'bold'
plt.rcParams['axes.labelweight'] = 'bold'

# resolution used when figures are written to disk
plt.rcParams['savefig.dpi'] = 300

# **Part 1: Import BERT and dataset**

In [None]:
# load BERT tokenizer and model
# (output_hidden_states=True asks the model to return the hidden states of all layers;
#  later cells read them through outputs.hidden_states)
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
model = BertModel.from_pretrained('bert-large-uncased', output_hidden_states=True)
# put the model in evaluation (inference) mode
model.eval()

In [None]:
# move the model to the GPU if one is available; otherwise it stays on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# the trailing semicolon keeps the notebook from printing the returned model
model.to(device);

In [None]:
# load a subset (5%) of sst-2 sentiment dataset
# (the split string 'train[:5%]' selects the first 5% of the training split)
dataset = load_dataset('glue','sst2',split='train[:5%]')

# display the dataset summary as the cell output
dataset

In [None]:
# inspect one example (index 123) from the dataset
dataset[123]

In [None]:
# extract and count the labels
# NOTE(review): exercise template — the blank expressions below must be completed before this cell runs
# NOTE(review): `sample` is not defined as written; presumably this becomes a loop over the dataset — TODO complete
labels = [sample['label'] ]

# right-hand side left blank
num_samples =

# unique label values and the number of occurrences of each
uniq,counts = np.unique(labels,return_counts=True)
for u,c in zip(uniq,counts):
  # print call left unfinished (no arguments)
  print

# **Part 2: Create batches**

In [None]:
# batching parameters
# NOTE(review): exercise template — num_batch and sample_size are left blank and must be filled in
batchsize = 32
num_batch =
sample_size =

# each empty {} needs an expression (an empty replacement field is a syntax error in an f-string)
print(f'There are {} batches of size {}, leading to {} total samples.')

In [None]:
# build one tokenized batch (and its matching labels) per entry of these two lists
# NOTE(review): exercise template — the blank expressions and unclosed calls below must be completed
allbatches = []
alllabels = []

for batchi in range(num_batch):

  # start and end indices
  startidx =
  endidx =

  # collect the texts and labels of this batch
  tmp_texts = []
  tmp_labels = []
  for samplei in range(startidx,endidx):
    tmp_texts.append(
    tmp_labels.append(

  # tokenize
  # (padding=True pads the texts of a batch to a common length; return_tensors='pt' returns PyTorch tensors)
  tokens = tokenizer(tmp_texts,return_tensors='pt',padding=True)
  allbatches.append(
  alllabels.append(

In [None]:
# the two lists should have the same number of entries (one per batch)
len(alllabels), len(allbatches)

In [None]:
# inspect batch 10: number of labels, the tokenizer output, and the shape of its input_ids
len(alllabels[10]), allbatches[10], allbatches[10]['input_ids'].shape

In [None]:
# lengths of different batches
# NOTE(review): exercise template — the iterable, numseqs, and the plot data are left blank
# seqlens takes the second dimension of each batch's input_ids
seqlens = [allbatches[i]['input_ids'].shape[1] for i in ]
numseqs =

plt.figure(figsize=(8,3))
# both plot calls are missing their data argument
plt.plot(label='Number of sequences')
plt.plot(label='Number of tokens')
plt.gca().set(xlabel='Batch number',ylabel='Counts',ylim=[10,70])
plt.legend()

plt.tight_layout()
plt.savefig('ch5_proj30_part2.png')
plt.show()

In [None]:
# just one batch to confirm shapes
# NOTE(review): exercise template — the model call is left unfinished (unclosed parenthesis)
outputs = model(
# shape of the hidden state stored at index 4 of outputs.hidden_states
outputs.hidden_states[4].shape

# **Part 3: Get the [CLS] hidden state activations from one layer**

In [None]:
# this code block takes a very long time on a standard CPU, ~5 mins on a high-power CPU, and a few seconds on a GPU
# NOTE(review): exercise template — the blank expressions and unclosed calls below must be completed
cls_activations = []

for batchi in tqdm(range(num_batch)):

  # run the model
  outputs = model(

  # extract the CLS activation from the hidden states
  cls_batch =

  # append to a list
  cls_activations.append(

# and convert to numpy
# (vstack/hstack are called without arguments here — TODO supply the sequences to stack)
# note: `labels` is rebound here, replacing the list created in the label-counting cell
cls_activations = np.vstack()
labels = np.hstack()

# confirm the shapes of the two arrays
cls_activations.shape, labels.shape

In [None]:
# extract average activations
# NOTE(review): exercise template — as written acts0 is the whole cls_activations array (no selection or
# averaging yet) and acts1 is blank; presumably one set of values per sentiment label — TODO complete
acts0 = cls_activations
acts1 =

# t-test
# (both arguments left blank)
t = ttest_ind(,)

# the violin plot
plt.figure(figsize=(8,5))
# NOTE(review): violinplot is referenced but not called here; the indexing below needs the object returned by a call
v = plt.violinplot

# change the colors
# (first body light red, second body light green, range/extreme lines black)
v['bodies'][0].set_facecolor([.9,.7,.7])
v['bodies'][1].set_facecolor([.7,.9,.7])
v['cbars'].set_edgecolor('k')
v['cmins'].set_edgecolor('k')
v['cmaxes'].set_edgecolor('k')

# draw all the dots
# (data arguments left blank; the colors match the violin bodies, with alpha .3)
plt.plot(,'.',color=[.9,.7,.7,.3])
plt.plot(,'.',color=[.7,.9,.7,.3])

# and finishing touches
# (dashed zero line; x positions 1 and 2 are labeled Negative and Positive)
plt.axhline(0,linestyle='--',linewidth=.2,color='k')
plt.gca().set(xticks=[1,2],xticklabels=['Negative','Positive'],ylabel='Mean [CLS] Activation',xlim=[.5,2.5])

plt.tight_layout()
plt.savefig('ch5_proj30_part3.png')
plt.show()

# **Part 4: Single-layer decision-tree classifier**

In [None]:
# dimension reduction with pca
# NOTE(review): exercise template — the PCA arguments, the data to transform, and every {} below are left blank
pca = PCA(
act_reduced = pca.fit_transform(

# report the number of components kept, the data sizes before/after, and the observations-to-features ratio
# (each empty {} needs an expression; an empty replacement field is a syntax error in an f-string)
print(f'Kept {} components explaining {} variance.\n')
print(f'Original data size is {}')
print(f' Reduced data size is {}\n')
print(f'Observations:features ratio is {}:{} = {}')

In [None]:
# split the data into train and test
# NOTE(review): exercise template — the blank arguments and expressions below must be completed
# (test_size=.2 holds out 20% of the samples for testing)
X_train,X_test, y_train,y_test = train_test_split(,,test_size=.2,)
dectree = DecisionTreeClassifier(
dectree.fit

# train and test accuracy
# (mean of the element-wise comparison between predictions and labels; comparison target left blank)
train_acc = (dectree.predict(X_train) == ).mean()
test_acc  =

# accuracies are printed as percentages with two decimals
print(f'Accuracies: Train {train_acc:.2%}, Test: {test_acc:.2%}')

In [None]:
# how many times to repeat the random data split
num_reps = 20

# re-initialize as arrays
# (train_acc/test_acc were scalars in the previous cell; here they hold one value per repetition)
train_acc = np.zeros(num_reps)
test_acc = np.zeros(num_reps)

# loop over reps
# NOTE(review): exercise template — the loop header and the blank expressions below must be completed
for i in

  # run the analysis
  X_train,X_test,y_train,y_test = train_test_split
  dectree =
  dectree.fit(

  # calculate and store accuracies
  train_acc[i] =
  test_acc[i]  =


In [None]:
# plot the train and test accuracies
# NOTE(review): exercise template — the data arguments of the plot/axhline calls are left blank
plt.figure(figsize=(10,4))

plt.plot(,label='Train')
plt.plot(,label='Test')

# dashed horizontal reference lines (positions left blank)
plt.axhline(,linestyle='--',color=[.7,.7,.9])
plt.axhline(,linestyle='--',color=[.9,.7,.7])


plt.legend()

plt.tight_layout()
plt.savefig('ch5_proj30_part4a.png')
plt.show()

In [None]:
# feature importances (of final run from previous cell)
# NOTE(review): exercise template — the argsort argument and the bar position/height are left blank
importances = dectree.feature_importances_
# NOTE(review): np.argsort sorts in ascending order by default, while indices[0..9] are plotted as the
# "Top 10"; presumably the completed expression must yield descending importance — TODO confirm
indices = np.argsort(

plt.figure(figsize=(10,3))
plt.title("Feature Importances (Top 10 PCs)")
for i in range(10):
  imp_val = importances[indices[i]]
  # bar color encodes the importance relative to the largest importance
  plt.bar(,,color=plt.cm.plasma(imp_val/importances.max()),edgecolor='k')
# tick labels use 1-based component numbers, hence the +1
plt.xticks(range(10), [f"PC{indices[i]+1}" for i in range(10)], rotation=45)

plt.tight_layout()
plt.savefig('ch5_proj30_part4b.png')
plt.show()

In [None]:
# FYI (not part of the exercise)
# The tree plot shows which feature was used for the split at each node.
# However, this plot isn't really interpretable for PCA data.
import sklearn.tree
plt.figure(figsize=(14,10))
# the trailing semicolon suppresses the notebook's printout of the returned value
sklearn.tree.plot_tree(dectree,fontsize=10);

# **Part 5: Collect [CLS] activations from all layers**

In [None]:
# the number of hidden-states from the model output (transformers + 1)
# (the extra hidden state is the output of the embeddings layer)
num_layers = model.config.num_hidden_layers + 1

# list to store all the activations
# (one zero-initialized array of size sample_size x hidden_size per hidden state)
hs_acts = [np.zeros((sample_size,model.config.hidden_size)) for _ in range(num_layers)]

# loop over all batches
# NOTE(review): exercise template — the model call, the slice, and its right-hand side are left blank
# idx advances by batchsize after each batch (presumably the row offset into each hs_acts array — TODO confirm)
idx = 0
for batchi in tqdm(range(num_batch)):

  # run the model
  outputs = model

  # extract the CLS activation from the hidden states
  for hsi,hs in enumerate(outputs.hidden_states):
    hs_acts[hsi][:+] =
  idx += batchsize


In [None]:
# number of stored hidden states and the shape of one of them
# (`hsi` is the loop variable left over from the collection loop in the previous cell)
len(hs_acts), hs_acts[hsi].shape

In [None]:
# visualize the vector averages
# NOTE(review): exercise template — the x/y arguments of both plot calls are left blank
plt.figure(figsize=(10,4))

for hsi in range(len(hs_acts)):
  # two marker series per layer: circles with a light-red face and squares with a light-green face
  # (these colors match the Negative/Positive violins earlier — presumably the same label mapping; TODO confirm)
  plt.plot(,,'ko',markerfacecolor=[.9,.7,.7,.3],linewidth=0)
  plt.plot(,,'ks',markerfacecolor=[.7,.9,.7,.3],linewidth=.1)


plt.gca().set(xlabel='Hidden layer',ylabel='CLS activation (mean)')

plt.tight_layout()
plt.savefig('ch5_proj30_part5a.png')
plt.show()

# **Part 6: Laminar profile of classification accuracy**

In [None]:
# main analysis
# NOTE(review): exercise template — the blank expressions below must be completed before this cell runs

# initializations
# (one row per hidden state; the two columns are filled below as accuracies[layeri,0] and accuracies[layeri,1])
accuracies = np.zeros((num_layers,2))

# per-repetition accuracies; these arrays are overwritten for every layer
num_reps = 5
train_acc = np.zeros(num_reps)
test_acc = np.zeros(num_reps)


# loop over layers
for layeri in range(num_layers):


  # loop over repetitions for stability
  for i in range(num_reps):

    # 1) split the data
    X_train,X_test,y_train,y_test =

    # 2) fit the PCA on the train data
    pca =
    X_train_pca =

    # 3) apply the PCA transform to the test set
    X_test_pca  =

    # 4) fit the decision-tree model
    dectree =
    dectree.fit

    # train and test accuracy
    train_acc[i] =
    test_acc[i]  =

  # average accuracies for this layer
  accuracies[layeri,0] =
  accuracies[layeri,1] =

  # each empty {} needs an expression (an empty replacement field is a syntax error in an f-string)
  print(f'Finished layer {}/{} with {} test accuracy.')

In [None]:
# plot train and test accuracy for every hidden-state layer
# NOTE(review): exercise template — the data arguments of both plot calls are left blank
plt.figure(figsize=(10,4))
plt.plot(,'ks-',linewidth=.2,markerfacecolor=[.7,.9,.7],markersize=12,label='Train')
plt.plot(,'ko-',linewidth=.2,markerfacecolor=[.7,.7,.9],markersize=12,label='Test')

plt.legend()
plt.gca().set(xlabel='Hidden state layer',ylabel='Accuracy')

plt.tight_layout()
# NOTE(review): the filename says "part5b" although this cell belongs to Part 6 — confirm this is intended
plt.savefig('ch5_proj30_part5b.png')
plt.show()

# **Part 7: Performance benefit per transformer**

In [None]:
# create the predictor variable (IV)
# NOTE(review): exercise template — the arange call is unclosed and the expressions below are left blank
predictor = np.arange(1,num_layers

# fit the model
reg = LinearRegression(
# print the intercept (beta_0) and slope (beta_1) as percentages; the values to format are left blank
print(f'beta_0: {:6.3%}')
print(f'beta_1: {:7.3%}')

# calculate predicted accuracies
# (predict is called without its input here — TODO supply the predictor)
yHat = reg.predict()

In [None]:
# and visualize
# NOTE(review): exercise template — the x/y arguments of both plot calls are left blank
plt.figure(figsize=(10,4))
plt.plot(,,'k',label='Linear fit')
plt.plot(,,'ko-',linewidth=.2,markerfacecolor=[.7,.7,.9],markersize=12,label='Data')

# the title reports the fitted slope (reg.coef_[0]) as a percentage
plt.gca().set(xlabel='Number of transformers',ylabel='Accuracy',
              title=f'Increase of {reg.coef_[0]:.2%} accuracy per transformer')
plt.legend(loc='upper left')

plt.tight_layout()
# NOTE(review): the filename says "part6" although this cell belongs to Part 7 — confirm this is intended
plt.savefig('ch5_proj30_part6.png')
plt.show()

In [None]:
# compare the regression-based and the empirically calculated accuracy boost per transformer
# NOTE(review): exercise template — the values to print are left blank (empty {} is a syntax error in an f-string)
print(f'Regression-predicted boost per transformer  : {:.4%}')
print(f'Empirically calculated boost per transformer: {}')