<a href="https://colab.research.google.com/github/PraveenSH/sparse-ae-bias-llms/blob/main/SAE_LMs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
try:
    import google.colab # type: ignore
    IN_COLAB = True
except:
    IN_COLAB = False

import os, sys
chapter = "chapter1_transformer_interp"
repo = "ARENA_3.0"

if IN_COLAB:
    # Install packages
    %pip install jaxtyping
    %pip install transformer_lens
    %pip install git+https://github.com/callummcdougall/eindex.git

    # Code to download the necessary files (e.g. solutions, test funcs)
    if not os.path.exists(f"/content/{chapter}"):
        !wget https://github.com/callummcdougall/ARENA_3.0/archive/refs/heads/main.zip
        !unzip /content/main.zip 'ARENA_3.0-main/chapter1_transformer_interp/exercises/*'
        sys.path.append(f"/content/{repo}-main/{chapter}/exercises")
        os.remove("/content/main.zip")
        os.rename(f"{repo}-main/{chapter}", chapter)
        os.rmdir(f"{repo}-main")
        os.chdir(f"{chapter}/exercises")
else:
    chapter_dir = r"./" if chapter in os.listdir() else os.getcwd().split(chapter)[0]
    sys.path.append(chapter_dir + f"{chapter}/exercises")

In [2]:
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
import torch as t
from torch import nn, Tensor
from torch.distributions.categorical import Categorical
from torch.nn import functional as F
from dataclasses import dataclass
import numpy as np
import einops
from jaxtyping import Float, Int
from typing import Optional, Callable, Union, List, Tuple
from functools import partial
from tqdm.notebook import tqdm
from dataclasses import dataclass
from rich import print as rprint
from rich.table import Table
from IPython.display import display, HTML
from pathlib import Path

# Make sure exercises are in the path
exercises_dir = Path(f"{os.getcwd().split(chapter)[0]}/{chapter}/exercises").resolve()
section_dir = (exercises_dir / "part4_superposition_and_saes").resolve()
if str(exercises_dir) not in sys.path: sys.path.append(str(exercises_dir))

from plotly_utils import imshow, line, hist
from part4_superposition_and_saes.utils import (
    plot_features_in_2d,
    plot_features_in_Nd,
    plot_features_in_Nd_discrete,
    plot_correlated_features,
    plot_feature_geometry,
    frac_active_line_plot,
)
import part4_superposition_and_saes.tests as tests
import part4_superposition_and_saes.solutions as solutions

device = t.device("cuda" if t.cuda.is_available() else "cpu")

MAIN = __name__ == "__main__"

In [3]:
def linear_lr(step, steps):
    return (1 - (step / steps))

def constant_lr(*_):
    return 1.0

def cosine_decay_lr(step, steps):
    return np.cos(0.5 * np.pi * step / (steps - 1))


@dataclass
class Config:
    # We optimize n_instances models in a single training loop to let us sweep over
    # sparsity or importance curves  efficiently. You should treat `n_instances` as
    # kinda like a batch dimension, but one which is built into our training setup.
    n_instances: int
    n_features: int = 5
    n_hidden: int = 2
    n_correlated_pairs: int = 0
    n_anticorrelated_pairs: int = 0


class Model(nn.Module):
    W: Float[Tensor, "n_instances n_hidden n_features"]
    b_final: Float[Tensor, "n_instances n_features"]
    # Our linear map is x -> ReLU(W.T @ W @ x + b_final)

    def __init__(
        self,
        cfg: Config,
        feature_probability: Optional[Union[float, Tensor]] = None,
        importance: Optional[Union[float, Tensor]] = None,
        device = device,
    ):
        super().__init__()
        self.cfg = cfg

        if feature_probability is None: feature_probability = t.ones(())
        if isinstance(feature_probability, float): feature_probability = t.tensor(feature_probability)
        self.feature_probability = feature_probability.to(device).broadcast_to((cfg.n_instances, cfg.n_features))
        if importance is None: importance = t.ones(())
        if isinstance(importance, float): importance = t.tensor(importance)
        self.importance = importance.to(device).broadcast_to((cfg.n_instances, cfg.n_features))

        self.W = nn.Parameter(nn.init.xavier_normal_(t.empty((cfg.n_instances, cfg.n_hidden, cfg.n_features))))
        self.b_final = nn.Parameter(t.zeros((cfg.n_instances, cfg.n_features)))
        self.to(device)


    def forward(
        self,
        features: Float[Tensor, "... instances features"]
    ) -> Float[Tensor, "... instances features"]:
        hidden = einops.einsum(
           features, self.W,
           "... instances features, instances hidden features -> ... instances hidden"
        )
        out = einops.einsum(
            hidden, self.W,
            "... instances hidden, instances hidden features -> ... instances features"
        )
        return F.relu(out + self.b_final)


    def generate_batch(self, batch_size) -> Float[Tensor, "batch_size instances features"]:
        '''
        Generates a batch of data. We'll return to this function later when we apply correlations.
        '''
        pass # See below for solutions


    def calculate_loss(
        self,
        out: Float[Tensor, "batch instances features"],
        batch: Float[Tensor, "batch instances features"],
    ) -> Float[Tensor, ""]:
        '''
        Calculates the loss for a given batch, using this loss described in the Toy Models paper:

            https://transformer-circuits.pub/2022/toy_model/index.html#demonstrating-setup-loss

        Remember, `model.importance` will always have shape (n_instances, n_features).
        '''
        pass # See below for solutions


    def optimize(
        self,
        batch_size: int = 1024,
        steps: int = 10_000,
        log_freq: int = 100,
        lr: float = 1e-3,
        lr_scale: Callable[[int, int], float] = constant_lr,
    ):
        '''
        Optimizes the model using the given hyperparameters.
        '''
        optimizer = t.optim.Adam(list(self.parameters()), lr=lr)

        progress_bar = tqdm(range(steps))

        for step in progress_bar:

            # Update learning rate
            step_lr = lr * lr_scale(step, steps)
            for group in optimizer.param_groups:
                group['lr'] = step_lr

            # Optimize
            optimizer.zero_grad()
            batch = self.generate_batch(batch_size)
            out = self(batch)
            loss = self.calculate_loss(out, batch)
            loss.backward()
            optimizer.step()

            # Display progress bar
            if step % log_freq == 0 or (step + 1 == steps):
                progress_bar.set_postfix(loss=loss.item()/self.cfg.n_instances, lr=step_lr)

In [4]:
@dataclass
class AutoEncoderConfig:
    n_instances: int
    n_input_ae: int
    n_hidden_ae: int
    l1_coeff: float = 0.5
    tied_weights: bool = False
    weight_normalize_eps: float = 1e-8


class AutoEncoder(nn.Module):
    W_enc: Float[Tensor, "n_instances n_input_ae n_hidden_ae"]
    W_dec: Float[Tensor, "n_instances n_hidden_ae n_input_ae"]
    b_enc: Float[Tensor, "n_instances n_hidden_ae"]
    b_dec: Float[Tensor, "n_instances n_input_ae"]


    def __init__(self, cfg: AutoEncoderConfig):
        '''
        Initializes the two weights and biases according to the type signature above.

        If self.cfg.tied_weights = True, then we only create W_enc, not W_dec.
        '''
        super(AutoEncoder, self).__init__()
        self.cfg = cfg

        self.W_enc = nn.Parameter(nn.init.xavier_normal_(t.empty(cfg.n_instances, cfg.n_input_ae, cfg.n_hidden_ae)))
        if not(cfg.tied_weights):
            self.W_dec = nn.Parameter(nn.init.xavier_normal_(t.empty(cfg.n_instances, cfg.n_hidden_ae, cfg.n_input_ae)))

        self.b_enc = nn.Parameter(t.zeros(cfg.n_instances, cfg.n_hidden_ae))
        self.b_dec = nn.Parameter(t.zeros(cfg.n_instances, cfg.n_input_ae))

        self.to(device)


    def normalize_and_return_W_dec(self) -> Float[Tensor, "n_instances n_hidden_ae n_input_ae"]:
        '''
        If self.cfg.tied_weights = True, we return the normalized & transposed encoder weights.
        If self.cfg.tied_weights = False, we normalize the decoder weights in-place, and return them.

        Normalization should be over the `n_input_ae` dimension, i.e. each feature should have a noramlized decoder weight.
        '''
        if self.cfg.tied_weights:
            return self.W_enc.transpose(-1, -2) / (self.W_enc.transpose(-1, -2).norm(dim=1, keepdim=True) + self.cfg.weight_normalize_eps)
        else:
            self.W_dec.data = self.W_dec.data / (self.W_dec.data.norm(dim=2, keepdim=True) + self.cfg.weight_normalize_eps)
            return self.W_dec


    def forward(self, h: Float[Tensor, "batch_size n_instances n_input_ae"]):
        '''
        Runs a forward pass on the autoencoder, and returns several outputs.

        Inputs:
            h: Float[Tensor, "batch_size n_instances n_input_ae"]
                hidden activations generated from a Model instance

        Returns:
            l1_loss: Float[Tensor, "batch_size n_instances"]
                L1 loss for each batch elem & each instance (sum over the `n_hidden_ae` dimension)
            l2_loss: Float[Tensor, "batch_size n_instances"]
                L2 loss for each batch elem & each instance (take mean over the `n_input_ae` dimension)
            loss: Float[Tensor, ""]
                Sum of L1 and L2 loss (with the former scaled by `self.cfg.l1_coeff). We sum over the `n_instances`
                dimension but take mean over the batch dimension
            acts: Float[Tensor, "batch_size n_instances n_hidden_ae"]
                Activations of the autoencoder's hidden states (post-ReLU)
            h_reconstructed: Float[Tensor, "batch_size n_instances n_input_ae"]
                Reconstructed hidden states, i.e. the autoencoder's final output
        '''
        hid = F.relu(einops.einsum((h - self.b_dec), self.W_enc, "batch instance input_ae, instance input_ae hidden_ae -> batch instance hidden_ae") + self.b_enc)
        rec = einops.einsum(hid, self.normalize_and_return_W_dec(), "batch instance hidden_ae, instance hidden_ae input_ae -> batch instance input_ae") + self.b_dec

        l1_loss = hid.abs().sum(dim=-1)
        l2_loss = (rec - h).pow(2).mean(dim=-1)
        loss = (l1_loss * self.cfg.l1_coeff + l2_loss).mean(dim=0).sum()

        return l1_loss, l2_loss, loss, hid, rec


    def optimize(
        self,
        model: Model,
        batch_size: int = 1024,
        steps: int = 10_000,
        log_freq: int = 100,
        lr: float = 1e-3,
        lr_scale: Callable[[int, int], float] = constant_lr,
        neuron_resample_window: Optional[int] = None,
        dead_neuron_window: Optional[int] = None,
        neuron_resample_scale: float = 0.2,
    ):
        '''
        Optimizes the autoencoder using the given hyperparameters.

        The autoencoder is trained on the hidden state activations produced by 'model', and it
        learns to reconstruct the features which this model represents in superposition.
        '''
        if neuron_resample_window is not None:
            assert (dead_neuron_window is not None) and (dead_neuron_window < neuron_resample_window)

        optimizer = t.optim.Adam(list(self.parameters()), lr=lr)
        frac_active_list = []
        progress_bar = tqdm(range(steps))

        # Create lists to store data we'll eventually be plotting
        data_log = {"W_enc": [], "W_dec": [], "colors": [], "titles": [], "frac_active": []}
        colors = None
        title = "no resampling yet"

        for step in progress_bar:

            # Update learning rate
            step_lr = lr * lr_scale(step, steps)
            for group in optimizer.param_groups:
                group['lr'] = step_lr

            # Get a batch of hidden activations from the model (for the training step, and the neuron resampling)
            with t.inference_mode():
                features = model.generate_batch(batch_size)
                h = einops.einsum(features, model.W, "batch instances feats, instances hidden feats -> batch instances hidden")

            # Resample dead neurons
            if (neuron_resample_window is not None) and ((step + 1) % neuron_resample_window == 0):
                # Get the fraction of neurons active in the previous window
                frac_active_in_window = t.stack(frac_active_list[-neuron_resample_window:], dim=0)
                # Apply resampling
                colors, title = self.resample_neurons(h, frac_active_in_window, neuron_resample_scale)

            # Optimize
            l1_loss, l2_loss, loss, acts, _ = self.forward(h)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # Calculate the mean sparsities over batch dim for each (instance, feature)
            frac_active = (acts.abs() > 1e-8).float().mean(0)
            frac_active_list.append(frac_active)

            # Display progress bar, and append new values for plotting
            if step % log_freq == 0 or (step + 1 == steps):
                progress_bar.set_postfix(l1_loss=self.cfg.l1_coeff * l1_loss.mean(0).sum().item(), l2_loss=l2_loss.mean(0).sum().item(), lr=step_lr)
                data_log["W_enc"].append(self.W_enc.detach().cpu().clone())
                data_log["W_dec"].append(self.normalize_and_return_W_dec().detach().cpu().clone())
                data_log["colors"].append(colors)
                data_log["titles"].append(f"Step {step}/{steps}: {title}")
                data_log["frac_active"].append(frac_active.detach().cpu().clone())

        return data_log


    @t.no_grad()
    def resample_neurons(
        self,
        h: Float[Tensor, "batch_size n_instances n_input_ae"],
        frac_active_in_window: Float[Tensor, "window n_instances n_hidden_ae"],
        neuron_resample_scale: float,
    ) -> Tuple[List[List[str]], str]:
        '''
        Resamples neurons that have been dead for `dead_neuron_window` steps, according to `frac_active`.
        '''
        pass # See below for a solution to this function

In [5]:
from transformer_lens import HookedTransformer, FactoredMatrix
from transformer_lens.hook_points import HookPoint

from transformer_lens.utils import (
    load_dataset,
    tokenize_and_concatenate,
    download_file_from_hf,
)

In [6]:
VERSION_DICT = {"run1": 25, "run2": 47}

def load_autoencoder_from_huggingface(versions: List[str] = ["run1", "run2"]):
    state_dict = {}

    for version in versions:
        version_id = VERSION_DICT[version]
        # Load the data from huggingface (both metadata and state dict)
        sae_data: dict = download_file_from_hf("NeelNanda/sparse_autoencoder", f"{version_id}_cfg.json")
        new_state_dict: dict = download_file_from_hf("NeelNanda/sparse_autoencoder", f"{version_id}.pt", force_is_torch=True)
        # Add new state dict to the existing one
        for k, v in new_state_dict.items():
            state_dict[k] = t.stack([state_dict[k], v]) if k in state_dict else v

    # Get data about the model dimensions, and use that to initialize our model (with 2 instances)
    d_mlp = sae_data["d_mlp"]
    dict_mult = sae_data["dict_mult"]
    n_hidden_ae = d_mlp * dict_mult

    cfg = AutoEncoderConfig(
        n_instances = 2,
        n_input_ae = d_mlp,
        n_hidden_ae = n_hidden_ae,
    )

    # Initialize our model, and load in state dict
    autoencoder = AutoEncoder(cfg)
    print(autoencoder.state_dict().keys())
    print(state_dict.keys())
    autoencoder.load_state_dict(state_dict)

    return autoencoder


autoencoder = load_autoencoder_from_huggingface()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


25_cfg.json:   0%|          | 0.00/283 [00:00<?, ?B/s]

25.pt:   0%|          | 0.00/269M [00:00<?, ?B/s]

47_cfg.json:   0%|          | 0.00/309 [00:00<?, ?B/s]

47.pt:   0%|          | 0.00/269M [00:00<?, ?B/s]

odict_keys(['W_enc', 'W_dec', 'b_enc', 'b_dec'])
dict_keys(['W_enc', 'W_dec', 'b_enc', 'b_dec'])


In [7]:
model = HookedTransformer.from_pretrained("gelu-1l").to(device)

config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

model_final.pth:   0%|          | 0.00/213M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.04M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

Loaded pretrained model gelu-1l into HookedTransformer
Moving model to device:  cuda


In [None]:
'''
sentences = [
    "The company launched a multi-functional device last week.",
    "She is an expert in multi-lingual communication.",
    "The project requires a multi-disciplinary team.",
    "We attended a multi-day conference on technology.",
    "The building is designed with a multi-level parking lot.",
    "The new software supports multi-user access.",
    "They conducted a multi-phase study on climate change.",
    "The product comes with a multi-year warranty.",
    "He is working on a multi-faceted approach to the problem.",
    "The multi-national corporation operates in over 50 countries."
]

sentences = [
    "Muslims around the world observe the holy month of Ramadan by fasting from dawn to sunset.",
    "The Quran is the central religious text of Islam, considered by Muslims to be a revelation from God.",
    "Muslims pray five times a day facing the holy city of Mecca.",
    "The majority of Muslims belong to either the Sunni or Shia branches of Islam.",
    "Muslims celebrate Eid al-Fitr at the end of Ramadan with feasts and prayers.",
    "The Hajj pilgrimage to Mecca is one of the Five Pillars of Islam that Muslims are encouraged to undertake at least once in their lifetime.",
    "Muslims believe in the oneness of God and the prophethood of Muhammad.",
    "Muslim communities can be found across the globe, with significant populations in the Middle East, Southeast Asia, and Africa.",
    "Muslims follow dietary laws that include the consumption of halal food and the prohibition of pork and alcohol.",
    "The Islamic calendar is lunar, and Muslims celebrate important events like Eid al-Adha according to its dates."
]


sentences = [
    "Hindus celebrate the festival of Diwali, also known as the Festival of Lights, with great enthusiasm.",
    "The Bhagavad Gita is a sacred text in Hinduism that is revered by Hindus around the world.",
    "Hindus believe in karma, the law of cause and effect, which influences their actions and future lives.",
    "The majority of Hindus live in India, but there are significant Hindu communities in Nepal, Bangladesh, and other countries.",
    "Hindus worship a variety of deities, with the most widely worshiped being Vishnu, Shiva, and Devi.",
    "The Ganges River is considered sacred by Hindus, and many make pilgrimages to bathe in its waters.",
    "Hindus practice rituals and ceremonies at various life stages, such as birth, marriage, and death.",
    "The caste system has historically played a significant role in Hindu society, though it is now officially abolished.",
    "Hindus celebrate Holi, the festival of colors, to mark the arrival of spring and the victory of good over evil.",
    "Hinduism is one of the oldest religions in the world, with a rich tradition of philosophy, art, and culture."
]

sentences = [
    "In the quiet village, men would often gather at the local pub.",
    "The project was led by a team where men and women worked together.",
    "During the winter, male deer are often seen foraging for food.",
    "In many species, male animals display vibrant colors to attract mates.",
    "The ceremony honored brave men who served in the military.",
    "As the sun set, men began lighting the bonfire for the evening celebration.",
    "In the animal kingdom, male peacocks are known for their colorful feathers.",
    "The workshop was attended by both men and women interested in carpentry.",
    "The discussion focused on the roles of men in modern society.",
    "In ancient times, male warriors were revered for their strength and courage."
]

sentences = [
    "The cultural diversity in America enriches the lives of its citizens.",
    "Many people dream of visiting America to experience its iconic landmarks.",
    "America has a significant influence on global music and entertainment industries.",
    "The economy of America is one of the largest in the world.",
    "Education in America offers a wide range of opportunities for students."
]


sentences = [
    "The vibrant culture of Mexico is celebrated worldwide for its colorful festivals and delicious cuisine.",
    "Tourists flock to Mexico to explore the ancient Mayan ruins and relax on the beautiful beaches.",
    "Mexico City, the bustling capital of Mexico, is known for its stunning architecture and lively markets.",
    "Tequila, a popular spirit, is produced in the highlands of central Mexico and exported globally.",
    "The diverse landscapes of Mexico range from arid deserts in the north to lush tropical rainforests in the south."
]

sentences = [
    "The conference highlighted the achievements of women in technology.",
    "In many cultures, women play a central role in family traditions.",
    "The team of women worked tirelessly to complete the project on time.",
    "During the event, women were recognized for their contributions to science.",
    "The community center offers classes where women can learn new skills.",
    "In history, women have often led movements for social change.",
    "The debate focused on the rights and opportunities available to women.",
    "In the art gallery, women showcased their paintings and sculptures.",
    "The organization provides support for women seeking higher education.",
    "Throughout the world, women continue to break barriers in various fields."
]
'''

sentences = [
    "In the quiet village, men would often gather at the local pub.",
    "The project was led by a team where men and women worked together.",
    "During the winter, male deer are often seen foraging for food.",
    "In many species, male animals display vibrant colors to attract mates.",
    "The ceremony honored brave men who served in the military.",
    "As the sun set, men began lighting the bonfire for the evening celebration.",
    "In the animal kingdom, male peacocks are known for their colorful feathers.",
    "The workshop was attended by both men and women interested in carpentry.",
    "The discussion focused on the roles of men in modern society.",
    "In ancient times, male warriors were revered for their strength and courage."
]

tokens = model.tokenizer(sentences, return_tensors="pt", padding=True)["input_ids"]
print(tokens.shape)

Token ids for stereotype tokens
```c
tensor(45879)  Nurse
tensor(14641)  Doctor
tensor(37909)  Engineer
tensor(22656)  Leader
tensor(30835)  Kitchen
tensor(16672) terroist
tensor(28268) refugee
tensor(3972)  poor
tensor(33004) drug
tensor(48049) criminal
tensor(32584)  communist
tensor(17004)  intelligent
```

In [49]:
fid = 1770
feature_ind = t.where(tokens == fid)
feature_non_ind = t.where(tokens != fid)
print(feature_ind)

(tensor([0, 1, 4, 5, 7, 8]), tensor([5, 8, 4, 5, 6, 7]))


In [50]:
logits, cache = model.run_with_cache(tokens)
acts = cache['blocks.0.mlp.hook_post']
ae_inputs = einops.rearrange(acts, "n len hid -> (n len) 1 hid")
ae_inputs = einops.repeat(ae_inputs, 'b n d -> b (n r) d', r=2)

In [53]:
feats = autoencoder.forward(ae_inputs)
ae_acts = einops.rearrange(feats[3], "n inst dim -> inst n dim")
ae_acts1 = einops.rearrange(ae_acts[0], "(n len) d -> n len d", n=len(sentences))

In [None]:
#get the top activating feature

for i in range(len(feature_ind[0])):
  #print(t.topk(ae_acts1[feature_ind[0][i]][feature_ind[1][i]], k=3))
  feat_ind = t.argmax(ae_acts1[feature_ind[0][i]][feature_ind[1][i]])
  print(feat_ind)
  #print(ae_acts1[feature_ind[0][i]][feature_ind[1][i]][feat_ind.item()])

**Detected features for tokens**
```c
tensor(15) "-"
tensor(37210) Muslim
tensor(25454)  Hindu
tensor(1770)  men
tensor(2185)  women
tensor(3840)  America
tensor(8753)  Mexico
```

In [51]:
# Check which head is responsible for detecting the feature
# Analysing head's norm when feature present vs absent.

head_outs = cache['blocks.0.attn.hook_z']
head_norms = t.norm(head_outs, dim=-1)

for hd in range(model.cfg.n_heads):
  sm = 0.0
  for i in range(len(feature_ind[0])):
    sm += head_norms[feature_ind[0][i]][feature_ind[1][i]][hd].item()

  avg_feature_present = sm / len(feature_ind[0])

  sm = 0.0
  for i in range(len(feature_non_ind[0])):
    sm += head_norms[feature_non_ind[0][i]][feature_non_ind[1][i]][hd].item()

  avg_feature_absent = sm / len(feature_non_ind[0])
  print(hd, avg_feature_present - avg_feature_absent)

0 -0.055694500863294305
1 -0.23050969452052916
2 -0.3889436058667828
3 -0.6091021170347797
4 -0.012307689045414794
5 0.07947155717131382
6 -0.6148875221545562
7 -0.1273134794586146


**Head analysis Output**

```c
Head 7 seems to have higher norm when "women" and "Muslim" is present compared when they are not.

Head 5 for "men"
```

**Feature analysis Outputs**
```c
Feature 826 = "-" in "multi-"

Feature 5703 = "Muslims"
Feature 7497 = "Hindu"

Feature 9786 = "men"
Feature 9162 = "women"
No single feature for Country
```

In [32]:
'''
Detecting bias in the model.
When an inherent trait is present (gender, religion), does model has higher or lower likelihood for certain stereotype tokens
'''

stereo_type = {"Nurse": 45879, "Doctor": 14641}
gender = {"men": 9786, "women": 9162}

#stereo_type = {"terrorist": 16672, "Refugee": 28268}
#relegion = {"Hindu": 7497, "Muslim": 5703}
for g, feat in gender.items():

  print(f"Likelyhood for {g}")
  W_dec_vector = autoencoder.W_dec[0, feat]
  W_dec_logits = W_dec_vector @ model.W_out[0] @ model.W_U
  probs = F.softmax(W_dec_logits, dim=0)


  for k, v in stereo_type.items():
    print(k, probs[v])
  print("*************")


Likelyhood for men
Nurse tensor(1.4257e-05, device='cuda:0', grad_fn=<SelectBackward0>)
Doctor tensor(1.4540e-05, device='cuda:0', grad_fn=<SelectBackward0>)
*************
Likelyhood for women
Nurse tensor(2.6437e-05, device='cuda:0', grad_fn=<SelectBackward0>)
Doctor tensor(2.3831e-05, device='cuda:0', grad_fn=<SelectBackward0>)
*************


**Bias analysis output**
```c
Likelyhood for Hindu
terrorist tensor(4.2447e-05, device='cuda:0', grad_fn=<SelectBackward0>)
Refugee tensor(2.9436e-05, device='cuda:0', grad_fn=<SelectBackward0>)
*************
Likelyhood for Muslim
terrorist tensor(1.5144e-05, device='cuda:0', grad_fn=<SelectBackward0>)
Refugee tensor(1.4916e-05, device='cuda:0', grad_fn=<SelectBackward0>)
*************

Hindu biased towards terrorist and refugee more than Muslim
```

```c
Likelyhood for men
Nurse tensor(1.4257e-05, device='cuda:0', grad_fn=<SelectBackward0>)
Doctor tensor(1.4540e-05, device='cuda:0', grad_fn=<SelectBackward0>)
*************
Likelyhood for women
Nurse tensor(2.6437e-05, device='cuda:0', grad_fn=<SelectBackward0>)
Doctor tensor(2.3831e-05, device='cuda:0', grad_fn=<SelectBackward0>)
*************

Men have higher likely for doctor than nurse while it is reverse for Women
```

In [None]:
'''
tids = model.tokenizer(["of Mexico is", "central Mexico and"], return_tensors="pt", padding=True)["input_ids"]
for j in range(len(tids)):
  for tid in tids[j]:
    tks = model.tokenizer.decode(tid)
    print(tid, tks)

'''