**TRAINING SECTION -- YOU ONLY NEED TO RUN IF YOU DONT HAVE MODEL FOLDER**

CHANGE ALL PATHS FIRST: CHANGE DATA READING PATH AND CHANGE DATA SAVING PATH

In [2]:
import pandas as pd
import transformers, torch

df = pd.read_csv('/Users/raasikh/Documents/Coding/ai.xperience/carlos-artifact-tagging-bias/carlos_data/clean_data_annotated_v2.csv', encoding='latin1')
df = df.head(101)

print(df.head)

descs = df['TextEntry'].tolist()
bias_clas = df[['Subjective', 'Gender', 'Jargon', 'Social']].values

print(descs[0])

print(bias_clas)

tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the texts
inputs = tokenizer(descs, padding=True, truncation=True, return_tensors="pt", max_length=512)

<bound method NDFrame.head of      ObjectID                               Title  \
0           0  Engaged Corner Capital with Leaves   
1           1                              Trivet   
2           2                               Point   
3           3                    A Refugee Family   
4           6        Perfume Vessel (Amphoriskos)   
..        ...                                 ...   
96        170                          Copper Ore   
97        171         Three Pages from a Bestiary   
98        173                                Ball   
99        175       Bell with Demotic Inscription   
100       176                    Tanagra Figurine   

                                             TextEntry  Subjective  Gender  \
0                  No provenance information in files.           0       0   
1    "The Carlos Museum's collection of ancient Ame...           0       0   
2    Codes used specifically to disguise the conten...           0       0   
3    Inhouse exhibiti

In [3]:
from torch.utils.data import Dataset, DataLoader

class BiasDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.inputs.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

dataset = BiasDataset(inputs, bias_clas)

dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

model = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)
model.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [4]:
from transformers import AdamW
from torch.nn import BCEWithLogitsLoss

optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = BCEWithLogitsLoss()

# Training loop
epochs = 3
for epoch in range(epochs):
    total_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(input_ids=batch['input_ids'],
                        attention_mask=batch['attention_mask'],
                        labels=batch['labels'])
        
        loss = outputs.loss
        total_loss += loss.item()
        
        # Backward pass
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(dataloader)}")



Epoch 1, Loss: 0.6035284570285252
Epoch 2, Loss: 0.5183091163635254
Epoch 3, Loss: 0.4555529398577554


In [5]:
model.save_pretrained('/Users/raasikh/Documents/Coding/ai.xperience/carlos-artifact-tagging-bias/bert/model')
tokenizer.save_pretrained('/Users/raasikh/Documents/Coding/ai.xperience/carlos-artifact-tagging-bias/bert/model')

('/Users/raasikh/Documents/Coding/ai.xperience/carlos-artifact-tagging-bias/bert/model/tokenizer_config.json',
 '/Users/raasikh/Documents/Coding/ai.xperience/carlos-artifact-tagging-bias/bert/model/special_tokens_map.json',
 '/Users/raasikh/Documents/Coding/ai.xperience/carlos-artifact-tagging-bias/bert/model/vocab.txt',
 '/Users/raasikh/Documents/Coding/ai.xperience/carlos-artifact-tagging-bias/bert/model/added_tokens.json')

**PREDICTION**

Run the first code block only if you have the model folder and have NOT done training above

In [7]:
import transformers

# CHANGE TO YOUR PATH OF MODEL FOLDER
tokenizer = transformers.BertTokenizer.from_pretrained('/Users/raasikh/Documents/Coding/ai.xperience/carlos-artifact-tagging-bias/bert/model')
model = transformers.BertForSequenceClassification.from_pretrained('/Users/raasikh/Documents/Coding/ai.xperience/carlos-artifact-tagging-bias/bert/model')


In [13]:
model.eval()

# Whatever you want it to predict
validation_texts = ["Harsh, uncomfortably brilliant, usually reflected light.  W. April 1993 descriptor moved.", 
                    "December 1992 lead-in term added. January 1991 alternate term added. Object fumigated in Orkin's Piedmont vault with Vikane in 1994"]

# Tokenize validation texts
validation_inputs = tokenizer(validation_texts, padding=True, truncation=True, return_tensors="pt", max_length=512)

with torch.no_grad():
    outputs = model(**validation_inputs)
    logits = outputs.logits
    predictions = torch.sigmoid(logits)

# Print predictions
print(predictions)


tensor([[0.2832, 0.2683, 0.5392, 0.2065],
        [0.2868, 0.2462, 0.5434, 0.2164]])
