In [4]:
#imports
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup)
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score

In [8]:
#Separte columns in datasets a write changes to a new .csv
original_file_path = 'C:\\Desktop\\training\\TRAINING\\training.csv'
df = pd.read_csv(original_file_path, delimiter='\t', encoding='utf-8')
training1 = 'C:\\Desktop\\training\\TRAINING\\training1.csv'
df.to_csv(training1, index=False, sep=',')

original_file_path = 'C:\\Desktop\\test\\Test.csv'
df = pd.read_csv(original_file_path, delimiter='\t', encoding='utf-8')
test1 = 'C:\\Desktop\\test\\test1.csv'
df.to_csv(test1, index=False, sep=',')

# combine test1.csv with test_labels.txt
testing_df = pd.read_csv('C:\\Desktop\\test\\test1.csv')

# Open the text file and read the additional values
additional_values = []
with open('C:\\Desktop\\test_labels.txt', 'r') as file:
    for line in file:
        # Now using '\t' to split since the format is "filename    misogynous    shaming    stereotype    objectification    violence"
        values = line.strip().split('\t')
        # Omit the filename from the values to be added to the DataFrame
        additional_values.append(values[1:])  # Skip the filename

# Convert additional_values to a DataFrame
additional_df = pd.DataFrame(additional_values, columns=['misogynous', 'shaming', 'stereotype', 'objectification', 'violence'])

# Ensure the correct data types, assuming all additional columns should be integers
additional_df = additional_df.astype(int)

# Concatenate the original testing_df with the additional_df
final_df = pd.concat([testing_df, additional_df], axis=1)

# Save the updated dataframe to a new CSV file
final_df.to_csv('C:\\Desktop\\updated_testing.csv', index=False)



In [9]:
# Reads CSV file to pandas DataFrame, extracts texts + labels into lists + returns them. load training + testing data
def load_dataset(file_path):
    df1 = pd.read_csv(file_path)
    texts = df1['Text Transcription'].tolist()
    labels = df1['misogynous'].tolist()
    return texts, labels

#Initializes the BERT tokenizer, which converts text into tokens that can be fed to the BERT model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_texts, train_labels = load_dataset('C:\\Desktop\\training\\TRAINING\\training1.csv')
test_texts, test_labels = load_dataset('C:\\Desktop\\updated_testing.csv') 

In [10]:
 #  tokenizes the list texts, adding necessary padding and truncation, to create a uniform input size. It returns PyTorch tensors.
def tokenize_function(texts):
    return tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")

#Tokenizes the training and testing text data.
train_encodings = tokenize_function(train_texts)
test_encodings = tokenize_function(test_texts)

# Convert labels to tensors
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)


In [11]:
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)
    
train_dataset = TextDataset(train_encodings, train_labels)
test_dataset = TextDataset(test_encodings, test_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * 3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [13]:
# Training loop
model.train()
for epoch in range(3):  # Adjust epochs based on your dataset and needs
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [7]:
# Evaluation
model.eval()
predictions, true_labels = [], []
for batch in test_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    predictions.extend(torch.argmax(logits, dim=-1).tolist())
    true_labels.extend(batch['labels'].tolist())

# Print classification report and accuracy
print(classification_report(true_labels, predictions))
print("Accuracy:", accuracy_score(true_labels, predictions))

# Save the model
model.save_pretrained('your_model_directory')
tokenizer.save_pretrained('your_model_directory')
