In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the raw CSV (path as used in the original Colab session).
dataset = pd.read_csv("/content/a2_test.csv", encoding="utf-8")

# Keep only the two columns used downstream (this is what drops `rewire_id`
# and any other extra column). Selecting the columns directly replaces the
# former row-by-row iterrows() loop; .copy() makes `df` independent of `dataset`.
df = dataset[["text", "label_sexist"]].copy()

# Encode the string labels as integers: 'sexist' -> 1, 'not sexist' -> 0.
label_codes = df["label_sexist"].map({"sexist": 1, "not sexist": 0})

# Series.map yields NaN for every value that is not a key of the dict; fail
# loudly here instead of letting NaN labels flow into training.
if label_codes.isna().any():
    unexpected = df.loc[label_codes.isna(), "label_sexist"].unique().tolist()
    raise ValueError(f"Unexpected label_sexist values: {unexpected}")

df["label_sexist"] = label_codes.astype("int64")
print(df.head())


                                                text  label_sexist
0  The boys must be gaming because there goes the...             0
1  Look at those eyes. Either someone unexpectedl...             1
2                  Old man mogs everyone in this sub             0
3  Excellent, I was just looking at another post ...             0
4  So you run back to daddy whenever you need hel...             1


In [2]:
import spacy

# Load the spaCy "en_core_web_sm" pipeline once; clean_text() below reuses this `nlp` object.
nlp = spacy.load("en_core_web_sm")
# No DataFrame is loaded in this cell: the `df` built in the previous cell is
# cleaned further down, after clean_text() is defined.
# Function to clean text using spaCy
def clean_text(text):
    """Lower-case `text` and keep only its alphabetic tokens.

    The string is tokenized with the module-level spaCy pipeline `nlp`.
    Tokens whose `is_alpha` flag is false (punctuation, numbers, ...) are
    dropped; the surviving tokens are lower-cased and joined with single
    spaces.

    Returns:
        str: the cleaned text (empty when no token survives).
    """
    kept = []
    for tok in nlp(text):
        if tok.is_alpha:
            kept.append(tok.text.lower())
    return " ".join(kept)


# Replace the raw 'text' column with its cleaned version (see clean_text).
df['text'] = df['text'].apply(clean_text)

# Verify cleaned text
print("Cleaned datframe", df.head())

# DataFrame.info() writes its report straight to stdout and returns None, so
# it must not be passed to print() (that produced a stray "DataFrame info None"
# after the report). Print the heading first, then let info() print itself.
print("DataFrame info")
df.info()

Cleaned datframe                                                 text  label_sexist
0  the boys must be gaming because there goes the...             0
1  look at those eyes either someone unexpectedly...             1
2                  old man mogs everyone in this sub             0
3  excellent i was just looking at another post w...             0
4  so you run back to daddy whenever you need hel...             1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   text          300 non-null    object
 1   label_sexist  300 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 4.8+ KB
DataFrame info None


In [3]:

# Hold out 20% of the rows for validation. The fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
(train_texts, val_texts,
 train_labels, val_labels) = train_test_split(
    df["text"],
    df["label_sexist"],
    random_state=42,
    test_size=0.2,
)


In [4]:
import os
from huggingface_hub import login

# SECURITY: a literal Hugging Face access token used to be hardcoded on this
# line. Treat that token as leaked and revoke it in the Hugging Face account
# settings. The token is now read from the HF_TOKEN environment variable; when
# the variable is unset, token=None makes login() prompt for it interactively.
login(token=os.environ.get("HF_TOKEN"))

In [5]:
from transformers import AutoTokenizer
import torch
# Load tokenizer





# Tokenizer for the "distilbert-base-uncased" checkpoint, i.e. the same
# checkpoint name that is later passed to Phi3Mini as base_model_name.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize function
def tokenize_data(texts, labels, max_length=256):
    """Tokenize `texts` with the module-level `tokenizer` and tensorize `labels`.

    Args:
        texts: iterable of strings (e.g. a pandas Series); it is materialized
            with list() before being handed to the tokenizer.
        labels: a pandas Series (or any object exposing a `.values` array), a
            plain sequence of numbers, or an existing torch.Tensor. Accepting
            a tensor keeps the calling cell re-runnable after it has rebound
            its label variables to tensors.
        max_length: length every sequence is truncated / padded to.
            Defaults to 256, the value that was previously hard-coded.

    Returns:
        tuple: (tokenizer output with PyTorch tensors, label tensor).
    """
    encodings = tokenizer(
        list(texts),
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )

    if isinstance(labels, torch.Tensor):
        # Tensor.values is a method, so the `.values` path below must not be
        # taken for tensors; copy to mirror torch.tensor()'s copy semantics.
        label_tensor = labels.detach().clone()
    elif hasattr(labels, "values") and not callable(labels.values):
        label_tensor = torch.tensor(labels.values)
    else:
        label_tensor = torch.tensor(labels)

    return encodings, label_tensor

# Tokenize train and validation data
# NOTE: train_labels / val_labels are rebound here from pandas Series to
# tensors (second element of tokenize_data's return value), so on a re-run of
# this cell tokenize_data receives tensors instead of Series.
train_encodings, train_labels = tokenize_data(train_texts, train_labels)
val_encodings, val_labels = tokenize_data(val_texts, val_labels)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
from torch.utils.data import DataLoader, Dataset

class TextDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    `encodings` must be indexable by the keys "input_ids" and
    "attention_mask"; each of those entries, like `labels`, is indexed by
    sample position.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # One sample per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Slice sample `idx` out of each encoding field, then attach its label.
        sample = {
            field: self.encodings[field][idx]
            for field in ("input_ids", "attention_mask")
        }
        sample["labels"] = self.labels[idx]
        return sample

# Training split: wrap the tensors, then serve them in shuffled batches of 8.
train_dataset = TextDataset(encodings=train_encodings, labels=train_labels)
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)

# Validation split: same batch size, but kept in its original order.
val_dataset = TextDataset(encodings=val_encodings, labels=val_labels)
val_loader = DataLoader(dataset=val_dataset, batch_size=8)


In [7]:
import torch
import torch.nn as nn
from transformers import AutoModel

class Phi3Mini(nn.Module):
    """Binary classifier: a pretrained encoder followed by a small MLP head.

    The encoder is whatever checkpoint `base_model_name` resolves to through
    `AutoModel.from_pretrained`. The head maps the encoder's hidden size to
    `hidden_size`, then to a single unit squashed by a sigmoid, so the module
    outputs probabilities rather than logits.

    Args:
        base_model_name: checkpoint name passed to AutoModel.from_pretrained.
        hidden_size: width of the head's intermediate layer.
        dropout_rate: dropout probability used before and inside the head.
    """

    def __init__(self, base_model_name, hidden_size=128, dropout_rate=0.1):
        super().__init__()
        self.base_model = AutoModel.from_pretrained(base_model_name)
        self.dropout = nn.Dropout(dropout_rate)

        encoder_width = self.base_model.config.hidden_size
        head_layers = [
            nn.Linear(encoder_width, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, 1),
            nn.Sigmoid(),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        """Return the head's sigmoid output for each sequence in the batch."""
        encoded = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 (the [CLS] token) as the
        # summary vector for the whole sequence.
        first_token = encoded.last_hidden_state[:, 0]
        return self.classifier(self.dropout(first_token))


In [12]:
from transformers import AdamW
from sklearn.metrics import accuracy_score

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the classifier on top of the "distilbert-base-uncased" checkpoint and
# move it to the chosen device.
model = Phi3Mini(base_model_name="distilbert-base-uncased")
model = model.to(device)

# BCELoss expects probabilities; the model's head ends in a Sigmoid.
criterion = nn.BCELoss()
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training function
def train_model(model, train_loader, val_loader, optimizer, criterion, epochs=5):
    """Train `model` and print the validation accuracy after every epoch.

    Each batch yielded by the loaders must be a dict with the keys
    "input_ids", "attention_mask" and "labels". Tensors are moved to the
    module-level `device`. The model is expected to return one probability per
    sample (e.g. shape (batch, 1)); predictions are thresholded at 0.5.

    Args:
        model: module called as model(input_ids, attention_mask).
        train_loader: iterable of training batches (must support len()).
        val_loader: iterable of validation batches.
        optimizer: optimizer over the model's parameters.
        criterion: loss called as criterion(probabilities, float_labels).
        epochs: number of passes over `train_loader`.

    Returns:
        None. Progress is reported with print().
    """
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].float().to(device)
            # Flatten (batch, 1) -> (batch,). A bare .squeeze() would also
            # remove the batch dimension when a batch holds a single sample,
            # producing a 0-d tensor whose shape no longer matches `labels`.
            outputs = model(input_ids, attention_mask).reshape(-1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        print(f"Epoch {epoch+1}, Train Loss: {train_loss/len(train_loader):.4f}")

        # Validation
        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].float().to(device)
                # Same flattening as in training: keeps `preds` 1-D so that
                # list.extend() can iterate it even for a batch of one.
                outputs = model(input_ids, attention_mask).reshape(-1)
                preds = (outputs > 0.5).int()
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
        acc = accuracy_score(all_labels, all_preds)
        print(f"Epoch {epoch+1}, Validation Accuracy: {acc:.4f}")

# Train the model
# epochs=10 overrides train_model's default of 5. In the run recorded below,
# the train loss keeps falling to 0.0057 while validation accuracy peaks at
# 0.8000 in epoch 3, which suggests overfitting past that point.
train_model(model, train_loader, val_loader, optimizer, criterion, epochs=10)




Epoch 1, Train Loss: 0.6903
Epoch 1, Validation Accuracy: 0.6500
Epoch 2, Train Loss: 0.5404
Epoch 2, Validation Accuracy: 0.7833
Epoch 3, Train Loss: 0.1864
Epoch 3, Validation Accuracy: 0.8000
Epoch 4, Train Loss: 0.1353
Epoch 4, Validation Accuracy: 0.7167
Epoch 5, Train Loss: 0.0872
Epoch 5, Validation Accuracy: 0.7167
Epoch 6, Train Loss: 0.0301
Epoch 6, Validation Accuracy: 0.7167
Epoch 7, Train Loss: 0.0162
Epoch 7, Validation Accuracy: 0.7500
Epoch 8, Train Loss: 0.0093
Epoch 8, Validation Accuracy: 0.7667
Epoch 9, Train Loss: 0.0072
Epoch 9, Validation Accuracy: 0.7667
Epoch 10, Train Loss: 0.0057
Epoch 10, Validation Accuracy: 0.7667


In [9]:
# Persist only the model's state_dict (not the Phi3Mini object itself) to the
# current working directory.
# NOTE(review): this cell's recorded execution count (9) is lower than the
# training cell's (12), so the saved file may predate the last training run;
# re-run this cell after training if the final weights are wanted.
torch.save(model.state_dict(), "phi3mini_sexist_classifier.pt")


In [13]:
def predict(model, tokenizer, text, max_length=256):
    """Classify a single string as "Sexist" or "Not Sexist".

    The text goes through the same preprocessing as the training data:
    it is cleaned with `clean_text` (the training column was cleaned before
    tokenization) and padded / truncated to `max_length`, whose default of 256
    matches the length used when tokenizing the training set. Tensors are
    moved to the module-level `device`.

    Args:
        model: module called as model(input_ids, attention_mask) that returns
            a single probability for a single input sequence.
        tokenizer: Hugging Face style tokenizer; its output must support
            .to(device) and the keys "input_ids" / "attention_mask".
        text: the raw input string.
        max_length: padding / truncation length passed to the tokenizer.

    Returns:
        str: "Sexist" when the probability exceeds 0.5, otherwise "Not Sexist".
    """
    model.eval()
    cleaned = clean_text(text)
    inputs = tokenizer(
        cleaned,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    ).to(device)
    with torch.no_grad():
        # .item() extracts the single probability as a Python float and raises
        # if the model unexpectedly returned more than one value.
        probability = model(inputs["input_ids"], inputs["attention_mask"]).item()
    return "Sexist" if probability > 0.5 else "Not Sexist"

# Example inference
# NOTE(review): this sentence appears to be row 1 of the loaded dataset (the
# df.head() output shows the same opening words with label 1), so it is
# presumably data the model was trained or validated on rather than an unseen
# example — TODO confirm and add a held-out example.
example_text = "Look at those eyes. Either someone unexpectedly just fisted her up the dumper, or that bitch is certifiably a nutburger.."
print(predict(model, tokenizer, example_text))


Sexist


# New section