In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from datasets import Dataset
import numpy as np

In [2]:
data = pd.read_csv("/kaggle/input/dataset/dataset.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,user_gender,comment
0,0,Female,wow...love it jess
1,1,Female,This is a great picture of u!!!! Beautiful
2,2,Female,Hey boo!! Your G loves you to the moon and the...
3,3,Female,He said he loves you more and to not argue wit...
4,4,Female,I can't wait either! I miss him so very much!


In [3]:
data['user_gender'] = data['user_gender'].map({'Male' : 0, 'Female' : 1})

In [4]:
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

In [5]:
# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

In [6]:
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["comment"], padding="max_length", truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/152083 [00:00<?, ? examples/s]

Map:   0%|          | 0/38021 [00:00<?, ? examples/s]

In [8]:
# Set the format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'user_gender'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'user_gender'])

In [9]:
# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

In [10]:
# Load the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Set up the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_loader) * 3  # 3 epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [12]:
# Training loop
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [13]:
best_val_accuracy = 0

for epoch in range(2):  
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['user_gender'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
        optimizer.step()
        scheduler.step()
    
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Average Training Loss: {avg_train_loss}")

    # Validation loop
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['user_gender'].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)
            
            val_preds.extend(preds.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())
    
    val_accuracy = accuracy_score(val_labels, val_preds)
    print(f"Validation Accuracy: {val_accuracy}")
    
    # Save the best model to avoid overfitting
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        model.save_pretrained("/kaggle/working/bert")
        tokenizer.save_pretrained("/kaggle/working/token")

Epoch 1, Average Training Loss: 0.5197292381140048
Validation Accuracy: 0.7800163067778333
Epoch 2, Average Training Loss: 0.36271893750353407
Validation Accuracy: 0.7999000552326346


In [None]:
from sklearn.metrics import accuracy_score

# Evaluate the model on the validation set
model.eval()  
val_preds, val_labels = [], []

with torch.no_grad(): 
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['user_gender'].to(device)
        
        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        
        # Get predicted labels by finding the index with the highest logit value
        preds = torch.argmax(logits, dim=-1)
        
        # Store predictions and labels
        val_preds.extend(preds.cpu().numpy())
        val_labels.extend(labels.cpu().numpy())

# Calculate accuracy
val_accuracy = accuracy_score(val_labels, val_preds)
print(f"Validation Accuracy: {val_accuracy}")

Validation Accuracy: 0.7999000552326346


In [15]:
def load_model():
    model = BertForSequenceClassification.from_pretrained("/kaggle/working/bert")
    tokenizer = BertTokenizer.from_pretrained("/kaggle/working/token")
    model.to(device)
    model.eval()
    return model, tokenizer

In [16]:
model, tokenizer = load_model()

In [None]:
# Predict gender function 
def predict_party(comment, model, tokenizer, device):
    model.eval()
    encoding = tokenizer(comment, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        prediction = torch.argmax(outputs.logits, dim=1).cpu().item()
    return "Male" if prediction == 0 else "Female"

In [None]:
# Predict gender probability function 
def predict_prob(text, model, tokenizer, device, max_length=128):
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        softmax = torch.nn.Softmax(dim=1)
        probabilities = softmax(outputs.logits).cpu().numpy()
    return probabilities

In [None]:
# Predict instance function 
def predict_instance(text, model, tokenizer, device):
    preds = predict_gender_prob(text, model, tokenizer, device)
    
    return {
        "Male Probability": preds[0][0],  
        "Female Probability": preds[0][1]  
    }

In [20]:
# Example usage
txt = "I've had the 50watter since Oct of last year and I'm still impressed and blown away every time I play it. It's unreal. Killer choice"
predict_party(txt, model, tokenizer, device)

'Male'

In [21]:
txt = "You l've got a good man there hun. Take care of each other and it'll last a long time.\nFor sore throats my dad used to take 2 tablespoons of apple cider vinegar and the same amount of honey, mix it in at least 8oz of hot water. Drink it while ot's still hot, but not burning. Worked every time for me...still does."
predict_party(txt, model, tokenizer, device)

'Female'