In [None]:
pip install transformers

In [None]:
pip install --upgrade jupyterlab

In [None]:
pip install --upgrade ipywidgets

In [None]:
import os

import kagglehub
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification

In [None]:

# Download the Kaggle dataset and read the resume sheet from the directory
# that kagglehub returns.
dataset_dir = kagglehub.dataset_download('samdeeplearning/deepnlp')
df = pd.read_csv(os.path.join(dataset_dir, 'Sheet_2.csv'), encoding='latin_1')

# Encode the textual class as an integer target: 1 = flagged, 0 = not_flagged.
df['label'] = df['class'].map({'flagged': 1, 'not_flagged': 0})

# Values outside the mapping become NaN and would only fail much later, when
# the label is converted to a tensor; surface the problem here instead.
unmapped_rows = df['label'].isna()
if unmapped_rows.any():
    raise ValueError(
        f"Unexpected values in 'class' column: {sorted(df.loc[unmapped_rows, 'class'].astype(str).unique())}"
    )
df['label'] = df['label'].astype(int)

# Stratify so both splits keep the same flagged/not_flagged ratio; the fixed
# random_state keeps the split reproducible across runs.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

In [None]:
# Load the tokenizer that belongs to the uncased BERT-base checkpoint.
tokenizer_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)

In [None]:
class ResumeDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Args:
        df: DataFrame with one example per row. Rows are addressed by
            position (``iloc``), so the index does not need to be contiguous.
        tokenizer: Callable tokenizer invoked as
            ``tokenizer(text, truncation=..., max_length=..., padding=...,
            return_tensors='pt')`` and returning a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors that carry a
            leading batch axis of size 1.
        max_length: Length every sequence is truncated/padded to.
        text_column: Name of the column holding the raw text.
        label_column: Name of the column holding the integer class label.
    """

    def __init__(self, df, tokenizer, max_length=512,
                 text_column='resume_text', label_column='label'):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.text_column = text_column
        self.label_column = label_column

    def __len__(self):
        """Return the number of rows (examples) in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize the row at position ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            output with its leading batch axis removed) and ``'labels'``
            (0-d ``torch.long`` tensor).
        """
        # Fetch the row once instead of doing a separate positional lookup
        # for each column.
        row = self.df.iloc[idx]

        # Call the tokenizer directly; `encode_plus` is deprecated in favour
        # of `__call__`, which accepts the same arguments.
        encoding = self.tokenizer(
            row[self.text_column],
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt',
        )

        # Drop only the leading batch axis. A bare squeeze() would also
        # remove the sequence axis when max_length == 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(int(row[self.label_column]), dtype=torch.long),
        }

In [None]:
# Wrap each split in a tokenizing dataset.
train_dataset = ResumeDataset(train_df, tokenizer)
val_dataset = ResumeDataset(val_df, tokenizer)

# Batch both splits identically; only the training batches are shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=8, shuffle=False)

In [None]:
# Balanced class weights: weight_c = n_samples / (2 * n_c), so a class with
# fewer training rows receives a proportionally larger weight.
train_labels = train_df['label']
num_samples = len(train_df)
num_class_0 = int((train_labels == 0).sum())
num_class_1 = int((train_labels == 1).sum())
weight_class_0 = num_samples / (2.0 * num_class_0)
weight_class_1 = num_samples / (2.0 * num_class_1)

# Keep the weight tensor on CUDA when it is available, otherwise on the CPU.
weights_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class_weights = torch.tensor(
    [weight_class_0, weight_class_1],
    dtype=torch.float,
).to(weights_device)
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

In [None]:
# Report the per-label row counts of each split, then the two loss weights.
for heading, split_df in (
    ("Training set class distribution:", train_df),
    ("\nValidation set class distribution:", val_df),
):
    print(heading)
    print(split_df['label'].value_counts())
print(f"first class weights' scale = {weight_class_0}, second = {weight_class_1}")

In [None]:
# Load pretrained BERT-base (uncased) with a two-label classification head.
model_checkpoint = 'bert-base-uncased'
model = BertForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

In [None]:
# Fine-tuning configuration
NUM_EPOCHS = 50
LEARNING_RATE = 5e-5

# Resolve the device once and move the model before building the optimizer.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# The scheduler is stepped once per optimizer step (i.e. per batch), so T_max
# must be the total number of batches over the whole run. If T_max is smaller
# than the number of scheduler steps, the cosine curve turns around and the
# learning rate climbs back up part-way through training.
total_steps = max(1, NUM_EPOCHS * len(train_loader))
scheduler = CosineAnnealingLR(optimizer, T_max=total_steps)

# Define loss function with class weights
class_weights = torch.tensor([weight_class_0, weight_class_1], dtype=torch.float).to(device)
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

# Training loop
for epoch in range(NUM_EPOCHS):
    model.train()
    train_loss = 0.0
    correct = 0
    total = 0

    for batch in train_loader:
        optimizer.zero_grad()

        # Labels are kept out of the model call: the loss comes from the
        # class-weighted `loss_fn`, so the model's own unweighted loss would
        # be computed and then thrown away.
        inputs = {
            'input_ids': batch['input_ids'].to(device),
            'attention_mask': batch['attention_mask'].to(device),
        }
        labels = batch['labels'].to(device)

        outputs = model(**inputs)
        loss = loss_fn(outputs.logits, labels)

        loss.backward()
        optimizer.step()
        # Step after optimizer.step(), once per batch, matching T_max above.
        scheduler.step()

        train_loss += loss.item()
        predicted = torch.argmax(outputs.logits, dim=1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

    # Calculate and print training loss and accuracy
    train_loss /= len(train_loader)
    train_accuracy = correct / total
    # get_last_lr() reports the rate set by the most recent scheduler.step();
    # get_lr() is not meant to be called outside of step().
    print(f'Epoch {epoch+1}, Training Loss: {train_loss:.4f}, Training Accuracy: {train_accuracy:.4f}, Learning Rate: {scheduler.get_last_lr()}')

    # Validation loop
    model.eval()
    correct_val = 0
    total_val = 0

    with torch.no_grad():
        for batch in val_loader:
            inputs = {
                'input_ids': batch['input_ids'].to(device),
                'attention_mask': batch['attention_mask'].to(device),
            }
            labels = batch['labels'].to(device)

            outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=1)
            correct_val += (predictions == labels).sum().item()
            total_val += labels.size(0)

    # Calculate and print validation accuracy
    val_accuracy = correct_val / total_val
    print(f'Epoch {epoch+1}, Validation Accuracy: {val_accuracy:.4f}')