In [16]:
!pip install pandas pyarrow fastparquet



In [17]:
import pandas as pd

# Load the single train parquet shard of the KAIST-IC-LAB721/SDCNL dataset into a DataFrame.
# NOTE(review): the hf:// scheme presumably requires huggingface_hub to be installed
# before this cell runs — confirm the install order on a fresh kernel.
df = pd.read_parquet("hf://datasets/KAIST-IC-LAB721/SDCNL/data/train-00000-of-00001.parquet")

In [18]:
!pip install huggingface_hub



In [19]:
df

Unnamed: 0,title,text,label,label_text
0,Need help,Hi I don't really know how to phrase this situ...,0,depression
1,feeling so overwhelmed and hopeless,i have been so depressed these past couple wee...,1,suicidal
2,"Nothing matters anymore, getting worse",Hi..I don't know where else to go. I am devast...,0,depression
3,Who’s tired of hearing bullshit,"The shit like “it will get better, everyone is...",1,suicidal
4,I wish I was someone else.,I wish I was prettier. I wish I didn’t feel li...,0,depression
...,...,...,...,...
1890,think its over,i just don’t wanna live anymore so yeah,0,depression
1891,To all of those feeling isolated and suffering...,I’ve learned that life is fucking sad sometime...,0,depression
1892,I just really wish I had died the first time I...,That's all. Nothing has gotten better and I've...,1,suicidal
1893,I feel unimportant.,Not the first time I'm going through this of c...,0,depression


In [20]:
df['text'].head()

0    Hi I don't really know how to phrase this situ...
1    i have been so depressed these past couple wee...
2    Hi..I don't know where else to go. I am devast...
3    The shit like “it will get better, everyone is...
4    I wish I was prettier. I wish I didn’t feel li...
Name: text, dtype: object

In [21]:
def clean_text(raw_text):
    """Normalise one post for tokenisation.

    Keeps only alphanumeric characters and literal spaces, then lower-cases
    the result.

    Args:
        raw_text: The original post body (a ``str``).

    Returns:
        The filtered, lower-cased string.
    """
    # NOTE(review): newlines/tabs are dropped rather than turned into spaces,
    # which can fuse the words on either side of a line break — confirm this
    # is intended before changing it.
    kept_chars = [ch for ch in raw_text if ch.isalnum() or ch == ' ']
    return ''.join(kept_chars).lower()


# Clean the `text` column in a single vectorised pass. Only that column is
# touched, and no builtin names are rebound at module level. Re-running the
# cell is idempotent because already-clean text maps to itself.
df = df.assign(text=df['text'].map(clean_text))

In [22]:
df['text'][0]

'hi i dont really know how to phrase this situation but ill try my life is at a really good point right now im never really depressed over stuff and 99 percent of the time my mind is clear im about to graduate high school and im really excited however people in my family and friend group have tons of issues wether they sleep all day hate themselves or have no ambition to keep living on in this world theyve got problems i dont wanna sound like im gloating but usually im the person that a lot of these people end up going to because usually im able to talk people through issues and help them in the long run yeah sometimes their issues make me really sad and stuff because who doesnt feel sad when people are telling you they feel worthlessbut today one of my best friends showed me that he was cutting and it really effected me i talked to him about it and its mostly because of how painfully bored he is and he doesnt even know why hes doing it hes a pretty logical guy he will go to class and 

In [23]:
!pip install transformers sentencepiece torch



In [24]:
!pip install transformers

from transformers import AutoTokenizer, XLMRobertaModel

# Shared tokenizer for the 'xlm-roberta-base' checkpoint; main() reads this
# module-level name when it builds the train/validation datasets.
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
# NOTE(review): this bare XLMRobertaModel is not referenced by any later visible
# cell — main() loads its own XLMRobertaForSequenceClassification — so this load
# looks redundant; confirm nothing else uses `model` and drop it.
model = XLMRobertaModel.from_pretrained('xlm-roberta-base', num_labels=2)



In [25]:
!pip install  tqdm



In [26]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm


In [27]:
print("MPS available:", torch.backends.mps.is_available())

MPS available: True


In [28]:
def get_device():
    """Choose the torch device to run on.

    Preference order: Apple MPS, then CUDA, then CPU as the fallback.

    Returns:
        torch.device: The first available backend in that order.
    """
    if torch.backends.mps.is_available():
        return torch.device("mps")
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")


In [29]:
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenises one text per item for sequence classification.

    Args:
        texts: Sequence of input strings (list, tuple, pandas Series, ...).
        labels: Sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable tokenizer invoked as
            ``tokenizer(text, add_special_tokens=..., max_length=..., truncation=...,
            padding=..., return_tensors='pt')`` and returning a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every example is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        # Materialise as lists so __getitem__ is always *positional*. A pandas
        # Series with a shuffled index (e.g. straight out of train_test_split)
        # would otherwise be looked up by label and raise KeyError.
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenise example ``idx``.

        Returns:
            dict: ``'input_ids'`` and ``'attention_mask'`` flattened to 1-D
            tensors, and ``'label'`` as a ``torch.long`` scalar tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )

        return {
            # return_tensors='pt' is requested above; flatten() drops the
            # leading dimension so the DataLoader can stack items into a batch.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [30]:
# Training function
def train_model(model, train_loader, val_loader, device, num_epochs=3,
                lr=2e-5, save_path='best_model.pt'):
    """Fine-tune ``model`` and checkpoint the weights with the best validation accuracy.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        train_loader: Iterable of batches (dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors) used for optimisation.
        val_loader: Iterable of batches in the same format used for evaluation.
        device: Device the batch tensors are moved to; ``model`` must already be on it.
        num_epochs: Number of passes over ``train_loader``.
        lr: AdamW learning rate.
        save_path: File the best ``state_dict`` is written to.

    Returns:
        float: Best validation accuracy (percent) seen over all epochs;
        ``0.0`` if no epoch beat zero or the validation loader was empty.
    """
    # torch.optim.AdamW is the maintained optimiser (transformers.AdamW is deprecated).
    # NOTE(review): eps/weight_decay are set to what I believe were the
    # transformers.AdamW defaults so training dynamics stay the same — confirm.
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0
    )

    best_val_acc = 0.0

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0
        correct = 0
        total = 0

        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}')

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            total_loss += loss.item()
            num_batches += 1

            loss.backward()
            optimizer.step()

            # Calculate accuracy
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

            progress_bar.set_postfix({
                'loss': f'{loss.item():.4f}',
                'acc': f'{(correct/total)*100:.2f}%'
            })

        # max(..., 1) keeps the summary safe when the training loader is empty.
        print(f'Average training loss: {total_loss / max(num_batches, 1):.4f}')

        # Validation
        model.eval()
        val_correct = 0
        val_total = 0

        print("\nRunning validation...")
        with torch.no_grad():
            for batch in tqdm(val_loader):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )

                predictions = torch.argmax(outputs.logits, dim=1)
                val_correct += (predictions == labels).sum().item()
                val_total += labels.size(0)

        # An empty validation loader yields 0% instead of a ZeroDivisionError.
        val_acc = (val_correct/val_total)*100 if val_total else 0.0
        print(f'Validation Accuracy: {val_acc:.2f}%')

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), save_path)
            print(f'Saved new best model with validation accuracy: {val_acc:.2f}%')

    return best_val_acc



In [31]:
def main():
    """Fine-tune 'xlm-roberta-base' as a two-class classifier on the notebook's ``df``.

    Reads the module-level ``df`` (columns ``text`` and ``label``) and
    ``tokenizer``, holds out 10% of the rows for validation, wraps both splits
    in ``TextClassificationDataset``/``DataLoader`` and hands them to
    ``train_model``.

    Raises:
        Exception: Any error raised during training other than
            ``KeyboardInterrupt`` is reported and re-raised unchanged.
    """
    # One seed for the split, the classifier-head init and the loader shuffling.
    seed = 42
    torch.manual_seed(seed)

    # Set device
    device = get_device()
    print(f"Using device: {device}")

    # Initialize model with proper configuration
    model = XLMRobertaForSequenceClassification.from_pretrained(
        'xlm-roberta-base',
        num_labels=2,
        problem_type="single_label_classification"
    ).to(device)

    # Split data - plain lists so the datasets index positionally
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        df['text'].tolist(),
        df['label'].tolist(),
        test_size=0.1,
        random_state=seed
    )

    # Create datasets
    train_dataset = TextClassificationDataset(
        train_texts,
        train_labels,
        tokenizer
    )

    val_dataset = TextClassificationDataset(
        val_texts,
        val_labels,
        tokenizer
    )

    # Smaller batch size on CPU. Compare device.type: a torch.device is not
    # reliably equal to a plain string, so `device == 'cpu'` would not select it.
    batch_size = 4 if device.type == 'cpu' else 8

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=0  # load batches in the main process
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=0  # load batches in the main process
    )

    # Print dataset sizes
    print(f"Training samples: {len(train_dataset)}")
    print(f"Validation samples: {len(val_dataset)}")

    # Train the model
    try:
        train_model(model, train_loader, val_loader, device)
    except KeyboardInterrupt:
        print("Training interrupted by user")
    except Exception as e:
        # Format via the f-string rather than calling str(): the handler then
        # works even if the name `str` has been rebound in the notebook namespace.
        print(f"Error during training: {e}")
        # Bare raise preserves the original traceback.
        raise


In [None]:
# Entry point: run main() when this code executes as the top-level module.
# The output recorded below this cell shows the guard passing in the notebook.
if __name__ == "__main__":
    main()

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: mps
Training samples: 1705
Validation samples: 190




Epoch 1/3:   0%|          | 0/214 [00:00<?, ?it/s]