In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords

In [None]:
# Load Data
train_data = pd.read_csv('train.csv')
train_data_no_id = train_data.drop(columns=['id'])

# Convert Potential Suicide Post to 1 and Not Suicide Post to 0
label_to_int = {'Potential Suicide Post': 1, 'Not Suicide Post': 0}
train_data_no_id['Suicide'] = train_data['Suicide'].map(label_to_int)

# Series.map turns every value that is not a key of the mapping into NaN.
# A NaN target would silently become a NaN loss during training, so stop
# here and report the offending raw values instead.
unmapped_rows = train_data_no_id['Suicide'].isna()
if unmapped_rows.any():
    unknown_labels = sorted(train_data.loc[unmapped_rows, 'Suicide'].astype(str).unique())
    raise ValueError(f"Unrecognised values in 'Suicide' column: {unknown_labels}")

In [None]:
import nltk
# The corpus reader is imported under an alias because the name `stopwords`
# is rebound to a plain set at the end of this cell (the stop-word filter
# defined later reads that set). Keeping the two apart means the corpus
# reader is never hidden behind the set, and this cell stays re-runnable.
from nltk.corpus import stopwords as nltk_stopwords

# 1. Remove punctuation and special characters (also convert %20 to space)
# regex=False: '%20' is a literal token, not a pattern, so the result does not
# depend on the pandas version's default for `regex`.
train_data_no_id['tweet'] = train_data_no_id['tweet'].str.replace('%20', ' ', regex=False)
# Keep only alphanumeric and whitespace characters; non-string values
# (e.g. missing tweets) are passed through unchanged at this step.
train_data_no_id['tweet'] = train_data_no_id['tweet'].apply(
    lambda x: ''.join(char for char in x if char.isalnum() or char.isspace())
    if isinstance(x, str) else x
)

# 2. Convert text to lowercase
train_data_no_id['tweet'] = train_data_no_id['tweet'].str.lower()

# 3. Remove stopwords
nltk.download('stopwords')
# A set gives constant-time membership checks in the per-word filter.
stopwords = set(nltk_stopwords.words('english'))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def remove_stopwords(text, stop_words=None):
    """Drop stop words from a whitespace-delimited string.

    Args:
        text: The text to filter. Anything that is not a ``str`` (for example
            a NaN from a missing tweet) yields an empty string.
        stop_words: Optional container of lowercase words to remove. When
            omitted, the notebook-level ``stopwords`` set is used, which keeps
            existing one-argument calls (e.g. ``Series.apply``) working.

    Returns:
        The remaining words joined by single spaces; ``''`` for non-string
        input.
    """
    if not isinstance(text, str):
        return ''
    if stop_words is None:
        # Resolved at call time so the default follows the notebook's set.
        stop_words = stopwords
    # Words are compared in lowercase but emitted with their original casing.
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)
# Filter stop words out of every tweet and write the result back to the column
tweets_without_stopwords = train_data_no_id['tweet'].apply(remove_stopwords)
train_data_no_id['tweet'] = tweets_without_stopwords

In [None]:
# Pull the model inputs (cleaned tweets) and targets (0/1 labels) out as arrays
tweet_column = train_data_no_id['tweet']
label_column = train_data_no_id['Suicide']
X, y = tweet_column.values, label_column.values

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# partition identical from run to run
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Define Dataset Class
class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one labelled tweet per item.

    Args:
        tweets: Indexable collection of tweet texts.
        labels: Indexable collection of targets, parallel to ``tweets``.
        tokenizer: Callable tokenizer (Hugging Face style) that accepts the
            keyword arguments used in ``__getitem__`` and returns a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Fixed token length every example is padded/truncated to.
    """

    def __init__(self, tweets, labels, tokenizer, max_len):
        self.tweets = tweets
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of tweets."""
        return len(self.tweets)

    def __getitem__(self, idx):
        """Tokenize the tweet at ``idx`` and return it with its label.

        Returns:
            dict with the raw text (``'tweet_text'``), the flattened
            ``'input_ids'`` and ``'attention_mask'`` tensors, and the label as
            a float tensor (``'labels'``).
        """
        tweet = str(self.tweets[idx])
        label = self.labels[idx]

        # padding='max_length' replaces the deprecated pad_to_max_length=True,
        # and truncation=True makes the cut-off at max_length explicit instead
        # of relying on the tokenizer's fallback (which emits a warning).
        encoding = self.tokenizer(
            tweet,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'tweet_text': tweet,
            # return_tensors='pt' adds a leading batch axis; flatten drops it
            # so the DataLoader can stack examples itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # Float dtype because the target is fed to BCEWithLogitsLoss.
            'labels': torch.tensor(label, dtype=torch.float)
        }


In [None]:
def create_data_loader(X, y, tokenizer, max_len, batch_size, shuffle=True):
    """Wrap texts and labels in a ``TweetDataset`` and return a ``DataLoader``.

    Args:
        X: Indexable collection of tweet texts.
        y: Indexable collection of labels, parallel to ``X``.
        tokenizer: Tokenizer handed to ``TweetDataset``.
        max_len: Token length each example is padded/truncated to.
        batch_size: Number of examples per batch.
        shuffle: Whether to reshuffle the data every epoch. Defaults to
            ``True`` (the previous hard-coded behaviour); pass ``False`` for
            evaluation loaders where ordering should be stable.

    Returns:
        A ``torch.utils.data.DataLoader`` over the dataset.
    """
    dataset = TweetDataset(tweets=X, labels=y, tokenizer=tokenizer, max_len=max_len)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [None]:
# Seed the random number generators so the randomly initialised classifier
# head and the DataLoader shuffling are repeatable across kernel restarts.
# NOTE(review): some GPU kernels may still be non-deterministic; this removes
# the seed-dependent variation only.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_LEN = 160           # token length every tweet is padded/truncated to
BATCH_SIZE = 8          # examples per DataLoader batch
EPOCHS = 3              # full passes over the training set
accumulation_steps = 4  # batches whose gradients are summed per optimizer update

In [None]:
# Initialize tokenizer and model from the same pretrained checkpoint
checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)

# num_labels=1: the classification head emits one value per example, which the
# rest of the notebook treats as a logit for the binary target.
model = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=1).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Build one loader per split; both share the tokenizer, length and batch size
train_data_loader = create_data_loader(
    X=X_train,
    y=y_train,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    batch_size=BATCH_SIZE,
)
val_data_loader = create_data_loader(
    X=X_val,
    y=y_val,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    batch_size=BATCH_SIZE,
)

In [None]:
# NOTE(review): this `AdamW` is the one imported from `transformers` (it is
# the implementation that accepts `correct_bias`). Confirm it is still
# provided by the installed transformers version; `torch.optim.AdamW` is the
# usual replacement but has no `correct_bias` switch.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# The training loop steps the scheduler once per optimizer update, i.e. once
# every `accumulation_steps` batches -- not once per batch. Counting batches
# here would make the linear schedule `accumulation_steps` times too long, so
# the learning rate would only decay a fraction of the way to zero.
# Rounded up so a trailing partial accumulation window is budgeted for when
# the loop flushes it; if the loop drops it, the schedule merely ends
# marginally above zero.
updates_per_epoch = -(-len(train_data_loader) // accumulation_steps)  # ceil division
total_steps = updates_per_epoch * EPOCHS



In [None]:
# Learning-rate schedule with no warm-up phase, spanning `total_steps`
# scheduler steps
schedule_kwargs = dict(num_warmup_steps=0, num_training_steps=total_steps)
scheduler = get_linear_schedule_with_warmup(optimizer, **schedule_kwargs)

In [None]:
# Binary cross-entropy applied to raw logits (the sigmoid is folded into the loss)
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

In [None]:
# Train for EPOCHS passes with gradient accumulation, validating after each.
num_train_batches = len(train_data_loader)

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    optimizer.zero_grad()
    for step, batch in enumerate(train_data_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # squeeze(-1) removes only the trailing size-1 axis. A bare squeeze()
        # would also drop the batch axis when the final batch holds a single
        # example, leaving a 0-d tensor that no longer matches `labels`.
        logits = outputs.logits.squeeze(-1)
        loss = criterion(logits, labels)

        # Track the unscaled loss for reporting ...
        total_loss += loss.item()
        # ... but scale the gradient contribution so that the accumulated
        # gradient averages over the window instead of summing.
        (loss / accumulation_steps).backward()

        window_complete = (step + 1) % accumulation_steps == 0
        last_batch = (step + 1) == num_train_batches
        # Also update on the last batch: otherwise gradients from a trailing
        # partial window are wiped by the zero_grad() at the start of the next
        # epoch and those examples never influence the weights.
        if window_complete or last_batch:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

    avg_train_loss = total_loss / num_train_batches

    # Validation step
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in val_data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Same single-example-batch safeguard as in the training pass.
            logits = outputs.logits.squeeze(-1)
            val_loss = criterion(logits, labels)

            total_val_loss += val_loss.item()

    avg_val_loss = total_val_loss / len(val_data_loader)

    print(f"Epoch {epoch + 1}, Loss: {avg_train_loss}, Val Loss: {avg_val_loss}")


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 1, Loss: 0.386919764507758, Val Loss: 0.22568057649033635
Epoch 2, Loss: 0.11930541289161499, Val Loss: 0.1321916256933228
Epoch 3, Loss: 0.05763599867197244, Val Loss: 0.23005340327998916
Epoch 3, Loss: 0.05763599867197244, Val Loss: 0.23005340327998916


In [None]:
# Accuracy Calculation
model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in val_data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # squeeze(-1) keeps the batch axis even for a one-example batch; a
        # bare squeeze() would yield a 0-d tensor whose numpy array cannot be
        # iterated by list.extend below.
        logits = outputs.logits.squeeze(-1)
        probabilities = torch.sigmoid(logits)

        # Classify as positive when the predicted probability is at least 0.5
        predictions = (probabilities >= 0.5).float()
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate accuracy using accuracy_score
accuracy = accuracy_score(all_labels, all_predictions)
print("Accuracy:", accuracy)

Accuracy: 0.9375


In [None]:
# Load test data
test_data = pd.read_csv('test.csv')


def _clean_test_tweet(raw_tweet):
    """Apply the training-time text cleaning to a single test tweet.

    The model was fine-tuned on tweets that had '%20' replaced by a space,
    non-alphanumeric characters stripped, text lowercased and stop words
    removed. Feeding raw test text would present the model with inputs unlike
    those it was trained on, so the same steps are repeated here in the same
    order.

    Args:
        raw_tweet: Value from the test 'tweet' column.

    Returns:
        The cleaned text; ``''`` for non-string values (e.g. missing tweets),
        which is what the training pipeline produces for them as well.
    """
    if not isinstance(raw_tweet, str):
        return ''
    text = raw_tweet.replace('%20', ' ')
    text = ''.join(char for char in text if char.isalnum() or char.isspace())
    # remove_stopwords also collapses runs of whitespace to single spaces.
    return remove_stopwords(text.lower())


test_documents = test_data['tweet'].apply(_clean_test_tweet).tolist()

# Tokenize test data
class TestDataset(Dataset):
    """Map-style dataset that tokenizes one unlabelled tweet per item.

    Args:
        tweets: Indexable collection of tweet texts.
        tokenizer: Callable tokenizer (Hugging Face style) that accepts the
            keyword arguments used in ``__getitem__`` and returns a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Fixed token length every example is padded/truncated to.
    """

    def __init__(self, tweets, tokenizer, max_len):
        self.tweets = tweets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of tweets."""
        return len(self.tweets)

    def __getitem__(self, idx):
        """Tokenize the tweet at ``idx``.

        Returns:
            dict with the raw text (``'tweet_text'``) and the flattened
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        """
        tweet = str(self.tweets[idx])

        # padding='max_length' replaces the deprecated pad_to_max_length=True,
        # and truncation=True makes the cut-off at max_length explicit instead
        # of relying on the tokenizer's fallback (which emits a warning).
        encoding = self.tokenizer(
            tweet,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'tweet_text': tweet,
            # return_tensors='pt' adds a leading batch axis; flatten drops it
            # so the DataLoader can stack examples itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

def create_test_data_loader(tweets, tokenizer, max_len, batch_size):
    """Return an order-preserving ``DataLoader`` over a ``TestDataset``.

    Shuffling is disabled so that batch outputs line up with the order of
    ``tweets``.
    """
    return DataLoader(
        TestDataset(tweets, tokenizer, max_len),
        batch_size=batch_size,
        shuffle=False,
    )

# Create test data loader
test_data_loader = create_test_data_loader(test_documents, tokenizer, MAX_LEN, BATCH_SIZE)

# Make sure the model is on the selected device before inference
model = model.to(device)

# Predict on test data
model.eval()
test_predictions = []

with torch.no_grad():
    for batch in test_data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # squeeze(-1) keeps the batch axis even when the final batch holds a
        # single tweet; a bare squeeze() would produce a 0-d tensor whose
        # numpy array cannot be iterated by list.extend below.
        logits = outputs.logits.squeeze(-1)
        probabilities = torch.sigmoid(logits)

        # Classify as positive when the predicted probability is at least 0.5
        predictions = (probabilities >= 0.5).float()
        test_predictions.extend(predictions.cpu().numpy())

# Attach the numeric predictions to the test frame
test_data['Suicide'] = test_predictions

# Translate the numeric classes back to the label strings used in the data:
# 1 -> 'Potential Suicide Post', 0 -> 'Not Suicide Post'
class_names = {1.0: 'Potential Suicide Post', 0.0: 'Not Suicide Post'}
test_data['Suicide'] = test_data['Suicide'].map(class_names)

# Write only the id and the predicted label, without the index column
submission = test_data[['id', 'Suicide']]
submission.to_csv('predictions.csv', index=False)


