# Use Synonym and Random Insertion/Deletion

In [1]:
import nltk
import os

# 指定自定義下載目錄
nltk_data_path = os.path.expanduser('~/nltk_data')
if not os.path.exists(nltk_data_path):
    os.makedirs(nltk_data_path)

# 告訴 nltk 在指定目錄中查找資源
nltk.data.path.append(nltk_data_path)

# 下載所需資源
nltk.download('wordnet', download_dir=nltk_data_path)
nltk.download('omw-1.4', download_dir=nltk_data_path)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [2]:
import zipfile
import os

# 解壓縮 wordnet 和 omw-1.4
nltk_data_path = '/root/nltk_data/corpora/'

# 解壓 wordnet.zip
wordnet_zip = os.path.join(nltk_data_path, 'wordnet.zip')
if os.path.exists(wordnet_zip):
    with zipfile.ZipFile(wordnet_zip, 'r') as zip_ref:
        zip_ref.extractall(nltk_data_path)

# 解壓 omw-1.4.zip
omw_zip = os.path.join(nltk_data_path, 'omw-1.4.zip')
if os.path.exists(omw_zip):
    with zipfile.ZipFile(omw_zip, 'r') as zip_ref:
        zip_ref.extractall(nltk_data_path)

print("WordNet and OMw-1.4 have been extracted.")

WordNet and OMw-1.4 have been extracted.


In [3]:
import nltk
from nltk.corpus import wordnet

# 測試 WordNet 是否能夠正常工作
try:
    synonyms = wordnet.synsets("number")
    print("WordNet is working. Found synonyms for 'number':", synonyms)
except Exception as e:
    print("Error loading WordNet:", e)

WordNet is working. Found synonyms for 'number': [Synset('number.n.01'), Synset('number.n.02'), Synset('act.n.04'), Synset('phone_number.n.01'), Synset('numeral.n.01'), Synset('issue.n.02'), Synset('number.n.07'), Synset('number.n.08'), Synset('number.n.09'), Synset('number.n.10'), Synset('number.n.11'), Synset('total.v.01'), Synset('number.v.02'), Synset('number.v.03'), Synset('count.v.05'), Synset('count.v.01'), Synset('number.v.06')]


# Save Only result.csv

In [None]:
import os
import json
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.metrics import f1_score
import numpy as np
import random
import nltk
from nltk.corpus import wordnet

# Disable Weights and Biases and logging
os.environ["WANDB_DISABLED"] = "true"

# Download the WordNet data if not already downloaded
nltk.download('wordnet')

# Load train, validation, and test data
with open('/kaggle/input/inlphw2dataset/train.json', 'r') as f:
    train_data = json.load(f)

with open('/kaggle/input/inlphw2dataset/val.json', 'r') as f:
    val_data = json.load(f)

with open('/kaggle/input/inlphw2dataset/test.json', 'r') as f:
    test_data = json.load(f)

# Load the sample submission format to get label columns
sample_submission = pd.read_csv('/kaggle/input/inlphw2dataset/sample_submission.csv')
label_columns = sample_submission.columns[1:]  # Label columns

# Data Augmentation Functions
def synonym_replacement(text, n=1):
    """
    Replaces `n` words in the text with their synonyms.
    """
    words = text.split()
    new_words = words.copy()
    random_word_list = list(set([word for word in words if wordnet.synsets(word)]))
    random.shuffle(random_word_list)

    num_replaced = 0
    for random_word in random_word_list:
        synonyms = wordnet.synsets(random_word)
        if synonyms:
            synonym = synonyms[0].lemmas()[0].name()  # Pick the first synonym
            if synonym != random_word:
                new_words = [synonym if word == random_word else word for word in new_words]
                num_replaced += 1
            if num_replaced >= n:
                break

    return ' '.join(new_words)

def random_insertion(text, n=1):
    """
    Inserts `n` random synonyms of existing words into the text at random positions.
    """
    words = text.split()
    new_words = words.copy()
    
    for _ in range(n):
        random_word = random.choice(words)
        synonyms = wordnet.synsets(random_word)
        if synonyms:
            synonym = synonyms[0].lemmas()[0].name()
            random_idx = random.randint(0, len(new_words))
            new_words.insert(random_idx, synonym)
    
    return ' '.join(new_words)

def random_deletion(text, p=0.1):
    """
    Randomly deletes words from the text with a probability `p`.
    """
    words = text.split()
    if len(words) == 1:
        return text  # Avoid deleting the only word in text

    new_words = [word for word in words if random.uniform(0, 1) > p]
    if not new_words:
        return random.choice(words)  # Avoid empty string

    return ' '.join(new_words)

# Prepare the training data with augmentation
train_texts = [entry["tweet"] for entry in train_data]
train_labels = [{label: 1 for label in entry["labels"].keys()} for entry in train_data]

# Apply data augmentation
augmented_train_texts = []
augmented_train_labels = []

for text, labels in zip(train_texts, train_labels):
    augmented_train_texts.append(text)  # Original text
    augmented_train_labels.append(labels)
    
    # Synonym Replacement
    augmented_train_texts.append(synonym_replacement(text, n=2))
    augmented_train_labels.append(labels)
    
    # Random Insertion
    augmented_train_texts.append(random_insertion(text, n=2))
    augmented_train_labels.append(labels)
    
    # Random Deletion
    augmented_train_texts.append(random_deletion(text, p=0.3))
    augmented_train_labels.append(labels)

# Prepare the validation data
val_texts = [entry["tweet"] for entry in val_data]
val_labels = [{label: 1 for label in entry["labels"].keys()} for entry in val_data]

# Map labels to indices
label_to_id = {label: idx for idx, label in enumerate(label_columns)}
id_to_label = {idx: label for label, idx in label_to_id.items()}

# Convert labels to a binary matrix format
def convert_to_multilabel(labels_list, label_to_id):
    multilabel = np.zeros((len(labels_list), len(label_to_id)), dtype=int)
    for i, labels in enumerate(labels_list):
        for label in labels:
            multilabel[i][label_to_id[label]] = 1
    return multilabel

train_labels = convert_to_multilabel(augmented_train_labels, label_to_id)
val_labels = convert_to_multilabel(val_labels, label_to_id)

# Prepare the test data
test_texts = [entry["tweet"] for entry in test_data]
test_ids = [entry["ID"] for entry in test_data]

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define custom dataset class for BERT
class TweetDataset(Dataset):
    def __init__(self, texts, labels=None):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = tokenizer(self.texts[idx], truncation=True, padding='max_length', max_length=128, return_tensors='pt')
        item = {key: val.squeeze() for key, val in encoding.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# Create train, validation, and test datasets
train_dataset = TweetDataset(augmented_train_texts, train_labels)
val_dataset = TweetDataset(val_texts, val_labels)
test_dataset = TweetDataset(test_texts)

# Initialize the BERT model for multilabel classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_columns))

# Define a custom compute_metrics function for strict macro F1 score calculation
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions >= 0.5  # Threshold for multilabel classification
    macro_f1 = f1_score(labels, preds, average='macro')
    return {"macro_f1": macro_f1}

# Set training arguments with minimal logging
training_args = TrainingArguments(
    output_dir='./results',  # Required but will only store checkpoints here
    eval_strategy="epoch",  # Perform evaluation at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir=None,  # Disable logging directory to avoid saving log files
    logging_steps=5000,  # Set high logging steps to avoid frequent logs
    save_strategy="no",  # Disable checkpoint saving
)

# Define the Trainer with validation and compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # Use val.json as validation dataset
    compute_metrics=compute_metrics  # Calculate macro F1 score
)

# Train the model
trainer.train()

# Make predictions on the test set
predictions = trainer.predict(test_dataset)
pred_labels = (torch.sigmoid(torch.tensor(predictions.predictions)) >= 0.5).int().numpy()

# Create submission file in the correct format
submission = pd.DataFrame(pred_labels, columns=label_columns)
submission.insert(0, "index", test_ids)  # Renaming the first column to "index"
submission.to_csv('/kaggle/working/result.csv', index=False)


[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Macro F1
1,No log,0.150629,0.537036
