In [5]:
%%capture
!pip install transformers datasets evaluate accelerate

In [6]:
from datasets import load_dataset

# Load the dataset identified by this repo id; the recorded output of this cell
# shows a 'train' and a 'dev' split being generated.
ds = load_dataset("Jinyan1/COLING_2025_MGT_en")

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 610767/610767 [00:02<00:00, 227112.42 examples/s]
Generating dev split: 100%|██████████| 261758/261758 [00:01<00:00, 245776.75 examples/s]


In [8]:
import pandas as pd

In [9]:
# Draw a fixed-size random subset of the train split, keeping only the text and
# label columns. random_state pins the draw so that re-running the notebook
# regenerates the same training CSV (the dev sampling uses the same seed).
df_train = ds['train'].to_pandas()[['text', 'label']].sample(n=3000, random_state=42)

In [10]:
# Convert the 'dev' split to a Pandas DataFrame, keeping only the text and label columns.
# No rename is needed: the selected column is already called 'label'.
df = ds['dev'].to_pandas()[['text', 'label']]

# Separate dataframes for label 0 and label 1
df_label_0 = df[df['label'] == 0]
df_label_1 = df[df['label'] == 1]

# Sample 1000 rows from each class so the held-out set is balanced;
# random_state makes the draw reproducible.
sample_label_0 = df_label_0.sample(n=1000, random_state=42)
sample_label_1 = df_label_1.sample(n=1000, random_state=42)

# Concatenate the samples and shuffle them (frac=1 returns every row in random order)
test_df = pd.concat([sample_label_0, sample_label_1], axis=0).sample(frac=1, random_state=42)

In [11]:
# Persist the sampled training rows; CFG.train_path in the training cell points at this file.
df_train.to_csv('train_df (1).csv',index=False)

In [None]:
# Class balance of the sampled training set and of the balanced held-out set
print(df_train['label'].value_counts())
print(test_df['label'].value_counts())

In [12]:
# Persist the balanced, shuffled held-out rows; CFG.test_path in the training cell points at this file.
test_df.to_csv('test_df (1).csv',index=False)

In [34]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import DebertaV2Tokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
import torch
from torch.nn.functional import softmax
import logging
import os
from typing import Tuple, Dict

# Set up logging: INFO level so the progress messages emitted by main() are shown.
# `logger` is the module-level logger that main() writes to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class CFG:
    """Single place for every tunable read by the training pipeline."""

    # --- data ----------------------------------------------------------------
    train_path = 'train_df (1).csv'   # CSV with 'text' and 'label' columns used for fine-tuning
    test_path = 'test_df (1).csv'     # CSV with the same columns, scored after training
    test_size = 0.25                  # share of the training CSV held out for per-epoch evaluation
    max_length = 128                  # token limit per example; longer inputs are truncated

    # --- model / optimisation ------------------------------------------------
    model_name = "microsoft/deberta-v3-large"
    epochs = 10
    learning_rate = 3e-5
    batch_size = 128                  # per-device size, used for both training and evaluation

    # --- outputs / reproducibility -------------------------------------------
    output_dir = "./model"            # Trainer output, logs and the predictions CSV go here
    checkpoint_dir = "./checkpoints"  # created at start-up by setup_directories()
    random_seed = 42                  # seed for the train/validation split

class TextClassificationTrainer:
    """Fine-tune the CFG.model_name sequence classifier and score a held-out CSV.

    Intended call order: ``load_data`` -> ``preprocess_data`` -> ``train_model``
    -> ``get_predictions`` -> ``cleanup``.
    """

    # Metric modules are loaded once and reused for every evaluation pass
    # instead of being re-loaded each epoch.
    _metric_cache: Dict[str, object] = {}

    def __init__(self):
        """Load the tokenizer for CFG.model_name and create the output directories."""
        self.tokenizer = DebertaV2Tokenizer.from_pretrained(
            CFG.model_name,
            add_prefix_space=True  # This can help with better tokenization
        )
        self.setup_directories()

    @staticmethod
    def setup_directories():
        """Create necessary directories for model and checkpoint saving"""
        os.makedirs(CFG.output_dir, exist_ok=True)
        os.makedirs(CFG.checkpoint_dir, exist_ok=True)

    def load_data(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Load training and test data from CSV files.

        Returns:
            ``(train, test)`` DataFrames read from CFG.train_path and CFG.test_path.

        Raises:
            ValueError: if either file lacks a 'text' or 'label' column.
            Exception: wrapping any failure to read the files, with the
                original error attached as ``__cause__``.
        """
        try:
            train = pd.read_csv(CFG.train_path)
            test = pd.read_csv(CFG.test_path)
        except FileNotFoundError as e:
            raise Exception(f"Error loading data: {e}") from e
        except Exception as e:
            raise Exception(f"Unexpected error while loading data: {e}") from e

        # Validate required columns. This sits outside the try block so the
        # ValueError reaches the caller as-is rather than being re-labelled
        # as an "unexpected" error.
        required_columns = ['text', 'label']
        for frame, name in [(train, 'train'), (test, 'test')]:
            missing_cols = [col for col in required_columns if col not in frame.columns]
            if missing_cols:
                raise ValueError(f"{name} dataset missing required columns: {missing_cols}")

        return train, test

    def preprocess_function(self, examples):
        """Tokenize the 'text' field of a batch, truncating to CFG.max_length.

        No padding is applied here: the DataCollatorWithPadding created in
        ``train_model`` pads each batch to its own longest sequence, so padding
        at map time would only add tokens that the collator makes unnecessary.
        """
        return self.tokenizer(
            examples["text"],
            truncation=True,
            max_length=CFG.max_length,
        )

    def preprocess_data(self, train: pd.DataFrame, test: pd.DataFrame) -> Tuple[DatasetDict, Dataset]:
        """Convert DataFrames to Datasets and tokenize them.

        Args:
            train: rows used for fine-tuning; split into 'train'/'test' parts
                with CFG.test_size and CFG.random_seed.
            test: rows to be scored after training.

        Returns:
            ``(dds, eval_dataset)`` where ``dds`` holds the tokenized 'train'
            and 'test' splits of ``train`` and ``eval_dataset`` is the
            tokenized ``test`` frame.

        Raises:
            Exception: wrapping any conversion or tokenization failure.
        """
        try:
            # Convert to Dataset format
            ds = Dataset.from_pandas(train)
            ds_test = Dataset.from_pandas(test)

            # Tokenize datasets
            tok_ds = ds.map(self.preprocess_function, batched=True)
            dds = tok_ds.train_test_split(test_size=CFG.test_size, seed=CFG.random_seed)
            eval_dataset = ds_test.map(self.preprocess_function, batched=True)

            return dds, eval_dataset
        except Exception as e:
            raise Exception(f"Error in data preprocessing: {e}") from e

    @staticmethod
    def _get_metric(name: str):
        """Return the ``evaluate`` metric called ``name``, loading it on first use only."""
        cache = TextClassificationTrainer._metric_cache
        if name not in cache:
            cache[name] = evaluate.load(name)
        return cache[name]

    @staticmethod
    def compute_metrics(eval_pred) -> Dict[str, float]:
        """Compute accuracy plus weighted precision, recall and F1.

        Args:
            eval_pred: ``(logits, labels)`` pair; the predicted class is the
                argmax over axis 1 of the logits.

        Returns:
            Dict with keys 'accuracy', 'precision', 'recall' and 'f1'.
        """
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)

        get_metric = TextClassificationTrainer._get_metric
        weighted = {"predictions": predictions, "references": labels, "average": "weighted"}

        return {
            "accuracy": get_metric("accuracy").compute(
                predictions=predictions, references=labels
            )["accuracy"],
            # zero_division=0 reports 0 instead of warning when a class is never predicted
            "precision": get_metric("precision").compute(**weighted, zero_division=0)["precision"],
            "recall": get_metric("recall").compute(**weighted, zero_division=0)["recall"],
            "f1": get_metric("f1").compute(**weighted)["f1"],
        }

    def train_model(self, dds: DatasetDict) -> Trainer:
        """Initialize and train the model.

        Args:
            dds: tokenized splits; 'train' is used for optimisation and 'test'
                for the per-epoch evaluation that selects the best checkpoint
                (by F1).

        Returns:
            The Trainer after ``train()`` has completed.

        Raises:
            Exception: wrapping any failure during setup or training.
        """
        try:
            import inspect  # local import: only needed for the version shims below

            # Initialize model and data collator
            model = AutoModelForSequenceClassification.from_pretrained(
                CFG.model_name,
                num_labels=2
            )
            data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

            # `evaluation_strategy` was renamed to `eval_strategy` in newer
            # transformers releases; use whichever name this install accepts.
            arg_names = inspect.signature(TrainingArguments.__init__).parameters
            eval_key = "eval_strategy" if "eval_strategy" in arg_names else "evaluation_strategy"

            # Set up training arguments
            training_args = TrainingArguments(
                output_dir=CFG.output_dir,
                learning_rate=CFG.learning_rate,
                per_device_train_batch_size=CFG.batch_size,
                per_device_eval_batch_size=CFG.batch_size,
                num_train_epochs=CFG.epochs,
                save_strategy="epoch",
                save_total_limit=2,
                fp16=torch.cuda.is_available(),
                logging_dir=f"{CFG.output_dir}/logs",
                load_best_model_at_end=True,
                metric_for_best_model="f1",
                seed=CFG.random_seed,
                **{eval_key: "epoch"},
            )

            # Likewise, Trainer's `tokenizer` argument was superseded by
            # `processing_class`; prefer the new name when it exists.
            trainer_params = inspect.signature(Trainer.__init__).parameters
            tok_key = "processing_class" if "processing_class" in trainer_params else "tokenizer"

            # Initialize trainer
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=dds['train'],
                eval_dataset=dds['test'],
                data_collator=data_collator,
                compute_metrics=self.compute_metrics,
                **{tok_key: self.tokenizer},
            )

            # Train the model
            trainer.train()
            return trainer
        except Exception as e:
            raise Exception(f"Error in model training: {e}") from e

    def get_predictions(self, trainer: Trainer, eval_dataset: Dataset) -> pd.DataFrame:
        """Get probability predictions for the evaluation dataset and return as DataFrame.

        Args:
            trainer: a trained Trainer.
            eval_dataset: tokenized dataset to score.

        Returns:
            DataFrame with one softmax-probability column per class, named
            ``p<i>_<model>`` where ``<model>`` is the last path component of
            CFG.model_name.

        Raises:
            Exception: wrapping any failure during prediction.
        """
        try:
            predictions = trainer.predict(eval_dataset)
            logits = predictions.predictions
            probabilities = softmax(torch.tensor(logits), dim=-1).numpy()

            # Create DataFrame with model-specific column names; the number of
            # columns follows the logits so it stays correct if num_labels changes.
            model_name = CFG.model_name.split('/')[-1]  # Get last part of model name
            df_predictions = pd.DataFrame(
                probabilities,
                columns=[f'p{i}_{model_name}' for i in range(probabilities.shape[1])]
            )

            return df_predictions

        except Exception as e:
            raise Exception(f"Error in getting predictions: {e}") from e

    def cleanup(self):
        """Clean up GPU memory"""
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

def main():
    """Run the full pipeline: load CSVs, tokenize, fine-tune, score, save predictions.

    Returns:
        DataFrame of class probabilities for the test CSV, which is also
        written to ``{CFG.output_dir}/predictions_<model>.csv``.

    Raises:
        Exception: any failure is logged and then re-raised unchanged.
    """
    text_classifier = None
    try:
        # Initialize trainer
        text_classifier = TextClassificationTrainer()

        # Load and preprocess data
        logger.info("Loading data...")
        train, test = text_classifier.load_data()

        logger.info("Preprocessing data...")
        dds, eval_dataset = text_classifier.preprocess_data(train, test)

        # Train model
        logger.info("Training model...")
        trainer = text_classifier.train_model(dds)

        # Get predictions as DataFrame
        logger.info("Getting predictions...")
        df_predictions = text_classifier.get_predictions(trainer, eval_dataset)

        # Save predictions DataFrame
        output_path = f"{CFG.output_dir}/predictions_{CFG.model_name.split('/')[-1]}.csv"
        df_predictions.to_csv(output_path, index=False)
        logger.info(f"Predictions saved to {output_path}")

        return df_predictions

    except Exception as e:
        logger.error(f"Error in main execution: {e}")
        raise

    finally:
        # Cleanup runs on failure as well as success, so an aborted run does
        # not leave cached GPU memory held by the kernel.
        if text_classifier is not None:
            text_classifier.cleanup()

# Entry point: run the whole pipeline when this cell is executed. The recorded
# output's "ERROR:__main__" prefix shows the guard is true in the notebook kernel.
if __name__ == "__main__":
    main()

ERROR:__main__:Error in main execution: 
DebertaV2Tokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.



ImportError: 
DebertaV2Tokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [33]:
!pip install sentencepiece

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0
