# Requirements

In [None]:
# Mount Google Drive at /content/drive
from google.colab import drive
drive.mount('/content/drive')
# Switch to the project folder; later cells use relative paths such as ./data and ./output
%cd /content/drive/MyDrive/Colab Notebooks/Feger/am-limited-generalizability

In [None]:
# Install pinned versions of the third-party packages
! pip install simpletransformers==0.70.1
! pip install transformers==4.41.1
! pip install emoji==2.4.0
# Download the spaCy model that is loaded below via spacy.load("en_core_web_lg")
! python -m spacy download en_core_web_lg

In [None]:
import gc
import os
import time
import math
import torch
import spacy
import string
import warnings
import numpy as np
import pandas as pd
from itertools import product
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.model_selection import BaseCrossValidator
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, make_scorer
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.exceptions import ConvergenceWarning
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold
from sklearn.tree import DecisionTreeClassifier
from simpletransformers.classification import ClassificationModel
from IPython.display import clear_output
from tqdm.notebook import tqdm
from transformers import logging as transformers_logging
from google.colab import runtime

# Set logging level for transformers to ERROR and ignore the listed warning categories
transformers_logging.set_verbosity_error()
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

# Load the spaCy pipeline with the listed components disabled and take its default stop words
nlp = spacy.load("en_core_web_lg", disable=['ner', 'parser', 'textcat', 'senter'])
stop_words = nlp.Defaults.stop_words

# Random seed passed to every model below for reproducibility
random_seed = 201221

# Mappings between class names and integer labels (each is the inverse of the other)
class2label = {"Argument": 1, "No-Argument": 0}
label2class = {1: "Argument", 0: "No-Argument"}

In [None]:
# Load the sample and replace the class names in "label" with their integer labels
# NOTE(review): replace() leaves values that are not keys of class2label untouched —
# confirm the CSV only contains the two class names
df_sample = pd.read_csv('./data/sample_374318.csv')
df_sample["label"] = df_sample["label"].replace(class2label)
# Fail fast if any cell of the sample is missing
assert not df_sample.isna().any().any()

# Methodology

In [None]:
class TransformerModel(BaseEstimator):
    """Scikit-learn style estimator around a simpletransformers ``ClassificationModel``.

    The constructor only stores hyperparameters; the underlying model is created
    and trained in ``fit`` so the estimator can be driven by ``GridSearchCV``.
    """

    def __init__(self,
                 model_type=None,
                 model_name=None,
                 learning_rate=None,
                 num_train_epochs=None,
                 batch_size=None,
                 random_state=None):
        # Hyperparameters are stored exactly as passed
        self.model_type = model_type
        self.model_name = model_name
        self.learning_rate = learning_rate
        self.num_train_epochs = num_train_epochs
        self.batch_size = batch_size
        self.random_state = random_state
        # Both are filled in by fit()
        self.model = None
        self.args = None

    def show_information(self):
        """Print the model name and the training settings held by the fitted model."""
        print("Model:", self.model_name)
        fitted_args = self.model.args
        print("Learning Rate:", fitted_args.learning_rate)
        print("Number of Train Epochs:", fitted_args.num_train_epochs)
        print("Train Batch Size:", fitted_args.train_batch_size)
        print("Manual Seed:", fitted_args.manual_seed)

    def check_parameters(self):
        """Assert that every entry of ``self.args`` is set and mirrored by the model's args.

        Raises:
            AssertionError: if a requested value is None, missing on the model,
                or differs from the value the model actually holds.
        """
        for key, value in self.args.items():
            # A missing attribute is treated like an unset (None) value
            model_value = getattr(self.model.args, key, None)
            assert value is not None, f"Parameter {key} is None (default) in the argument dict"
            assert model_value is not None, f"Parameter {key} is None (default) in the model"
            assert model_value == value, f"Parameter {key} is {model_value} but expected {value}"

    def fit(self, X, y):
        """Create a fresh model from the current hyperparameters and train it on ``X``/``y``.

        Args:
            X: iterable of input texts.
            y: iterable of labels, paired element-wise with ``X``.

        Returns:
            self
        """
        # The same batch size is used for training and evaluation; nothing is cached or saved
        self.args = dict(
            eval_batch_size=self.batch_size,
            learning_rate=self.learning_rate,
            manual_seed=self.random_state,
            no_cache=True,
            no_save=True,
            num_train_epochs=self.num_train_epochs,
            overwrite_output_dir=True,
            save_eval_checkpoints=False,
            save_model_every_epoch=False,
            silent=False,
            train_batch_size=self.batch_size,
            use_multiprocessing=False,
            use_multiprocessing_for_evaluation=False,
        )
        # Pair texts with labels in the two-column frame handed to train_model
        rows = list(zip(X, y))
        train_data = pd.DataFrame(rows, columns=['text', 'labels'])
        # Use the GPU whenever torch reports one
        cuda_available = torch.cuda.is_available()
        self.model = ClassificationModel(
            self.model_type,
            self.model_name,
            args=self.args,
            use_cuda=cuda_available,
        )
        # Make sure the model really picked up the requested arguments before training
        self.check_parameters()
        self.model.train_model(train_data)
        gc.collect()
        return self

    def predict(self, X):
        """Return the predicted labels for ``X``; the second value returned by the model is dropped."""
        labels, _unused = self.model.predict(X)
        return labels

In [None]:
class Fold(BaseCrossValidator):
    """Cross-validator with one fixed split: train on source datasets, validate on target datasets.

    The training fold holds the rows of ``df`` whose ``dataset`` is in ``source``
    and whose ``split`` is ``"train"``; the validation fold holds the rows whose
    ``dataset`` is in ``target`` and whose ``split`` is ``"dev"``.

    Args:
        df: DataFrame with ``dataset`` and ``split`` columns. It must be
            row-aligned with the ``X`` that is later passed to the search.
        source: collection of dataset names used for training.
        target: collection of dataset names used for validation.
    """

    def __init__(self, df, source, target):
        self.df = df
        self.source = source
        self.target = target

    def _positions(self, datasets, split_name):
        # Row positions (not index labels) of the rows belonging to `datasets` and `split_name`
        mask = self.df["dataset"].isin(datasets) & (self.df["split"] == split_name)
        return np.flatnonzero(mask.values)

    def split(self, X, y=None, groups=None):
        """Yield the single ``(train_idx, dev_idx)`` pair of integer position arrays.

        ``X``, ``y`` and ``groups`` are accepted for API compatibility and ignored.
        Positions are returned instead of index labels because the consumer indexes
        ``X`` by position; for a frame with a default 0..n-1 index both coincide.
        """
        train_idx = self._positions(self.source, "train")
        dev_idx = self._positions(self.target, "dev")
        yield train_idx, dev_idx

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of splits, which is always 1."""
        return 1

In [None]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks ``key`` out of every sample.

    Intended for samples given as a list of lists: ``key`` may be anything the
    samples accept for indexing, e.g. an integer position or a slice.
    """

    def __init__(self, key):
        # Index or slice applied to each sample in transform()
        self.key = key

    def fit(self, X, y=None):
        """Do nothing and return self; the selector has no state to learn."""
        return self

    def transform(self, X):
        """Return a list holding ``sample[key]`` for each sample in ``X``."""
        selected = []
        for sample in X:
            selected.append(sample[self.key])
        return selected

In [None]:
def select_features(df, features):
    """Extract feature values from the columns of a DataFrame as plain Python lists.

    Args:
        df: DataFrame holding the features in its columns.
        features: a single column name (str) or a collection of column names.

    Returns:
        A flat list of values when ``features`` is a single string, otherwise a
        list of lists with one inner list per row.
    """
    selection = df[features]
    if not isinstance(features, str):
        # Several columns: one inner list per row
        return selection.values.tolist()
    # Single column: flat list of its values
    return selection.tolist()

In [None]:
def create_non_existing_folders(path):
    """Make sure the parent directory of ``path`` exists and return ``path`` unchanged.

    Args:
        path: file path whose directory part should exist afterwards.

    Returns:
        The original ``path``.

    Raises:
        OSError: if the directory cannot be created (for example, when a
            non-directory already occupies that location).
    """
    # Get the directory name from the given path
    directory = os.path.dirname(path)
    # A bare file name has no directory part; os.makedirs("") would raise
    if directory:
        # exist_ok avoids failing when the directory already exists or is created
        # concurrently between a check and the creation
        os.makedirs(directory, exist_ok=True)
    # Return the original path
    return path

In [None]:
def run_experiment(df, model, model_type, model_name, cv, params, file_name, features):
    """Tune ``model`` for one source/target configuration and store the evaluation results.

    A grid search over ``params`` is scored with macro F1 on the split produced by
    ``cv``. The best parameters are then set on ``model``, which is refitted on the
    source training rows and evaluated on the target dev and test rows. Results are
    written to ``./output/classification/{file_name}.npy``; if that file already
    exists, nothing is computed.

    Args:
        df: DataFrame with ``dataset``, ``split``, ``label``, ``dataset_id`` and the
            feature columns. Its index is reset in place.
        model: estimator to tune. It is modified in place (``set_params`` and ``fit``).
        model_type: model type added to the grid for ``TransformerModel`` instances.
        model_name: model name added to the grid for ``TransformerModel`` instances
            and stored in the results.
        cv: cross-validator exposing ``source`` and ``target`` attributes.
        params: parameter grid for ``GridSearchCV``. It is not modified.
        file_name: output file name (without extension) below ``./output/classification``.
        features: column name or collection of column names used as model input.

    Returns:
        None

    Raises:
        AssertionError: if the best parameters were not applied, or if the refitted
            dev macro F1 differs from the grid-search best score.
    """
    # Reset indices for consistent access. Done in place on purpose: the callers in
    # this file hand the same DataFrame object to the cross-validator.
    df.reset_index(drop=True, inplace=True)
    # Create the output path and check if file already exists
    path = create_non_existing_folders(f"./output/classification/{file_name}.npy")
    print(path)
    if os.path.exists(path):
        print(f"File already exists: {path}")
        return
    # Extend the parameter grid for TransformerModel on a copy, so the dict shared
    # by the caller across experiments is never mutated
    param_grid = params
    if isinstance(model, TransformerModel):
        param_grid = {
            **params,
            'model_type': [model_type],
            'model_name': [model_name]
        }
    # Initialize and fit GridSearchCV
    start_time = int(time.perf_counter())
    clf = GridSearchCV(model, param_grid, cv=cv, scoring='f1_macro', refit=False, verbose=3)
    clf.fit(select_features(df, features), df['label'].tolist())
    # Set and verify best parameters
    best_estimator = model.set_params(**clf.best_params_)
    for k, v in clf.best_params_.items():
        assert v == best_estimator.get_params()[k], f"Parameter {k} was not set correctly."
    # Split data into training, dev, and test sets; dev and test are explicit copies
    # because prediction columns are added to them below
    df_train = df[(df["dataset"].isin(cv.source)) & (df["split"] == "train")]
    df_dev = df[(df["dataset"].isin(cv.target)) & (df["split"] == "dev")].copy()
    df_test = df[(df["dataset"].isin(cv.target)) & (df["split"] == "test")].copy()
    # Select the features of the train, dev and test
    train_features = select_features(df_train, features)
    dev_features = select_features(df_dev, features)
    test_features = select_features(df_test, features)
    # Fit best estimator on training data
    best_estimator.fit(train_features, df_train["label"].tolist())
    # Generate predictions and store them as variables
    df_dev["prediction"] = best_estimator.predict(dev_features)
    df_test["prediction"] = best_estimator.predict(test_features)
    # Generate classification reports with class names ordered by their integer label
    target_names = [v for k, v in sorted(label2class.items(), key=lambda item: item[0])]
    report_dev = classification_report(df_dev["label"], df_dev["prediction"], output_dict=True, target_names=target_names)
    report_test = classification_report(df_test["label"], df_test["prediction"], output_dict=True, target_names=target_names)
    # Check for equivalence of grid search results and refit, but use tolerance as floating point numbers are compared
    assert math.isclose(clf.best_score_, report_dev["macro avg"]["f1-score"], rel_tol=1e-8, abs_tol=1e-8)
    # Check the type of best_estimator and retrieve the random_state accordingly
    rnd_state = None
    if isinstance(best_estimator, Pipeline):
        # For pipelines the seed lives on the final step
        last_step_name, last_step = best_estimator.steps[-1]
        rnd_state = last_step.random_state
    elif isinstance(best_estimator, TransformerModel):
        rnd_state = best_estimator.model.args.manual_seed
    else:
        rnd_state = best_estimator.random_state
    # Store predictions as class names and write the results of this configuration
    df_dev["prediction"] = df_dev["prediction"].replace(label2class)
    df_test["prediction"] = df_test["prediction"].replace(label2class)
    scores_params = [{
        "model": model_name,
        "random_state": rnd_state,
        "source": cv.source,
        "target": cv.target,
        "size": df.shape[0],
        "best_params": clf.best_params_,
        "dev_report": report_dev,
        "test_report": report_test,
        "time_sec": int(time.perf_counter()) - start_time,
        "df_dev": df_dev[["dataset_id", "prediction"]],
        "df_test": df_test[["dataset_id", "prediction"]]
    }]
    np.save(path, scores_params, allow_pickle=True)

In [None]:
def calculate_progress(executed, total, start_time):
    """Build the progress message printed before each experiment.

    Args:
        executed: number of experiments finished so far.
        total: total number of experiments.
        start_time: ``time.perf_counter()`` reading (in seconds) taken at the start.

    Returns:
        A three-line string with the finished count and percentage, the uptime as
        hours/minutes/seconds, and a trailing ``Current:`` line.
    """
    # Whole seconds elapsed since start_time, broken down into h/m/s
    elapsed = int(time.perf_counter()) - start_time
    hours = elapsed // 3600
    minutes = elapsed % 3600 // 60
    seconds = elapsed % 60
    # Share of finished experiments in percent, rounded to 2 decimal places
    progress = round(100 * (executed / total), 2)
    return f"Finished: {executed}/{total} ({progress}%)\nUptime {hours}h:{minutes}m:{seconds}s\nCurrent:"

In [None]:
def experiment(df, params, model, model_type, model_name, folder, experiment_type, features):
    """Run ``run_experiment`` for every source/target configuration of one experiment type.

    For ``"train-on-one-test-on-another"`` and ``"to-what-transformers-pay-attention"``
    every ordered pair of datasets (including a dataset paired with itself) is one
    configuration. For ``"leave-one-out"`` each dataset is the target once, with all
    remaining datasets as source.

    Args:
        df: DataFrame with a ``dataset`` column plus everything ``run_experiment`` needs.
        params: parameter grid forwarded to ``run_experiment``.
        model: estimator forwarded to ``run_experiment``.
        model_type: model type forwarded to ``run_experiment``.
        model_name: model name forwarded to ``run_experiment``.
        folder: sub-folder (below the experiment type) for the result files.
        experiment_type: one of the three experiment types named above.
        features: column name(s) forwarded to ``run_experiment``.

    Raises:
        AssertionError: if ``experiment_type`` is invalid, no configuration can be
            built, or the filtered data does not contain exactly the expected datasets.
    """
    assert experiment_type in ["train-on-one-test-on-another", "leave-one-out", "to-what-transformers-pay-attention"], "Experiment type is not valid"
    # Instantiate the checkpoint once before the loop (presumably so it is downloaded
    # and cached, and its console output can be cleared — TODO confirm). The instance
    # is deliberately not bound to a name, so it is not kept alive for the whole loop.
    if isinstance(model, TransformerModel):
        ClassificationModel(model_type, model_name, use_cuda=torch.cuda.is_available())
        clear_output(wait=True)
    # Generate a list of experiment configurations based on experiment_type
    unique_datasets = df["dataset"].unique()
    if experiment_type == "train-on-one-test-on-another" or experiment_type == "to-what-transformers-pay-attention":
        experiments = list(product(unique_datasets, repeat=2))
    elif experiment_type == "leave-one-out":
        experiments = [(unique_datasets[unique_datasets != out].tolist(), out) for out in unique_datasets]
    assert experiments, "Experiment must be defined"
    total_experiments = len(experiments)  # Total number of experiments
    start_time = int(time.perf_counter())  # Record the start time for progress tracking
    # Iterate through each experiment configuration
    for executed, (source, target) in enumerate(experiments):
        # Result file: named after the target for leave-one-out, otherwise after source and target
        base_path = f"{experiment_type}/{folder}/"
        path = f"{base_path}{target.lower()}" if experiment_type == "leave-one-out" else f"{base_path}{source.lower()}-{target.lower()}"
        # Ensure source and target are lists
        source = source if isinstance(source, list) else [source]
        target = target if isinstance(target, list) else [target]
        # Print the current progress
        print(calculate_progress(executed, total_experiments, start_time))
        # Filter the DataFrame to include only the relevant datasets for the current experiment
        df_ = df[df["dataset"].isin(source + target)]
        assert sorted(df_["dataset"].unique()) == sorted(set(source + target))
        # Initialize a custom cross-validator with the filtered DataFrame and current datasets
        cv = Fold(df=df_, source=source, target=target)
        # Run the experiment with the current configuration
        run_experiment(
            df=df_,
            model=model,
            model_type=model_type,
            model_name=model_name,
            cv=cv,
            params=params,
            file_name=path,
            features=features
        )
        # Clear the output and collect garbage to manage memory usage
        clear_output(wait=True)
        gc.collect()
    print("Finished")

# Setup

In [None]:
# Random baseline: uniform random predictions with an empty hyperparameter grid
# NOTE(review): the original comment said the random seed is "later changed";
# nothing in the visible cells changes it — confirm
random_params = {}
random_model = DummyClassifier(strategy="uniform", random_state=random_seed)

# Predefined POS tags, used below as the fixed vocabulary of the CountVectorizer
OPEN_CLASS_TAGS = ['ADJ', 'ADV', 'INTJ', 'NOUN', 'PROPN', 'VERB']
CLOSED_CLASS_TAGS = ['ADP', 'AUX', 'CCONJ', 'DET', 'NUM', 'PART', 'PRON', 'SCONJ']
OTHER_TAGS = ['PUNCT', 'SYM', 'X']
predefined_pos_tags = OPEN_CLASS_TAGS + CLOSED_CLASS_TAGS + OTHER_TAGS

# Grid for the decision-tree pipeline; keys address the 'kbst' and 'clf' steps below
# NOTE(review): scikit-learn documents "entropy" and "log_loss" as the same
# criterion — confirm that both are needed in the grid
dt_combined_params = {
    'kbst__k': [1, 2, 3, 4, 5, 6, 7, 'all'],
    'clf__max_depth': [1, 2, 3, 4, 5, None],
    'clf__criterion': ["gini", "entropy", "log_loss"],
}

# Decision-tree pipeline: element 0 of each sample goes through the POS-tag counter,
# the remaining elements are passed on unchanged (presumably numeric features — verify
# against the columns selected by the callers)
dt_combined_model = Pipeline([
    ('union', FeatureUnion([
        ('pos', Pipeline([
            ('selector', FeatureSelector(key=0)),
            ('cvect', CountVectorizer(vocabulary=predefined_pos_tags, lowercase=False))  # Count POS tags in the tag string using the predefined vocabulary
        ])),
        ('num', FeatureSelector(key=slice(1, None)))
    ])),
    ('vtrsh', VarianceThreshold()),  # Remove constant features
    ('kbst', SelectKBest(f_classif)),  # Select k-best features
    ('clf', DecisionTreeClassifier(random_state=random_seed))
])

# Parameters for transformer models, based on recommendations for text classification (GLUE) in the BERT/RoBERTa paper
# model_type and model_name are left unset here; run_experiment adds them to the grid
transformer_params = {
    'learning_rate': [2e-5, 3e-5, 4e-5, 5e-5],
    'num_train_epochs': [3],
    'batch_size': [32]
}
transformer_model = TransformerModel(random_state=random_seed)

# Classification

## Train on one, test on another

### Dummy

In [None]:
# Random baseline on the "sentence" column for every ordered (source, target) dataset pair
experiment(df_sample, random_params, random_model, "Random", "Random", folder="sample_374318/Random", experiment_type="train-on-one-test-on-another", features="sentence")

### Decision Tree

In [None]:
# Decision-tree pipeline on all columns from position 7 onward; the first of them is
# fed to the POS-tag counter of dt_combined_model, the rest are used as they are
experiment(df_sample, dt_combined_params, dt_combined_model, "DTree", "DTree", folder=f"sample_374318/DTree", experiment_type="train-on-one-test-on-another", features=df_sample.iloc[:, 7:].columns)

### Transformer

In [None]:
# Same experiment for four transformer checkpoints, each writing to its own result folder
experiment(df_sample, transformer_params, transformer_model, "bertweet", "TomatenMarc/WRAPresentations", folder="sample_374318/Wrap", experiment_type="train-on-one-test-on-another", features="sentence")
experiment(df_sample, transformer_params, transformer_model, "bert", "bert-base-uncased", folder="sample_374318/Bert", experiment_type="train-on-one-test-on-another", features="sentence")
experiment(df_sample, transformer_params, transformer_model, "roberta", "roberta-base", folder="sample_374318/Roberta", experiment_type="train-on-one-test-on-another", features="sentence")
experiment(df_sample, transformer_params, transformer_model, "distilbert", "distilbert-base-uncased", folder="sample_374318/Distilbert", experiment_type="train-on-one-test-on-another", features="sentence")

## To what transformers pay attention

In [None]:
# Keep only whitespace-separated tokens that are neither spaCy stop words nor punctuation
# NOTE(review): the stop-word test is case-sensitive, and `token not in string.punctuation`
# is a substring test on the punctuation string — confirm both are intended
df_sample["content_words"] = df_sample.sentence.apply(lambda row: " ".join([token for token in row.split() if token not in stop_words and token not in string.punctuation]))

In [None]:
# Pairwise source/target experiments on the reduced "content_words" text for the four checkpoints
experiment(df_sample, transformer_params, transformer_model, "bertweet", "TomatenMarc/WRAPresentations", folder="sample_374318/Wrap", experiment_type="to-what-transformers-pay-attention", features="content_words")
experiment(df_sample, transformer_params, transformer_model, "bert", "bert-base-uncased", folder="sample_374318/Bert", experiment_type="to-what-transformers-pay-attention", features="content_words")
experiment(df_sample, transformer_params, transformer_model, "roberta", "roberta-base", folder="sample_374318/Roberta", experiment_type="to-what-transformers-pay-attention", features="content_words")
experiment(df_sample, transformer_params, transformer_model, "distilbert", "distilbert-base-uncased", folder="sample_374318/Distilbert", experiment_type="to-what-transformers-pay-attention", features="content_words")

## Leave one out

### Dummy

In [None]:
# Random baseline: each dataset is the target once, all remaining datasets form the source
experiment(df_sample, random_params, random_model, "Random", "Random", folder="sample_374318/Random", experiment_type="leave-one-out", features="sentence")

### Decision Tree

In [None]:
# Decision-tree pipeline, leave-one-out, on all columns from position 7 onward
experiment(df_sample, dt_combined_params, dt_combined_model, "DTree", "DTree", folder=f"sample_374318/DTree", experiment_type="leave-one-out", features=df_sample.iloc[:, 7:].columns)

### Transformer

In [None]:
# Leave-one-out on the original "sentence" text for the four transformer checkpoints
experiment(df_sample, transformer_params, transformer_model, "bertweet", "TomatenMarc/WRAPresentations", folder="sample_374318/Wrap", experiment_type="leave-one-out", features="sentence")
experiment(df_sample, transformer_params, transformer_model, "bert", "bert-base-uncased", folder="sample_374318/Bert", experiment_type="leave-one-out", features="sentence")
experiment(df_sample, transformer_params, transformer_model, "roberta", "roberta-base", folder="sample_374318/Roberta", experiment_type="leave-one-out", features="sentence")
experiment(df_sample, transformer_params, transformer_model, "distilbert", "distilbert-base-uncased", folder="sample_374318/Distilbert", experiment_type="leave-one-out", features="sentence")

#### Manipulation

In [None]:
# Build "content_words": whitespace-separated tokens that are neither spaCy stop words
# nor punctuation (same expression as in the earlier content_words cell)
df_sample["content_words"] = df_sample.sentence.apply(lambda row: " ".join([token for token in row.split() if token not in stop_words and token not in string.punctuation]))

In [None]:
# Leave-one-out on the reduced "content_words" text; results go to separate "*_manipulated" folders
experiment(df_sample, transformer_params, transformer_model, "bertweet", "TomatenMarc/WRAPresentations", folder="sample_374318_manipulated/Wrap", experiment_type="leave-one-out", features="content_words")
experiment(df_sample, transformer_params, transformer_model, "bert", "bert-base-uncased", folder="sample_374318_manipulated/Bert", experiment_type="leave-one-out", features="content_words")
experiment(df_sample, transformer_params, transformer_model, "roberta", "roberta-base", folder="sample_374318_manipulated/Roberta", experiment_type="leave-one-out", features="content_words")
experiment(df_sample, transformer_params, transformer_model, "distilbert", "distilbert-base-uncased", folder="sample_374318_manipulated/Distilbert", experiment_type="leave-one-out", features="content_words")

# Clean Up

In [None]:
# Unassign the Colab runtime once all cells above have finished
runtime.unassign()