In [None]:
# Make predictions on the test set for MiniLM
minilm_predictions = trainer.predict(test_dataset)

# Save predicted probabilities
minilm_probabilities = torch.nn.functional.softmax(
    torch.tensor(minilm_predictions.predictions), dim=-1).numpy()
np.save('minilm_probabilities.npy', minilm_probabilities)

# Save the actual predicted labels
minilm_pred_labels = np.argmax(minilm_probabilities, axis=1)
np.save('minilm_pred_labels.npy', minilm_pred_labels)

In [2]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from datasets import Dataset
from sklearn import metrics
import os

In [3]:
# Load datasets
train_df = pd.read_csv(
    'C:/Users/91898/Code/fibe/dataset_fibe/train.csv', encoding='ISO-8859-1')
test_df = pd.read_csv(
    'C:/Users/91898/Code/fibe/dataset_fibe/test.csv', encoding='ISO-8859-1')
sample_submission = pd.read_csv(
    'C:/Users/91898/Code/fibe/dataset_fibe/sample_submission.csv', encoding='ISO-8859-1')

In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder


electra_probabilities = np.load('electra_probabilities.npy')
minilm_probabilities = np.load('minilm_probabilities.npy')
nofine_probabilities = np.load('nofine_model_probabilities.npy')
deberta_probabilities = np.load('deberta_probabilities.npy')

#ensemble_probabilities = (0.1 * electra_probabilities + 0.6 * minilm_probabilities + 0.3 * nofine_probabilities )

# Soft voting: Averaging the probabilities from both models
ensemble_probabilities = (deberta_probabilities + minilm_probabilities + nofine_probabilities + electra_probabilities) / 4


ensemble_pred_labels = np.argmax(ensemble_probabilities, axis=1)

label_encoder = LabelEncoder()
train_df = pd.read_csv('C:/Users/91898/Code/fibe/dataset_fibe/train.csv', encoding='ISO-8859-1')

label_encoder.fit(train_df['target'])
final_predictions = label_encoder.inverse_transform(ensemble_pred_labels)


submission_df = pd.DataFrame({
    'Index': 'Article_' + test_df.index.astype(str),
    'target': final_predictions
})
submission_df.to_csv('ensemble4_submission.csv', index=False)

# Display the first few rows of the submission file
print(submission_df.head())

       Index              target
0  Article_0  academic interests
1  Article_1             careers
2  Article_2              health
3  Article_3  academic interests
4  Article_4  academic interests


In [7]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Load the predicted probabilities from the base models
electra_probabilities = np.load('electra_probabilities.npy')
bert_probabilities = np.load('nofine_model_probabilities.npy')
deberta_probabilities = np.load('deberta_probabilities.npy')
minilm_probabilities = np.load('minilm_probabilities.npy')

# Load the original training data to get the true labels
train_df = pd.read_csv('train.csv', encoding='ISO-8859-1')
train_df['Index'] = 'Article_' + train_df.index.astype(str)
train_labels = train_df['target']

# Ensure all probability arrays have the same length as the training data
train_df = train_df.iloc[:len(electra_probabilities)]
train_labels = train_labels.iloc[:len(electra_probabilities)]

# Combine the predictions from all models into one single array
combined_probabilities = np.hstack(
    (electra_probabilities, bert_probabilities, deberta_probabilities, minilm_probabilities))

# Split into training and validation sets for the meta-learner
X_train, X_val, y_train, y_val = train_test_split(
    combined_probabilities, train_labels, test_size=0.1, random_state=42)

# Initialize the meta-model (Logistic Regression in this case)
meta_model = LogisticRegression(max_iter=1000, verbose=1)
meta_model.fit(X_train, y_train)

# Validate the meta-model
val_predictions = meta_model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_predictions)
val_f1 = f1_score(y_val, val_predictions, average='weighted')

print(f"Validation Accuracy of Stacked Model: {val_accuracy}")
print(f"Validation F1 Score of Stacked Model: {val_f1}")

# Apply the stacking model to the test set
test_df = pd.read_csv('test.csv', encoding='ISO-8859-1')

# Load predicted probabilities for the test set from each base model
electra_test_probabilities = np.load('electra_probabilities.npy')
bert_test_probabilities = np.load('nofine_model_probabilities.npy')
deberta_test_probabilities = np.load('deberta_test_probabilities.npy')
minilm_test_probabilities = np.load('minilm_test_probabilities.npy')

# Combine test probabilities
combined_test_probabilities = np.hstack(
    (electra_test_probabilities, bert_test_probabilities, deberta_test_probabilities, minilm_test_probabilities))

# Get final predictions from the meta-model
final_predictions = meta_model.predict(combined_test_probabilities)

# Create the submission file
submission_df = pd.DataFrame({
    'Index': 'Article_' + test_df.index.astype(str),
    'target': final_predictions
})
submission_df.to_csv('stacked_ensemble_submission.csv', index=False)

print("Stacked ensemble submission file generated successfully.")

FileNotFoundError: [Errno 2] No such file or directory: 'bert_probabilities.npy'

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from tensorflow import keras
from keras.layers import Dense, Dropout
from keras.models import Sequential, load_model
from keras.callbacks import EarlyStopping
from tqdm import tqdm

In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical


def load_data():
    # Load the predicted probabilities from the base models
    electra_probs = np.load('electra_train_probabilities.npy')
    bert_probs = np.load('nofine_train_probabilities.npy')
    deberta_probs = np.load('deberta_train_probabilities.npy')
    minilm_probs = np.load('minilm_train_probabilities.npy')

    # Load the original training data to get the true labels
    train_df = pd.read_csv('dataset_fibe/train.csv', encoding='ISO-8859-1')
    train_labels = train_df['target']

    # Ensure all arrays have the same length
    min_length = min(len(electra_probs), len(bert_probs), len(
        deberta_probs), len(minilm_probs), len(train_labels))
    electra_probs = electra_probs[:min_length]
    bert_probs = bert_probs[:min_length]
    deberta_probs = deberta_probs[:min_length]
    minilm_probs = minilm_probs[:min_length]
    train_labels = train_labels[:min_length]

    # Combine the predictions from all models
    X = np.hstack((electra_probs, bert_probs, deberta_probs, minilm_probs))
    y = train_labels

    return X, y


def create_model(input_dim, num_classes):
    model = Sequential([
        Dense(256, input_shape=(input_dim,), activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model


def train_stacking_model(X, y, n_splits=5):
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    y_onehot = to_categorical(y_encoded)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    oof_predictions = np.zeros((X.shape[0], y_onehot.shape[1]))
    test_predictions = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y_encoded), 1):
        print(f"Training fold {fold}")

        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y_onehot[train_idx], y_onehot[val_idx]

        model = create_model(X.shape[1], y_onehot.shape[1])

        early_stopping = EarlyStopping(
            monitor='val_loss', patience=10, restore_best_weights=True)

        history = model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=100,
            batch_size=32,
            callbacks=[early_stopping],
            verbose=0
        )

        oof_predictions[val_idx] = model.predict(X_val)

        # For test set predictions (if available)
        # test_pred = model.predict(X_test)
        # test_predictions.append(test_pred)

    # Convert OOF predictions to class labels
    oof_pred_labels = np.argmax(oof_predictions, axis=1)
    oof_pred_labels = label_encoder.inverse_transform(oof_pred_labels)

    # Calculate metrics
    accuracy = accuracy_score(y, oof_pred_labels)
    f1 = f1_score(y, oof_pred_labels, average='weighted')

    print(f"Out-of-fold Accuracy: {accuracy:.4f}")
    print(f"Out-of-fold F1 Score: {f1:.4f}")

    return model, label_encoder


def main():
    X, y = load_data()
    final_model, label_encoder = train_stacking_model(X, y)

    # Load and prepare test data
    test_df = pd.read_csv('dataset_fibe/test.csv', encoding='ISO-8859-1')
    electra_test = np.load('electra_probabilities.npy')
    bert_test = np.load('nofine_model_probabilities.npy')
    deberta_test = np.load('deberta_probabilities.npy')
    minilm_test = np.load('minilm_probabilities.npy')

    X_test = np.hstack((electra_test, bert_test, deberta_test, minilm_test))

    # Make predictions on test data
    test_predictions = final_model.predict(X_test)
    test_pred_labels = np.argmax(test_predictions, axis=1)
    test_pred_labels = label_encoder.inverse_transform(test_pred_labels)

    # Create submission file
    submission_df = pd.DataFrame({
        'Index': 'Article_' + test_df.index.astype(str),
        'target': test_pred_labels
    })
    submission_df.to_csv('stacked_ensemble_submission.csv', index=False)
    print("Stacked ensemble submission file generated successfully.")


if __name__ == "__main__":
    main()

Training fold 1


KeyboardInterrupt: 

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm

# Check for GPU availability and set memory growth
print("Num GPUs Available: ", len(
    tf.config.experimental.list_physical_devices('GPU')))
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("Memory growth set to True for all GPUs")
    except RuntimeError as e:
        print(e)
else:
    print("No GPUs found. Running on CPU.")


def load_data():
    # Load the predicted probabilities from the base models
    electra_probs = np.load('electra_train_probabilities.npy')
    bert_probs = np.load('nofine_train_probabilities.npy')
    deberta_probs = np.load('deberta_train_probabilities.npy')
    minilm_probs = np.load('minilm_train_probabilities.npy')

    # Load the original training data to get the true labels
    train_df = pd.read_csv('dataset_fibe/train.csv', encoding='ISO-8859-1')
    train_labels = train_df['target']

    # Ensure all arrays have the same length
    min_length = min(len(electra_probs), len(bert_probs), len(
        deberta_probs), len(minilm_probs), len(train_labels))
    electra_probs = electra_probs[:min_length]
    bert_probs = bert_probs[:min_length]
    deberta_probs = deberta_probs[:min_length]
    minilm_probs = minilm_probs[:min_length]
    train_labels = train_labels[:min_length]

    # Combine the predictions from all models
    X = np.hstack((electra_probs, bert_probs, deberta_probs, minilm_probs))
    y = train_labels

    print(f"Data loaded. X shape: {X.shape}, y shape: {y.shape}")
    return X, y


def create_model(input_dim, num_classes):
    model = Sequential([
        Dense(256, input_shape=(input_dim,), activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model


class TqdmProgressCallback(EarlyStopping):
    def __init__(self, epochs, verbose=1, **kwargs):
        super(TqdmProgressCallback, self).__init__(**kwargs)
        self.epochs = epochs
        self.verbose = verbose
        self.progbar = None

    def on_train_begin(self, logs=None):
        super().on_train_begin(logs)
        if self.verbose:
            self.progbar = tqdm(total=self.epochs, desc="Epochs", leave=False)

    def on_epoch_end(self, epoch, logs=None):
        super().on_epoch_end(epoch, logs)
        if self.verbose:
            self.progbar.update(1)
            self.progbar.set_postfix({
                'loss': f"{logs['loss']:.4f}",
                'val_loss': f"{logs['val_loss']:.4f}"
            })

    def on_train_end(self, logs=None):
        super().on_train_end(logs)
        if self.verbose:
            self.progbar.close()


def train_stacking_model(X, y, n_splits=5):
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    y_onehot = to_categorical(y_encoded)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    oof_predictions = np.zeros((X.shape[0], y_onehot.shape[1]))

    fold_progress = tqdm(enumerate(skf.split(X, y_encoded),
                         1), total=n_splits, desc="Folds")

    for fold, (train_idx, val_idx) in fold_progress:
        print(f"\nStarting fold {fold}")
        fold_progress.set_description(f"Training fold {fold}")

        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y_onehot[train_idx], y_onehot[val_idx]

        print(
            f"Fold {fold}: X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
        print(
            f"Fold {fold}: X_val shape: {X_val.shape}, y_val shape: {y_val.shape}")

        model = create_model(X.shape[1], y_onehot.shape[1])
        print("Model created successfully")

        tqdm_callback = TqdmProgressCallback(
            epochs=100,
            verbose=1,
            monitor='val_loss',
            patience=10,
            restore_best_weights=True
        )

        try:
            print("Starting model fitting...")
            # For the first epoch, we'll do step-by-step execution
            if fold == 1:
                print("Executing first batch...")
                model.train_on_batch(X_train[:32], y_train[:32])
                print("First batch executed successfully")

                print("Predicting on validation data...")
                model.predict(X_val[:32])
                print("Prediction on validation data successful")

            history = model.fit(
                X_train, y_train,
                validation_data=(X_val, y_val),
                epochs=100,
                batch_size=16,  # Reduced batch size
                callbacks=[tqdm_callback],
                verbose=1  # Changed to 1 for more detailed output
            )
            print(f"Model fitting completed for fold {fold}")
        except Exception as e:
            print(f"An error occurred during training: {str(e)}")
            raise e

        print(f"Generating OOF predictions for fold {fold}")
        oof_predictions[val_idx] = model.predict(X_val)
        print(f"OOF predictions generated for fold {fold}")

    fold_progress.close()

    # Convert OOF predictions to class labels
    oof_pred_labels = np.argmax(oof_predictions, axis=1)
    oof_pred_labels = label_encoder.inverse_transform(oof_pred_labels)

    # Calculate metrics
    accuracy = accuracy_score(y, oof_pred_labels)
    f1 = f1_score(y, oof_pred_labels, average='weighted')

    print(f"Out-of-fold Accuracy: {accuracy:.4f}")
    print(f"Out-of-fold F1 Score: {f1:.4f}")

    return model, label_encoder

def main():
    print("Loading data...")
    X, y = load_data()

    print("Training stacking model...")
    final_model, label_encoder = train_stacking_model(X, y)

    print("Loading test data...")
    test_df = pd.read_csv('test.csv', encoding='ISO-8859-1')
    electra_test = np.load('electra_probabilities.npy')
    bert_test = np.load('nofine_model_probabilities.npy')
    deberta_test = np.load('deberta_probabilities.npy')
    minilm_test = np.load('minilm_probabilities.npy')

    X_test = np.hstack((electra_test, bert_test, deberta_test, minilm_test))
    print(f"Test data shape: {X_test.shape}")

    print("Making predictions on test data...")
    test_predictions = final_model.predict(X_test, verbose=0)
    test_pred_labels = np.argmax(test_predictions, axis=1)
    test_pred_labels = label_encoder.inverse_transform(test_pred_labels)

    print("Generating submission file...")
    submission_df = pd.DataFrame({
        'Index': 'Article_' + test_df.index.astype(str),
        'target': test_pred_labels
    })
    submission_df.to_csv('stacked_ensemble_submission.csv', index=False)
    print("Stacked ensemble submission file generated successfully.")


if __name__ == "__main__":
    main()

Num GPUs Available:  1
Memory growth set to True for all GPUs
Loading data...
Data loaded. X shape: (627774, 104), y shape: (627774,)
Training stacking model...


Training fold 1:   0%|          | 0/5 [00:00<?, ?it/s]


Starting fold 1
Fold 1: X_train shape: (502219, 104), y_train shape: (502219, 26)
Fold 1: X_val shape: (125555, 104), y_val shape: (125555, 26)
Model created successfully
Starting model fitting...
Executing first batch...
First batch executed successfully
Predicting on validation data...
Prediction on validation data successful




Epoch 1/100



Epoch 2/100

Training fold 1:   0%|          | 0/5 [05:19<?, ?it/s]


KeyboardInterrupt: 

In [8]:
# Check the shapes of all individual model probabilities
print(
    f"Shape of DeBERTa train probabilities: {deberta_train_probabilities.shape}")
print(
    f"Shape of ELECTRA train probabilities: {electra_train_probabilities.shape}")
print(
    f"Shape of NoFine-tuned BERT train probabilities: {nofine_train_probabilities.shape}")
print(
    f"Shape of MiniLM train probabilities: {minilm_train_probabilities.shape}")

# Check the shape of the target labels
print(f"Shape of y_meta_train: {y_meta_train.shape}")

# Ensure all predicted probabilities have the same number of rows as y_meta_train
num_samples = len(y_meta_train)

# Truncate each model's probability array if necessary
deberta_train_probabilities = deberta_train_probabilities[:num_samples]
electra_train_probabilities = electra_train_probabilities[:num_samples]
nofine_train_probabilities = nofine_train_probabilities[:num_samples]
minilm_train_probabilities = minilm_train_probabilities[:num_samples]

# Concatenate the adjusted probabilities to form the meta-training set
X_meta_train = np.concatenate(
    [deberta_train_probabilities, electra_train_probabilities,
     nofine_train_probabilities, minilm_train_probabilities],
    axis=1
)

# Verify the shape of X_meta_train again
print(f"Shape of X_meta_train after adjustment: {X_meta_train.shape}")
print(f"Shape of y_meta_train: {y_meta_train.shape}")

# Now split the data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_meta_train, y_meta_train, test_size=0.1, random_state=42
)

Shape of DeBERTa train probabilities: (627774, 26)
Shape of ELECTRA train probabilities: (627774, 26)
Shape of NoFine-tuned BERT train probabilities: (627774, 26)
Shape of MiniLM train probabilities: (627774, 26)


NameError: name 'y_meta_train' is not defined

In [1]:
import numpy as np
import pandas as pd

# Load the predicted probabilities from each model
# Adjust the path as needed
probs_electra = np.load('electra_probabilities.npy')
probs_bert = np.load('nofine_model_probabilities.npy')
probs_minilm = np.load('minilm_probabilities.npy')
probs_deberta = np.load('deberta_probabilities.npy')  # DeBERTa probabilities

# Ensure all probability arrays have the same shape
assert probs_electra.shape == probs_bert.shape == probs_minilm.shape == probs_deberta.shape

In [2]:
# Perform soft voting by averaging the predicted probabilities
average_probs = (probs_electra + probs_bert + probs_minilm + probs_deberta) / 4

# Get the final predicted class labels by selecting the class with the highest average probability
final_predictions = np.argmax(average_probs, axis=1)

In [3]:
# Assuming your label_encoder was saved, let's load it
import joblib
# Replace with the correct path if different
label_encoder = joblib.load('label_encoder.pkl')

# Decode the numerical predictions to the original category labels
predicted_labels = label_encoder.inverse_transform(final_predictions)

In [4]:
# Load the test file to get the correct index values
test_df = pd.read_csv(
    'C:/Users/91898/Code/fibe/dataset_fibe/test.csv', encoding='ISO-8859-1')

# Create the submission DataFrame with correct index and target columns
submission_df = pd.DataFrame(
    {'Index': test_df['Index'], 'target': predicted_labels})

# Save the submission DataFrame to a CSV file
submission_df.to_csv('whyyyyyy_submission.csv', index=False)

# Display the first few rows to ensure correctness
print(submission_df.head())

       Index              target
0  Article_0  academic interests
1  Article_1             careers
2  Article_2              health
3  Article_3  academic interests
4  Article_4  academic interests


In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder  # Import LabelEncoder

# Load submission files generated by each model
deberta_df = pd.read_csv('deberta_submission.csv')
electra_df = pd.read_csv('electra_submission.csv')
minilm_df = pd.read_csv('mini2_submission.csv')
nofine_df = pd.read_csv('nofine_submission.csv')

# Assuming 'target' column contains the predictions
model_predictions = {
    "deberta": deberta_df['target'],
    "electra": electra_df['target'],
    "minilm": minilm_df['target'],
    "nofine": nofine_df['target']
}

# Convert to numerical format for weighted voting
label_encoder = LabelEncoder()  # Now LabelEncoder is defined
all_predictions = np.array([label_encoder.fit_transform(pred)
                           for pred in model_predictions.values()])

# Assign weights based on individual model performance
weights = np.array([0.4, 0.2, 0.3, 0.1])

# Perform weighted voting (multiplying weights by the one-hot encoding of each class prediction)
weighted_votes = np.zeros((len(deberta_df), len(label_encoder.classes_)))

for i, model_pred in enumerate(all_predictions):
    for j, class_index in enumerate(model_pred):
        weighted_votes[j, class_index] += weights[i]

# Final predictions based on the maximum weighted vote
final_predictions_indices = np.argmax(weighted_votes, axis=1)
final_predictions_labels = label_encoder.inverse_transform(
    final_predictions_indices)

# Create submission DataFrame
submission_df = pd.DataFrame(
    {'Index': deberta_df['Index'], 'target': final_predictions_labels})
submission_df.to_csv('weighted_voting_submission.csv', index=False)

In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score

# Load the predicted probabilities for the training set
deberta_train_probabilities = np.load(
    'deberta_full_train_probabilities.npy')  # DeBERTa probabilities
minilm_train_probabilities = np.load(
    'minilm_full_train_probabilities.npy')    # MiniLM probabilities

# Load your true labels for the training set
train_df = pd.read_csv(
    'C:/Users/91898/Code/fibe/dataset_fibe/train.csv', encoding='ISO-8859-1')
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_df['target'])

# Combine the probabilities to form the meta-features (X_train for meta-model)
X_train_meta = np.hstack((deberta_train_probabilities,
                          minilm_train_probabilities))

# Use the true labels as the target (y_train for meta-model)
y_train_meta = train_labels

In [10]:
# Split data into training and validation sets for meta-classifier training
X_meta_train, X_meta_val, y_meta_train, y_meta_val = train_test_split(
    X_train_meta, y_train_meta, test_size=0.2, random_state=42)

# Initialize XGBoost Classifier
meta_classifier = xgb.XGBClassifier(
    objective='multi:softmax',  # For classification tasks
    num_class=len(label_encoder.classes_),  # Number of classes
    n_estimators=1000,  # Number of trees
    max_depth=6,       # Depth of each tree
    learning_rate=0.1,  # Learning rate
    use_label_encoder=False,
    eval_metric="mlogloss",
    random_state=42,
    tree_method = "hist", 
    device = "cuda"  # If you want to use GPU acceleration
)

# Train the meta-classifier on the training data
meta_classifier.fit(X_meta_train, y_meta_train,
                    eval_set=[(X_meta_val, y_meta_val)],
                    early_stopping_rounds=10,
                    verbose=True)



[0]	validation_0-mlogloss:2.22619
[1]	validation_0-mlogloss:1.90317
[2]	validation_0-mlogloss:1.68073
[3]	validation_0-mlogloss:1.51079
[4]	validation_0-mlogloss:1.37409
[5]	validation_0-mlogloss:1.26058
[6]	validation_0-mlogloss:1.16439
[7]	validation_0-mlogloss:1.08169
[8]	validation_0-mlogloss:1.00982
[9]	validation_0-mlogloss:0.94686
[10]	validation_0-mlogloss:0.89138
[11]	validation_0-mlogloss:0.84227
[12]	validation_0-mlogloss:0.79863
[13]	validation_0-mlogloss:0.75975
[14]	validation_0-mlogloss:0.72500
[15]	validation_0-mlogloss:0.69383
[16]	validation_0-mlogloss:0.66591
[17]	validation_0-mlogloss:0.64082
[18]	validation_0-mlogloss:0.61830
[19]	validation_0-mlogloss:0.59803
[20]	validation_0-mlogloss:0.57975
[21]	validation_0-mlogloss:0.56329
[22]	validation_0-mlogloss:0.54846
[23]	validation_0-mlogloss:0.53503
[24]	validation_0-mlogloss:0.52291
[25]	validation_0-mlogloss:0.51198
[26]	validation_0-mlogloss:0.50208
[27]	validation_0-mlogloss:0.49315
[28]	validation_0-mlogloss:0.4

In [11]:
# Load the predicted probabilities for the test set
deberta_test_probabilities = np.load('deberta_probabilities.npy')
minilm_test_probabilities = np.load('minilm_probabilities.npy')

# Combine the test probabilities to form the meta-features (X_test for meta-model)
X_test_meta = np.hstack((deberta_test_probabilities,
                         minilm_test_probabilities))

# Predict using the trained meta-classifier
meta_predictions = meta_classifier.predict(X_test_meta)

# Convert numeric predictions back to the original labels
final_predictions = label_encoder.inverse_transform(meta_predictions)

In [12]:
# Load the test data to get the index column
test_df = pd.read_csv(
    'C:/Users/91898/Code/fibe/dataset_fibe/test.csv', encoding='ISO-8859-1')

# Create the submission DataFrame
submission_df = pd.DataFrame({
    # Assuming 'Index' column is present in your test CSV
    'Index': test_df['Index'],
    'target': final_predictions
})

# Save the submission file
submission_df.to_csv('meta_classifier_submission2.csv', index=False)

# Display the first few rows to ensure correctness
print(submission_df.head())

       Index              target
0  Article_0  academic interests
1  Article_1             careers
2  Article_2  academic interests
3  Article_3  academic interests
4  Article_4  academic interests


In [13]:
# Predict on the validation set
val_predictions = meta_classifier.predict(X_meta_val)

# Calculate accuracy and F1 score
accuracy = accuracy_score(y_meta_val, val_predictions)
f1 = f1_score(y_meta_val, val_predictions, average='weighted')

print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Validation F1 Score: {f1:.4f}")


Validation Accuracy: 0.8831
Validation F1 Score: 0.8832


In [18]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score, f1_score

# Define the LightGBM model
lightgbm_meta_model = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=len(label_encoder.classes_),
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=6,
    random_state=42,
    device='gpu',  # Use 'gpu' if you have a compatible GPU, else remove this line
)

# Train the LightGBM model
lightgbm_meta_model.fit(
    X_train_meta, y_train_meta,
    eval_set=[(X_meta_val, y_meta_val)],
)

# Predict on validation set
lightgbm_val_predictions = lightgbm_meta_model.predict(X_meta_val)

# Evaluate performance
lightgbm_accuracy = accuracy_score(y_meta_val, lightgbm_val_predictions)
lightgbm_f1 = f1_score(
    y_meta_val, lightgbm_val_predictions, average='weighted')
print(f"LightGBM Meta-Model Validation Accuracy: {lightgbm_accuracy:.4f}")
print(f"LightGBM Meta-Model Validation F1 Score: {lightgbm_f1:.4f}")

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 13260
[LightGBM] [Info] Number of data points in the train set: 697527, number of used features: 52
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3060 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 52 dense feature groups (34.59 MB) transferred to GPU in 0.033337 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score -2.478378
[LightGBM] [Info] Start training from score -3.275618
[LightGBM] [Info] Start training from score -3.215194
[LightGBM] [Info] Start training from score -2.847845
[LightGBM] [Info] Start training from score -3.213980
[LightGBM] [Info] Start training from score -3.130339
[LightGBM] [Info] Start training from score -3.181936
[LightGBM] [Info] Start training from score -3.393353
[LightGBM] [Info] Start tra

LightGBMError: Check failed: (best_split_info.left_count) > (0) at D:\a\1\s\lightgbm-python\src\treelearner\serial_tree_learner.cpp, line 846 .


In [19]:
from catboost import CatBoostClassifier

# Define the CatBoost model
catboost_meta_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.01,
    depth=6,
    loss_function='MultiClass',
    eval_metric='MultiClass',
    random_seed=42,
    task_type='GPU',  # Use 'GPU' if you have a compatible GPU, else use 'CPU'
    verbose=100
)

# Train the CatBoost model
catboost_meta_model.fit(X_train_meta, y_train_meta, eval_set=(
    X_meta_val, y_meta_val), early_stopping_rounds=50)

# Predict on validation set
catboost_val_predictions = catboost_meta_model.predict(X_meta_val)
# Flatten the predictions array
catboost_val_predictions = catboost_val_predictions.flatten()

# Evaluate performance
catboost_accuracy = accuracy_score(y_meta_val, catboost_val_predictions)
catboost_f1 = f1_score(
    y_meta_val, catboost_val_predictions, average='weighted')
print(f"CatBoost Meta-Model Validation Accuracy: {catboost_accuracy:.4f}")
print(f"CatBoost Meta-Model Validation F1 Score: {catboost_f1:.4f}")



0:	learn: 3.1869920	test: 3.1872050	best: 3.1872050 (0)	total: 129ms	remaining: 2m 8s
100:	learn: 1.5964124	test: 1.6127490	best: 1.6127490 (100)	total: 5.42s	remaining: 48.2s
200:	learn: 1.1102014	test: 1.1335193	best: 1.1335193 (200)	total: 10.7s	remaining: 42.4s
300:	learn: 0.8636026	test: 0.8906784	best: 0.8906784 (300)	total: 16s	remaining: 37.1s
400:	learn: 0.7130083	test: 0.7428167	best: 0.7428167 (400)	total: 21.3s	remaining: 31.8s
500:	learn: 0.6125032	test: 0.6444363	best: 0.6444363 (500)	total: 26.7s	remaining: 26.6s
600:	learn: 0.5495639	test: 0.5831553	best: 0.5831553 (600)	total: 32.1s	remaining: 21.3s
700:	learn: 0.5062327	test: 0.5411540	best: 0.5411540 (700)	total: 37.6s	remaining: 16s
800:	learn: 0.4771041	test: 0.5130277	best: 0.5130277 (800)	total: 43.1s	remaining: 10.7s
900:	learn: 0.4556245	test: 0.4921727	best: 0.4921727 (900)	total: 48.6s	remaining: 5.34s
999:	learn: 0.4407748	test: 0.4775970	best: 0.4775970 (999)	total: 54.1s	remaining: 0us
bestTest = 0.4775970

In [15]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score

# Check for GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load predicted probabilities from DeBERTa and MiniLM models
deberta_train_probabilities = np.load('deberta_full_train_probabilities.npy')
minilm_train_probabilities = np.load('minilm_full_train_probabilities.npy')

# Combine the probabilities as input features for the meta-model
X_train_meta = np.hstack(
    (deberta_train_probabilities, minilm_train_probabilities))

# Load true labels
train_df = pd.read_csv(
    'C:/Users/91898/Code/fibe/dataset_fibe/train.csv', encoding='ISO-8859-1')
label_encoder = LabelEncoder()
y_train_meta = label_encoder.fit_transform(train_df['target'])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_train_meta, y_train_meta, test_size=0.2, random_state=42)

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.long).to(device)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32).to(device)
y_val_tensor = torch.tensor(y_val, dtype=torch.long).to(device)

# Define a simple feed-forward neural network


class MetaLearnerNN(nn.Module):
    def __init__(self, input_size, num_classes):
        super(MetaLearnerNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)  # First hidden layer
        self.fc2 = nn.Linear(64, 32)          # Second hidden layer
        self.output = nn.Linear(32, num_classes)  # Output layer

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.output(x)
        return x


# Initialize the neural network model
input_size = X_train_tensor.shape[1]  # Number of features (input size)
num_classes = len(label_encoder.classes_)  # Number of target classes
model = MetaLearnerNN(input_size, num_classes).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training parameters
num_epochs = 20
batch_size = 64

# Training loop
for epoch in range(num_epochs):
    model.train()
    permutation = torch.randperm(X_train_tensor.size(0))

    for i in range(0, X_train_tensor.size(0), batch_size):
        optimizer.zero_grad()

        indices = permutation[i:i + batch_size]
        batch_X, batch_y = X_train_tensor[indices], y_train_tensor[indices]

        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        loss.backward()
        optimizer.step()

    # Validation after each epoch
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val_tensor)
        val_loss = criterion(val_outputs, y_val_tensor).item()

        # Calculate accuracy
        val_predictions = torch.argmax(val_outputs, dim=1)
        val_accuracy = accuracy_score(
            y_val_tensor.cpu(), val_predictions.cpu())
        val_f1 = f1_score(y_val_tensor.cpu(),
                          val_predictions.cpu(), average='weighted')

        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}, "
              f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}, Validation F1: {val_f1:.4f}")

# Predict on test set
deberta_test_probabilities = np.load('deberta_probabilities.npy')
minilm_test_probabilities = np.load('minilm_probabilities.npy')
X_test_meta = np.hstack(
    (deberta_test_probabilities, minilm_test_probabilities))
X_test_tensor = torch.tensor(X_test_meta, dtype=torch.float32).to(device)

model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    test_predictions = torch.argmax(test_outputs, dim=1)

# Convert predictions back to original labels
final_predictions = label_encoder.inverse_transform(
    test_predictions.cpu().numpy())

# Create submission file
test_df = pd.read_csv(
    'C:/Users/91898/Code/fibe/dataset_fibe/test.csv', encoding='ISO-8859-1')
submission_df = pd.DataFrame({
    'Index': test_df['Index'],
    'target': final_predictions
})
submission_df.to_csv('nn_meta_submission.csv', index=False)
print("Submission file 'nn_meta_submission.csv' generated successfully.")

Using device: cuda
Epoch [1/20], Loss: 0.4990, Validation Loss: 0.4946, Validation Accuracy: 0.8820, Validation F1: 0.8821
Epoch [2/20], Loss: 0.1115, Validation Loss: 0.4851, Validation Accuracy: 0.8813, Validation F1: 0.8816
Epoch [3/20], Loss: 0.2935, Validation Loss: 0.4717, Validation Accuracy: 0.8818, Validation F1: 0.8819
Epoch [4/20], Loss: 0.0496, Validation Loss: 0.4599, Validation Accuracy: 0.8813, Validation F1: 0.8813
Epoch [5/20], Loss: 0.1208, Validation Loss: 0.4475, Validation Accuracy: 0.8822, Validation F1: 0.8823
Epoch [6/20], Loss: 0.9564, Validation Loss: 0.4441, Validation Accuracy: 0.8808, Validation F1: 0.8808
Epoch [7/20], Loss: 0.1694, Validation Loss: 0.4404, Validation Accuracy: 0.8818, Validation F1: 0.8819
Epoch [8/20], Loss: 1.3750, Validation Loss: 0.4362, Validation Accuracy: 0.8817, Validation F1: 0.8817
Epoch [9/20], Loss: 0.0893, Validation Loss: 0.4354, Validation Accuracy: 0.8818, Validation F1: 0.8819
Epoch [10/20], Loss: 0.5197, Validation Loss: