In [None]:
!git clone https://github.com/pooja-premnath/CheckThat-Task2-Subjectivity

Cloning into 'CheckThat-Task2-Subjectivity'...
remote: Enumerating objects: 44, done.[K
remote: Counting objects: 100% (44/44), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 44 (delta 27), reused 34 (delta 17), pack-reused 0[K
Receiving objects: 100% (44/44), 179.73 KiB | 316.00 KiB/s, done.
Resolving deltas: 100% (27/27), done.


# Neural Network Architecture

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import f1_score

# Load data
df_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/train_en.tsv", sep='\t')
df_test_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/dev_test_en.tsv", sep='\t')
df_submission_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/test_en.tsv", sep='\t')

# Tokenize text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_final['sentence'])

# Encode text data to sequences
X_train = tokenizer.texts_to_sequences(df_final['sentence'])
X_test = tokenizer.texts_to_sequences(df_test_final['sentence'])
X_submission = tokenizer.texts_to_sequences(df_submission_final['sentence'])

# Pad sequences to ensure uniform length
max_sequence_length = max([len(seq) for seq in X_train + X_test + X_submission])
X_train = pad_sequences(X_train, maxlen=max_sequence_length, padding='post')
X_test = pad_sequences(X_test, maxlen=max_sequence_length, padding='post')
X_submission = pad_sequences(X_submission, maxlen=max_sequence_length, padding='post')

# Encode labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_final['label'])
y_test = label_encoder.transform(df_test_final['label'])

# Build neural network model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_sequence_length))
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Define early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
model.fit(X_train, y_train, epochs=30, batch_size=128, validation_data=(X_test, y_test), callbacks=[early_stopping])

# Predict probabilities for test set
y_pred_probs = model.predict(X_test)

# Convert probabilities to class labels
y_pred = np.argmax(y_pred_probs, axis=1)

# Calculate accuracy
accuracy = (y_pred == y_test).mean()
print(f'Accuracy on Test Set: {accuracy}')

# Calculate macro average F1 score
f1 = f1_score(y_test, y_pred, average='macro')
print(f'Macro Average F1 Score on Test Set: {f1}')

# Predict probabilities for submission data
y_submission_pred_probs = model.predict(X_submission)

# Convert probabilities to class labels
y_submission_pred = np.argmax(y_submission_pred_probs, axis=1)
predicted_labels = label_encoder.inverse_transform(y_submission_pred)

# Check unique predicted labels
unique_labels = np.unique(predicted_labels)
print(f'Unique Predicted Labels: {unique_labels}')

# Create dataframe with sentence_id and predicted labels
df_submission_predicted = pd.DataFrame({'sentence_id': df_submission_final['sentence_id'], 'predicted_label': predicted_labels})

# Save dataframe to TSV file
df_submission_predicted.to_csv('submission_predictions_neural_network.tsv', sep='\t', index=False)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Accuracy on Test Set: 0.4773662551440329
Macro Average F1 Score on Test Set: 0.3231197771587744
Unique Predicted Labels: ['OBJ']


# idk -Fine Tune the DNN as much as you can

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import f1_score

# Load data
df_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/train_en.tsv", sep='\t')
df_test_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/dev_test_en.tsv", sep='\t')
df_submission_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/test_en.tsv", sep='\t')

# Tokenize text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_final['sentence'])

# Encode text data to sequences
X_train = tokenizer.texts_to_sequences(df_final['sentence'])
X_test = tokenizer.texts_to_sequences(df_test_final['sentence'])
X_submission = tokenizer.texts_to_sequences(df_submission_final['sentence'])

# Pad sequences to ensure uniform length
max_sequence_length = max([len(seq) for seq in X_train + X_test + X_submission])
X_train = pad_sequences(X_train, maxlen=max_sequence_length, padding='post')
X_test = pad_sequences(X_test, maxlen=max_sequence_length, padding='post')
X_submission = pad_sequences(X_submission, maxlen=max_sequence_length, padding='post')

# Encode labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_final['label'])
y_test = label_encoder.transform(df_test_final['label'])

# Build deep neural network model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_sequence_length))
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))  # Additional layer
model.add(Dropout(0.5))  # Additional dropout
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Define early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
model.fit(X_train, y_train, epochs=30, batch_size=128, validation_data=(X_test, y_test), callbacks=[early_stopping])

# Predict probabilities for test set
y_pred_probs = model.predict(X_test)

# Convert probabilities to class labels
y_pred = np.argmax(y_pred_probs, axis=1)

# Calculate accuracy
accuracy = (y_pred == y_test).mean()
print(f'Accuracy on Test Set: {accuracy}')

# Calculate macro average F1 score
f1 = f1_score(y_test, y_pred, average='macro')
print(f'Macro Average F1 Score on Test Set: {f1}')

# Predict probabilities for submission data
y_submission_pred_probs = model.predict(X_submission)

# Convert probabilities to class labels
y_submission_pred = np.argmax(y_submission_pred_probs, axis=1)
predicted_labels = label_encoder.inverse_transform(y_submission_pred)

# Check unique predicted labels
unique_labels = np.unique(predicted_labels)
print(f'Unique Predicted Labels: {unique_labels}')

# Create dataframe with sentence_id and predicted labels
df_submission_predicted = pd.DataFrame({'sentence_id': df_submission_final['sentence_id'], 'predicted_label': predicted_labels})

# Save dataframe to TSV file
df_submission_predicted.to_csv('submission_predictions_deep_neural_network.tsv', sep='\t', index=False)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30


Exception ignored in: <function _xla_gc_callback at 0x7eb2b549caf0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/jax/_src/lib/__init__.py", line 98, in _xla_gc_callback
    def _xla_gc_callback(*args):
KeyboardInterrupt: 


Accuracy on Test Set: 0.4773662551440329
Macro Average F1 Score on Test Set: 0.3231197771587744
Unique Predicted Labels: ['OBJ']


# BERT

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score
import torch

# Load data
df_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/train_en.tsv", sep='\t')
df_test_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/dev_test_en.tsv", sep='\t')
df_submission_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/test_en.tsv", sep='\t')

# Tokenize text data using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
X_train = tokenizer(df_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')
X_test = tokenizer(df_test_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')
X_submission = tokenizer(df_submission_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')

# Encode labels
label_encoder = LabelEncoder()
y_train = torch.tensor(label_encoder.fit_transform(df_final['label']), dtype=torch.long)
y_test = torch.tensor(label_encoder.transform(df_test_final['label']), dtype=torch.long)

# Create DataLoader for training and testing
train_dataset = TensorDataset(X_train['input_ids'], X_train['attention_mask'], y_train)
test_dataset = TensorDataset(X_test['input_ids'], X_test['attention_mask'], y_test)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Define optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()
for epoch in range(3):  # Adjust the number of epochs as needed
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    scheduler.step()

# Evaluate the model on the test set
model.eval()
y_pred = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        outputs = model(**inputs)
        logits = outputs.logits
        pred_labels = torch.argmax(logits, dim=1)
        y_pred.extend(pred_labels.cpu().numpy())

# Calculate accuracy
accuracy = (y_pred == y_test.cpu().numpy()).mean()
print(f'Accuracy on Test Set: {accuracy}')

# Calculate macro average F1 score
f1 = f1_score(y_test.cpu().numpy(), y_pred, average='macro')
print(f'Macro Average F1 Score on Test Set: {f1}')

# Predict probabilities for submission data
submission_dataset = TensorDataset(X_submission['input_ids'], X_submission['attention_mask'])
submission_dataloader = DataLoader(submission_dataset, batch_size=16, shuffle=False)

model.eval()
y_submission_pred_probs = []
with torch.no_grad():
    for batch in submission_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        y_submission_pred_probs.extend(probs.cpu().numpy())

# Convert probabilities to class labels
y_submission_pred = np.argmax(y_submission_pred_probs, axis=1)
predicted_labels = label_encoder.inverse_transform(y_submission_pred)

# Check unique predicted labels
unique_labels = np.unique(predicted_labels)
print(f'Unique Predicted Labels: {unique_labels}')

# Create dataframe with sentence_id and predicted labels
df_submission_predicted = pd.DataFrame({'sentence_id': df_submission_final['sentence_id'], 'predicted_label': predicted_labels})

# Save dataframe to TSV file
df_submission_predicted.to_csv('submission_predictions_bert.tsv', sep='\t', index=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy on Test Set: 0.7407407407407407
Macro Average F1 Score on Test Set: 0.7401069518716578
Unique Predicted Labels: ['OBJ' 'SUBJ']


# Large BERT


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score
import torch

# Load data
df_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/train_en.tsv", sep='\t')
df_test_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/dev_test_en.tsv", sep='\t')
df_submission_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/test_en.tsv", sep='\t')

# Tokenize text data using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
X_train = tokenizer(df_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')
X_test = tokenizer(df_test_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')
X_submission = tokenizer(df_submission_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')

# Encode labels
label_encoder = LabelEncoder()
y_train = torch.tensor(label_encoder.fit_transform(df_final['label']), dtype=torch.long)
y_test = torch.tensor(label_encoder.transform(df_test_final['label']), dtype=torch.long)

# Create DataLoader for training and testing
train_dataset = TensorDataset(X_train['input_ids'], X_train['attention_mask'], y_train)
test_dataset = TensorDataset(X_test['input_ids'], X_test['attention_mask'], y_test)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=len(label_encoder.classes_))

# Define optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()
for epoch in range(3):  # Adjust the number of epochs as needed
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    scheduler.step()

# Evaluate the model on the test set
model.eval()
y_pred = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        outputs = model(**inputs)
        logits = outputs.logits
        pred_labels = torch.argmax(logits, dim=1)
        y_pred.extend(pred_labels.cpu().numpy())

# Calculate accuracy
accuracy = (y_pred == y_test.cpu().numpy()).mean()
print(f'Accuracy on Test Set: {accuracy}')

# Calculate macro average F1 score
f1 = f1_score(y_test.cpu().numpy(), y_pred, average='macro')
print(f'Macro Average F1 Score on Test Set: {f1}')

# Predict probabilities for submission data
submission_dataset = TensorDataset(X_submission['input_ids'], X_submission['attention_mask'])
submission_dataloader = DataLoader(submission_dataset, batch_size=16, shuffle=False)

model.eval()
y_submission_pred_probs = []
with torch.no_grad():
    for batch in submission_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        y_submission_pred_probs.extend(probs.cpu().numpy())

# Convert probabilities to class labels
y_submission_pred = np.argmax(y_submission_pred_probs, axis=1)
predicted_labels = label_encoder.inverse_transform(y_submission_pred)

# Check unique predicted labels
unique_labels = np.unique(predicted_labels)
print(f'Unique Predicted Labels: {unique_labels}')

# Create dataframe with sentence_id and predicted labels
df_submission_predicted = pd.DataFrame({'sentence_id': df_submission_final['sentence_id'], 'predicted_label': predicted_labels})

# Save dataframe to TSV file
df_submission_predicted.to_csv('submission_predictions_bert_large.tsv', sep='\t', index=False)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy on Test Set: 0.7037037037037037
Macro Average F1 Score on Test Set: 0.7028532608695652
Unique Predicted Labels: ['OBJ' 'SUBJ']


# mBERT

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score
import torch

# Load data
df_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/train_en.tsv", sep='\t')
df_test_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/dev_test_en.tsv", sep='\t')
df_submission_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/test_en.tsv", sep='\t')

# Tokenize text data using mBERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
X_train = tokenizer(df_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')
X_test = tokenizer(df_test_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')
X_submission = tokenizer(df_submission_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')

# Encode labels
label_encoder = LabelEncoder()
y_train = torch.tensor(label_encoder.fit_transform(df_final['label']), dtype=torch.long)
y_test = torch.tensor(label_encoder.transform(df_test_final['label']), dtype=torch.long)

# Create DataLoader for training and testing
train_dataset = TensorDataset(X_train['input_ids'], X_train['attention_mask'], y_train)
test_dataset = TensorDataset(X_test['input_ids'], X_test['attention_mask'], y_test)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Load pre-trained mBERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=len(label_encoder.classes_))

# Define optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()
for epoch in range(3):  # Adjust the number of epochs as needed
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    scheduler.step()

# Evaluate the model on the test set
model.eval()
y_pred = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        outputs = model(**inputs)
        logits = outputs.logits
        pred_labels = torch.argmax(logits, dim=1)
        y_pred.extend(pred_labels.cpu().numpy())

# Calculate accuracy
accuracy = (y_pred == y_test.cpu().numpy()).mean()
print(f'Accuracy on Test Set: {accuracy}')

# Calculate macro average F1 score
f1 = f1_score(y_test.cpu().numpy(), y_pred, average='macro')
print(f'Macro Average F1 Score on Test Set: {f1}')

# Predict probabilities for submission data
submission_dataset = TensorDataset(X_submission['input_ids'], X_submission['attention_mask'])
submission_dataloader = DataLoader(submission_dataset, batch_size=16, shuffle=False)

model.eval()
y_submission_pred_probs = []
with torch.no_grad():
    for batch in submission_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        y_submission_pred_probs.extend(probs.cpu().numpy())

# Convert probabilities to class labels
y_submission_pred = np.argmax(y_submission_pred_probs, axis=1)
predicted_labels = label_encoder.inverse_transform(y_submission_pred)

# Check unique predicted labels
unique_labels = np.unique(predicted_labels)
print(f'Unique Predicted Labels: {unique_labels}')

# Create dataframe with sentence_id and predicted labels
df_submission_predicted = pd.DataFrame({'sentence_id': df_submission_final['sentence_id'], 'predicted_label': predicted_labels})

# Save dataframe to TSV file
df_submission_predicted.to_csv('submission_predictions_mbert.tsv', sep='\t', index=False)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy on Test Set: 0.7078189300411523
Macro Average F1 Score on Test Set: 0.7012622720897617
Unique Predicted Labels: ['OBJ' 'SUBJ']


# RobertA

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score
import torch

# Load data
df_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/train_en.tsv", sep='\t')
df_test_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/dev_test_en.tsv", sep='\t')
df_submission_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/test_en.tsv", sep='\t')

# Tokenize text data using RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
X_train = tokenizer(df_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')
X_test = tokenizer(df_test_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')
X_submission = tokenizer(df_submission_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')

# Encode labels
label_encoder = LabelEncoder()
y_train = torch.tensor(label_encoder.fit_transform(df_final['label']), dtype=torch.long)
y_test = torch.tensor(label_encoder.transform(df_test_final['label']), dtype=torch.long)

# Create DataLoader for training and testing
train_dataset = TensorDataset(X_train['input_ids'], X_train['attention_mask'], y_train)
test_dataset = TensorDataset(X_test['input_ids'], X_test['attention_mask'], y_test)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Load pre-trained RoBERTa model for sequence classification
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Define optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()
for epoch in range(10):  # Adjust the number of epochs as needed
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    scheduler.step()

# Evaluate the model on the test set
model.eval()
y_pred = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        outputs = model(**inputs)
        logits = outputs.logits
        pred_labels = torch.argmax(logits, dim=1)
        y_pred.extend(pred_labels.cpu().numpy())

# Calculate accuracy
accuracy = (y_pred == y_test.cpu().numpy()).mean()
print(f'Accuracy on Test Set: {accuracy}')

# Calculate macro average F1 score
f1 = f1_score(y_test.cpu().numpy(), y_pred, average='macro')
print(f'Macro Average F1 Score on Test Set: {f1}')

# Predict probabilities for submission data
submission_dataset = TensorDataset(X_submission['input_ids'], X_submission['attention_mask'])
submission_dataloader = DataLoader(submission_dataset, batch_size=16, shuffle=False)

model.eval()
y_submission_pred_probs = []
with torch.no_grad():
    for batch in submission_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        y_submission_pred_probs.extend(probs.cpu().numpy())

# Convert probabilities to class labels
y_submission_pred = np.argmax(y_submission_pred_probs, axis=1)
predicted_labels = label_encoder.inverse_transform(y_submission_pred)

# Check unique predicted labels
unique_labels = np.unique(predicted_labels)
print(f'Unique Predicted Labels: {unique_labels}')

# Create dataframe with sentence_id and predicted labels
df_submission_predicted = pd.DataFrame({'sentence_id': df_submission_final['sentence_id'], 'predicted_label': predicted_labels})

# Save dataframe to TSV file
df_submission_predicted.to_csv('submission_predictions_roberta.tsv', sep='\t', index=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy on Test Set: 0.7983539094650206
Macro Average F1 Score on Test Set: 0.798230898276651
Unique Predicted Labels: ['OBJ' 'SUBJ']


# RoBERTa with TextBlob Features

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score
import torch
from textblob import TextBlob

# Load data
df_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/train_en.tsv", sep='\t')
df_test_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/dev_test_en.tsv", sep='\t')
df_submission_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/test_en.tsv", sep='\t')

# Tokenize text data using RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
X_train = tokenizer(df_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')
X_test = tokenizer(df_test_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')
X_submission = tokenizer(df_submission_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')

# Encode labels
label_encoder = LabelEncoder()
y_train = torch.tensor(label_encoder.fit_transform(df_final['label']), dtype=torch.long)
y_test = torch.tensor(label_encoder.transform(df_test_final['label']), dtype=torch.long)

# Calculate TextBlob subjectivity scores
def calculate_subjectivity(text):
    return TextBlob(text).sentiment.subjectivity

df_final['subjectivity'] = df_final['sentence'].apply(calculate_subjectivity)
df_test_final['subjectivity'] = df_test_final['sentence'].apply(calculate_subjectivity)
df_submission_final['subjectivity'] = df_submission_final['sentence'].apply(calculate_subjectivity)

# Convert subjectivity scores to tensor
subjectivity_train = torch.tensor(df_final['subjectivity'].values, dtype=torch.float).unsqueeze(1)
subjectivity_test = torch.tensor(df_test_final['subjectivity'].values, dtype=torch.float).unsqueeze(1)
subjectivity_submission = torch.tensor(df_submission_final['subjectivity'].values, dtype=torch.float).unsqueeze(1)

# Create DataLoader for subjectivity scores
subjectivity_train_dataset = TensorDataset(subjectivity_train, y_train)
subjectivity_test_dataset = TensorDataset(subjectivity_test, y_test)
subjectivity_train_dataloader = DataLoader(subjectivity_train_dataset, batch_size=16, shuffle=True)
subjectivity_test_dataloader = DataLoader(subjectivity_test_dataset, batch_size=16, shuffle=False)

# Load pre-trained RoBERTa model for sequence classification
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Define optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

# Train the model with subjectivity scores included
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()
train_dataset = TensorDataset(X_train['input_ids'], X_train['attention_mask'], y_train, subjectivity_train)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
for epoch in range(3):  # Adjust the number of epochs as needed
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2],
            'subjectivity': batch[3]  # Include subjectivity scores as input
        }
        optimizer.zero_grad()
        outputs = model(inputs['input_ids'], attention_mask=inputs['attention_mask'], labels=inputs['labels'])
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    scheduler.step()

# Evaluate the model on the test set
model.eval()
test_dataset = TensorDataset(X_test['input_ids'], X_test['attention_mask'], y_test, subjectivity_test)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)
y_pred = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'subjectivity': batch[3]  # Include subjectivity scores as input
        }
        outputs = model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
        logits = outputs.logits
        pred_labels = torch.argmax(logits, dim=1)
        y_pred.extend(pred_labels.cpu().numpy())

# Calculate accuracy
accuracy = (y_pred == y_test.cpu().numpy()).mean()
print(f'Accuracy on Test Set: {accuracy}')

# Calculate macro average F1 score
f1 = f1_score(y_test.cpu().numpy(), y_pred, average='macro')
print(f'Macro Average F1 Score on Test Set: {f1}')

# Predict probabilities for submission data
submission_dataset = TensorDataset(X_submission['input_ids'], X_submission['attention_mask'], subjectivity_submission)
submission_dataloader = DataLoader(submission_dataset, batch_size=16, shuffle=False)

model.eval()
y_submission_pred_probs = []
with torch.no_grad():
    for batch in submission_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'subjectivity': batch[2]  # Include subjectivity scores as input
        }
        outputs = model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        y_submission_pred_probs.extend(probs.cpu().numpy())

# Convert probabilities to class labels
y_submission_pred = np.argmax(y_submission_pred_probs, axis=1)
predicted_labels = label_encoder.inverse_transform(y_submission_pred)

# Check unique predicted labels
unique_labels = np.unique(predicted_labels)
print(f'Unique Predicted Labels: {unique_labels}')

# Create dataframe with sentence_id, predicted labels, and subjectivity scores
df_submission_predicted = pd.DataFrame({
    'sentence_id': df_submission_final['sentence_id'],
    'predicted_label': predicted_labels,
    'subjectivity_score': df_submission_final['subjectivity']
})

# Save dataframe to TSV file
df_submission_predicted.to_csv('submission_predictions_roberta_with_subjectivity.tsv', sep='\t', index=False)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy on Test Set: 0.7489711934156379
Macro Average F1 Score on Test Set: 0.7475862068965518
Unique Predicted Labels: ['OBJ' 'SUBJ']


# Analyzing Misclassifications in RoBERTa

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score
import torch

# Load data
df_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/train_en.tsv", sep='\t')
df_test_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/dev_test_en.tsv", sep='\t')
df_submission_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/test_en.tsv", sep='\t')

# Tokenize text data using RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
X_train = tokenizer(df_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')
X_test = tokenizer(df_test_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')
X_submission = tokenizer(df_submission_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')

# Encode labels
label_encoder = LabelEncoder()
y_train = torch.tensor(label_encoder.fit_transform(df_final['label']), dtype=torch.long)
y_test = torch.tensor(label_encoder.transform(df_test_final['label']), dtype=torch.long)

# Create DataLoader for training and testing
train_dataset = TensorDataset(X_train['input_ids'], X_train['attention_mask'], y_train)
test_dataset = TensorDataset(X_test['input_ids'], X_test['attention_mask'], y_test)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Load pre-trained RoBERTa model for sequence classification
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Define optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()
for epoch in range(3):  # Adjust the number of epochs as needed
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    scheduler.step()

# Evaluate the model on the test set
model.eval()
y_pred = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        outputs = model(**inputs)
        logits = outputs.logits
        pred_labels = torch.argmax(logits, dim=1)
        y_pred.extend(pred_labels.cpu().numpy())

# Calculate accuracy
accuracy = (y_pred == y_test.cpu().numpy()).mean()
print(f'Accuracy on Test Set: {accuracy}')

# Calculate macro average F1 score
f1 = f1_score(y_test.cpu().numpy(), y_pred, average='macro')
print(f'Macro Average F1 Score on Test Set: {f1}')

# Predict probabilities for submission data
submission_dataset = TensorDataset(X_submission['input_ids'], X_submission['attention_mask'])
submission_dataloader = DataLoader(submission_dataset, batch_size=16, shuffle=False)

model.eval()
y_submission_pred_probs = []
with torch.no_grad():
    for batch in submission_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        y_submission_pred_probs.extend(probs.cpu().numpy())

# Convert probabilities to class labels
y_submission_pred = np.argmax(y_submission_pred_probs, axis=1)
predicted_labels = label_encoder.inverse_transform(y_submission_pred)

# Check unique predicted labels
unique_labels = np.unique(predicted_labels)
print(f'Unique Predicted Labels: {unique_labels}')

# Create dataframe with sentence_id and predicted labels
df_submission_predicted = pd.DataFrame({'sentence_id': df_submission_final['sentence_id'], 'predicted_label': predicted_labels})

# Save dataframe to TSV file
df_submission_predicted.to_csv('submission_predictions_roberta.tsv', sep='\t', index=False)

# Collect incorrect predictions and corresponding true labels
incorrect_predictions = []
true_labels = df_test_final['label'].tolist()  # Assuming 'label' is the column name in df_test_final

for i in range(len(true_labels)):
    if y_pred[i] != y_test.cpu().numpy()[i]:
        incorrect_predictions.append((df_test_final['sentence'].iloc[i], label_encoder.inverse_transform([y_pred[i]])[0], true_labels[i]))

# Print incorrect predictions and corresponding true labels
for idx, (sentence, predicted_label, true_label) in enumerate(incorrect_predictions, 1):
    print(f'Incorrect Prediction {idx}:')
    print(f'Sentence: {sentence}')
    print(f'Predicted Label: {predicted_label}')
    print(f'True Label: {true_label}')
    print()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy on Test Set: 0.7983539094650206
Macro Average F1 Score on Test Set: 0.7978609625668449
Unique Predicted Labels: ['OBJ' 'SUBJ']
Incorrect Prediction 1:
Sentence: The Social Distance Between Us shows every sign of emulating the critical and popular success of Poverty Safari.
Predicted Label: SUBJ
True Label: OBJ

Incorrect Prediction 2:
Sentence: History shows that McCarthy and McConnell, like so many Republican leaders, collapse after demands from the left.
Predicted Label: SUBJ
True Label: OBJ

Incorrect Prediction 3:
Sentence: It’s only when you have fifty illegal aliens end up in a very wealthy, rich sanctuary enclave that he decides to scramble.”  Due to his successful policies in Florida, donations are flowing into this campaign coffers.
Predicted Label: OBJ
True Label: SUBJ

Incorrect Prediction 4:
Sentence: DeSantis believes in freedom and is constantly fighting the “woke” left on a range of issues.
Predicted Label: OBJ
True Label: SUBJ

Incorrect Prediction 5:
Sentence:

# RoBERTa with POS


In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize, pos_tag
from sklearn.preprocessing import LabelEncoder
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch
from sklearn.metrics import accuracy_score, f1_score

# Download NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Load data
df_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/train_en.tsv", sep='\t')
df_test_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/dev_test_en.tsv", sep='\t')
df_submission_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/test_en.tsv", sep='\t')

# Tokenize text data using RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
X_train_text = df_final['sentence'].tolist()
X_test_text = df_test_final['sentence'].tolist()
X_submission_text = df_submission_final['sentence'].tolist()

# POS tagging
def get_pos_features(sentences):
    pos_features = []
    for sentence in sentences:
        tokens = word_tokenize(sentence)
        pos_tags = pos_tag(tokens)
        pos_tags_list = [tag for _, tag in pos_tags]
        pos_features.append(pos_tags_list)
    return pos_features

X_train_pos = get_pos_features(X_train_text)
X_test_pos = get_pos_features(X_test_text)
X_submission_pos = get_pos_features(X_submission_text)

# Define feature extraction functions based on POS tags
def extract_objectivity_features(pos_tags_list, max_length):
    objectivity_features = np.zeros(max_length)  # Initialize with zeros
    for i, pos_tag in enumerate(pos_tags_list):
        if pos_tag in ['``', "''", 'VBD', 'PRP', 'CD']:
            objectivity_features[i] = 1
    return objectivity_features

def extract_subjectivity_features(pos_tags_list, max_length):
    subjectivity_features = np.zeros(max_length)  # Initialize with zeros
    for i, pos_tag in enumerate(pos_tags_list):
        if pos_tag in ['VB', 'VBG', 'VBP', 'VBZ', '!', '?', 'PRP$', 'JJR', 'JJS']:
            subjectivity_features[i] = 1
    return subjectivity_features

# Find the maximum number of POS tags in your data
max_length_train = max(len(tags) for tags in X_train_pos)
max_length_submission = max(len(tags) for tags in X_submission_pos)

# Extract features with consistent lengths
X_train_objectivity = np.array([extract_objectivity_features(tags, max_length_train) for tags in X_train_pos])
X_train_subjectivity = np.array([extract_subjectivity_features(tags, max_length_train) for tags in X_train_pos])

# Concatenate POS tag-based features with text tokenized features
X_train_combined = {'input_ids': X_train_text, 'attention_mask': np.concatenate((X_train_objectivity, X_train_subjectivity), axis=1)}

# Tokenize text data using RoBERTa tokenizer
X_train_tokenized = tokenizer(X_train_combined['input_ids'], truncation=True, padding=True)
X_train_input_ids = X_train_tokenized['input_ids']
X_train_attention_mask = X_train_tokenized['attention_mask']

# Convert input_ids and attention_mask to numpy arrays with correct data type
X_train_input_ids = np.array(X_train_input_ids, dtype=np.int64)
X_train_attention_mask = np.array(X_train_attention_mask, dtype=np.int64)

# Encode labels
label_encoder = LabelEncoder()
y_train = torch.tensor(label_encoder.fit_transform(df_final['label']), dtype=torch.long)

# Create DataLoader for training
train_dataset = TensorDataset(torch.tensor(X_train_input_ids), torch.tensor(X_train_attention_mask), y_train)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Load pre-trained RoBERTa model for sequence classification
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Define optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()
for epoch in range(3):  # Adjust the number of epochs as needed
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    scheduler.step()

# Tokenize test data using RoBERTa tokenizer
X_test_tokenized = tokenizer(X_test_text, truncation=True, padding=True)
X_test_input_ids = X_test_tokenized['input_ids']
X_test_attention_mask = X_test_tokenized['attention_mask']

# Encode test labels
y_test_true = torch.tensor(label_encoder.transform(df_test_final['label']), dtype=torch.long).to(device)

# Create DataLoader for test data
test_dataset = TensorDataset(torch.tensor(X_test_input_ids), torch.tensor(X_test_attention_mask), y_test_true)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Evaluate the model on the test data
model.eval()
y_test_pred = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        labels = batch[2]
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        y_test_pred.extend(torch.argmax(probs, dim=1).cpu().numpy())

# Calculate accuracy and macro average F1 score
accuracy = accuracy_score(y_test_true.cpu().numpy(), y_test_pred)
macro_f1 = f1_score(y_test_true.cpu().numpy(), y_test_pred, average='macro')

print(f'Accuracy on Test Data: {accuracy:.4f}')
print(f'Macro Average F1 on Test Data: {macro_f1:.4f}')

# Tokenize submission data using RoBERTa tokenizer
X_submission_tokenized = tokenizer(X_submission_text, truncation=True, padding=True)
X_submission_input_ids = X_submission_tokenized['input_ids']
X_submission_attention_mask = X_submission_tokenized['attention_mask']

# Create DataLoader for submission data
submission_dataset = TensorDataset(torch.tensor(X_submission_input_ids), torch.tensor(X_submission_attention_mask))
submission_dataloader = DataLoader(submission_dataset, batch_size=16, shuffle=False)

# Evaluate the model on the submission data
model.eval()
y_submission_pred_probs = []
with torch.no_grad():
    for batch in submission_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        y_submission_pred_probs.extend(probs.cpu().numpy())

# Convert probabilities to class labels for submission data
y_submission_pred = np.argmax(y_submission_pred_probs, axis=1)
predicted_labels = label_encoder.inverse_transform(y_submission_pred)

# Create dataframe with sentence_id and predicted labels for submission
df_submission_predicted = pd.DataFrame({'sentence_id': df_submission_final['sentence_id'], 'predicted_label': predicted_labels})

# Save dataframe to TSV file
df_submission_predicted.to_csv('submission_predictions_roberta.tsv', sep='\t', index=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy on Test Data: 0.6872
Macro Average F1 on Test Data: 0.6740


# XLM-RobertA

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score
import torch

# Load data
df_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/train_en.tsv", sep='\t')
df_test_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/dev_test_en.tsv", sep='\t')
df_submission_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/test_en.tsv", sep='\t')

# Tokenize text data using XLM-RoBERTa tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
X_train = tokenizer(df_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')
X_test = tokenizer(df_test_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')
X_submission = tokenizer(df_submission_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')

# Encode labels
label_encoder = LabelEncoder()
y_train = torch.tensor(label_encoder.fit_transform(df_final['label']), dtype=torch.long)
y_test = torch.tensor(label_encoder.transform(df_test_final['label']), dtype=torch.long)

# Create DataLoader for training and testing
train_dataset = TensorDataset(X_train['input_ids'], X_train['attention_mask'], y_train)
test_dataset = TensorDataset(X_test['input_ids'], X_test['attention_mask'], y_test)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Load pre-trained XLM-RoBERTa model for sequence classification
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=len(label_encoder.classes_))

# Define optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()
for epoch in range(3):  # Adjust the number of epochs as needed
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    scheduler.step()

# Evaluate the model on the test set
model.eval()
y_pred = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        outputs = model(**inputs)
        logits = outputs.logits
        pred_labels = torch.argmax(logits, dim=1)
        y_pred.extend(pred_labels.cpu().numpy())

# Calculate accuracy
accuracy = (y_pred == y_test.cpu().numpy()).mean()
print(f'Accuracy on Test Set: {accuracy}')

# Calculate macro average F1 score
f1 = f1_score(y_test.cpu().numpy(), y_pred, average='macro')
print(f'Macro Average F1 Score on Test Set: {f1}')

# Predict probabilities for submission data
submission_dataset = TensorDataset(X_submission['input_ids'], X_submission['attention_mask'])
submission_dataloader = DataLoader(submission_dataset, batch_size=16, shuffle=False)

model.eval()
y_submission_pred_probs = []
with torch.no_grad():
    for batch in submission_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        y_submission_pred_probs.extend(probs.cpu().numpy())

# Convert probabilities to class labels
y_submission_pred = np.argmax(y_submission_pred_probs, axis=1)
predicted_labels = label_encoder.inverse_transform(y_submission_pred)

# Check unique predicted labels
unique_labels = np.unique(predicted_labels)
print(f'Unique Predicted Labels: {unique_labels}')

# Create dataframe with sentence_id and predicted labels
df_submission_predicted = pd.DataFrame({'sentence_id': df_submission_final['sentence_id'], 'predicted_label': predicted_labels})

# Save dataframe to TSV file
df_submission_predicted.to_csv('submission_predictions_xlm_roberta.tsv', sep='\t', index=False)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy on Test Set: 0.7119341563786008
Macro Average F1 Score on Test Set: 0.711890243902439
Unique Predicted Labels: ['OBJ' 'SUBJ']


# bert-base-styleclassification-subjective-neutral

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score
import torch

# Load data
df_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/train_en.tsv", sep='\t')
df_test_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/dev_test_en.tsv", sep='\t')
df_submission_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/test_en.tsv", sep='\t')

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("cffl/bert-base-styleclassification-subjective-neutral")
model = AutoModelForSequenceClassification.from_pretrained("cffl/bert-base-styleclassification-subjective-neutral")

# Tokenize text data using the loaded tokenizer
X_train = tokenizer(df_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')
X_test = tokenizer(df_test_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')
X_submission = tokenizer(df_submission_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')

# Encode labels
label_encoder = LabelEncoder()
y_train = torch.tensor(label_encoder.fit_transform(df_final['label']), dtype=torch.long)
y_test = torch.tensor(label_encoder.transform(df_test_final['label']), dtype=torch.long)

# Create DataLoader for training and testing
train_dataset = TensorDataset(X_train['input_ids'], X_train['attention_mask'], y_train)
test_dataset = TensorDataset(X_test['input_ids'], X_test['attention_mask'], y_test)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Define optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()
for epoch in range(3):  # Adjust the number of epochs as needed
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    scheduler.step()

# Evaluate the model on the test set
model.eval()
y_pred = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        outputs = model(**inputs)
        logits = outputs.logits
        pred_labels = torch.argmax(logits, dim=1)
        y_pred.extend(pred_labels.cpu().numpy())

# Calculate accuracy
accuracy = (y_pred == y_test.cpu().numpy()).mean()
print(f'Accuracy on Test Set: {accuracy}')

# Calculate macro average F1 score
f1 = f1_score(y_test.cpu().numpy(), y_pred, average='macro')
print(f'Macro Average F1 Score on Test Set: {f1}')

# Predict probabilities for submission data
submission_dataset = TensorDataset(X_submission['input_ids'], X_submission['attention_mask'])
submission_dataloader = DataLoader(submission_dataset, batch_size=16, shuffle=False)

model.eval()
y_submission_pred_probs = []
with torch.no_grad():
    for batch in submission_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        y_submission_pred_probs.extend(probs.cpu().numpy())

# Convert probabilities to class labels
y_submission_pred = np.argmax(y_submission_pred_probs, axis=1)
predicted_labels = label_encoder.inverse_transform(y_submission_pred)

# Check unique predicted labels
unique_labels = np.unique(predicted_labels)
print(f'Unique Predicted Labels: {unique_labels}')

# Create dataframe with sentence_id and predicted labels
df_submission_predicted = pd.DataFrame({'sentence_id': df_submission_final['sentence_id'], 'predicted_label': predicted_labels})

# Save dataframe to TSV file
df_submission_predicted.to_csv('submission_predictions_hugging_face_model.tsv', sep='\t', index=False)


tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/913 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Accuracy on Test Set: 0.7489711934156379
Macro Average F1 Score on Test Set: 0.7486988217343391
Unique Predicted Labels: ['OBJ' 'SUBJ']


# deBERTa

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from transformers import DebertaTokenizer, DebertaForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score
import torch

# Load data
df_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/train_en.tsv", sep='\t')
df_test_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/dev_test_en.tsv", sep='\t')
df_submission_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/test_en.tsv", sep='\t')

# Load tokenizer and model
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")
model = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base")

# Tokenize text data using the loaded tokenizer
X_train = tokenizer(df_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')
X_test = tokenizer(df_test_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')
X_submission = tokenizer(df_submission_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')

# Encode labels
label_encoder = LabelEncoder()
y_train = torch.tensor(label_encoder.fit_transform(df_final['label']), dtype=torch.long)
y_test = torch.tensor(label_encoder.transform(df_test_final['label']), dtype=torch.long)

# Create DataLoader for training and testing
train_dataset = TensorDataset(X_train['input_ids'], X_train['attention_mask'], y_train)
test_dataset = TensorDataset(X_test['input_ids'], X_test['attention_mask'], y_test)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Define optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()
for epoch in range(3):  # Adjust the number of epochs as needed
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    scheduler.step()

# Evaluate the model on the test set
model.eval()
y_pred = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        outputs = model(**inputs)
        logits = outputs.logits
        pred_labels = torch.argmax(logits, dim=1)
        y_pred.extend(pred_labels.cpu().numpy())

# Calculate accuracy
accuracy = (y_pred == y_test.cpu().numpy()).mean()
print(f'Accuracy on Test Set: {accuracy}')

# Calculate macro average F1 score
f1 = f1_score(y_test.cpu().numpy(), y_pred, average='macro')
print(f'Macro Average F1 Score on Test Set: {f1}')

# Predict probabilities for submission data
submission_dataset = TensorDataset(X_submission['input_ids'], X_submission['attention_mask'])
submission_dataloader = DataLoader(submission_dataset, batch_size=16, shuffle=False)

model.eval()
y_submission_pred_probs = []
with torch.no_grad():
    for batch in submission_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        y_submission_pred_probs.extend(probs.cpu().numpy())

# Convert probabilities to class labels
y_submission_pred = np.argmax(y_submission_pred_probs, axis=1)
predicted_labels = label_encoder.inverse_transform(y_submission_pred)

# Check unique predicted labels
unique_labels = np.unique(predicted_labels)
print(f'Unique Predicted Labels: {unique_labels}')

# Create dataframe with sentence_id and predicted labels
df_submission_predicted = pd.DataFrame({'sentence_id': df_submission_final['sentence_id'], 'predicted_label': predicted_labels})

# Save dataframe to TSV file
df_submission_predicted.to_csv('submission_predictions_deberta.tsv', sep='\t', index=False)


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Accuracy on Test Set: 0.720164609053498
Macro Average F1 Score on Test Set: 0.7119648584576768
Unique Predicted Labels: ['OBJ' 'SUBJ']


# deBERTa V3





In [None]:
!pip install transformers==4.30
!pip install accelerate>=0.20.1




In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score
import torch

# Load data
df_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/train_en.tsv", sep='\t')
df_test_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/dev_test_en.tsv", sep='\t')
df_submission_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/test_en.tsv", sep='\t')

# Tokenize text data using RoBERTa tokenizer
tokenizer = AutoTokenizer.from_pretrained("GroNLP/mdebertav3-subjectivity-english")
X_train = tokenizer(df_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')
X_test = tokenizer(df_test_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')
X_submission = tokenizer(df_submission_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')

# Encode labels
label_encoder = LabelEncoder()
y_train = torch.tensor(label_encoder.fit_transform(df_final['label']), dtype=torch.long)
y_test = torch.tensor(label_encoder.transform(df_test_final['label']), dtype=torch.long)

# Create DataLoader for training and testing
train_dataset = TensorDataset(X_train['input_ids'], X_train['attention_mask'], y_train)
test_dataset = TensorDataset(X_test['input_ids'], X_test['attention_mask'], y_test)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Load model directly
model = AutoModelForSequenceClassification.from_pretrained("GroNLP/mdebertav3-subjectivity-english", num_labels=len(label_encoder.classes_))

# Define optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=6e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()
for epoch in range(3):  # Adjust the number of epochs as needed
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    scheduler.step()

# Evaluate the model on the test set
model.eval()
y_pred = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        outputs = model(**inputs)
        logits = outputs.logits
        pred_labels = torch.argmax(logits, dim=1)
        y_pred.extend(pred_labels.cpu().numpy())

# Calculate accuracy
accuracy = (y_pred == y_test.cpu().numpy()).mean()
print(f'Accuracy on Test Set: {accuracy}')

# Calculate macro average F1 score
f1 = f1_score(y_test.cpu().numpy(), y_pred, average='macro')
print(f'Macro Average F1 Score on Test Set: {f1}')

# Predict probabilities for submission data
submission_dataset = TensorDataset(X_submission['input_ids'], X_submission['attention_mask'])
submission_dataloader = DataLoader(submission_dataset, batch_size=16, shuffle=False)

model.eval()
y_submission_pred_probs = []
with torch.no_grad():
    for batch in submission_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        y_submission_pred_probs.extend(probs.cpu().numpy())

# Convert probabilities to class labels
y_submission_pred = np.argmax(y_submission_pred_probs, axis=1)
predicted_labels = label_encoder.inverse_transform(y_submission_pred)

# Check unique predicted labels
unique_labels = np.unique(predicted_labels)
print(f'Unique Predicted Labels: {unique_labels}')

# Create dataframe with sentence_id and predicted labels
df_submission_predicted = pd.DataFrame({'sentence_id': df_submission_final['sentence_id'], 'predicted_label': predicted_labels})

# Save dataframe to TSV file
df_submission_predicted.to_csv('submission_predictions_mdebertav3.tsv', sep='\t', index=False)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Accuracy on Test Set: 0.7942386831275721
Macro Average F1 Score on Test Set: 0.7912658053875756
Unique Predicted Labels: ['OBJ' 'SUBJ']


# XLNet

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from transformers import XLNetTokenizer, XLNetForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score
import torch

# Load data
df_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/train_en.tsv", sep='\t')
df_test_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/dev_test_en.tsv", sep='\t')
df_submission_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/test_en.tsv", sep='\t')

# Tokenize text data using XLNet tokenizer
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
X_train = tokenizer(df_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')
X_test = tokenizer(df_test_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')
X_submission = tokenizer(df_submission_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')

# Encode labels
label_encoder = LabelEncoder()
y_train = torch.tensor(label_encoder.fit_transform(df_final['label']), dtype=torch.long)
y_test = torch.tensor(label_encoder.transform(df_test_final['label']), dtype=torch.long)

# Create DataLoader for training and testing
train_dataset = TensorDataset(X_train['input_ids'], X_train['attention_mask'], y_train)
test_dataset = TensorDataset(X_test['input_ids'], X_test['attention_mask'], y_test)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Load pre-trained XLNet model for sequence classification
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=len(label_encoder.classes_))

# Define optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()
for epoch in range(10):  # Adjust the number of epochs as needed
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    scheduler.step()

# Evaluate the model on the test set
model.eval()
y_pred = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        outputs = model(**inputs)
        logits = outputs.logits
        pred_labels = torch.argmax(logits, dim=1)
        y_pred.extend(pred_labels.cpu().numpy())

# Calculate accuracy
accuracy = (y_pred == y_test.cpu().numpy()).mean()
print(f'Accuracy on Test Set: {accuracy}')

# Calculate macro average F1 score
f1 = f1_score(y_test.cpu().numpy(), y_pred, average='macro')
print(f'Macro Average F1 Score on Test Set: {f1}')

# Predict probabilities for submission data
submission_dataset = TensorDataset(X_submission['input_ids'], X_submission['attention_mask'])
submission_dataloader = DataLoader(submission_dataset, batch_size=16, shuffle=False)

model.eval()
y_submission_pred_probs = []
with torch.no_grad():
    for batch in submission_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        y_submission_pred_probs.extend(probs.cpu().numpy())

# Convert probabilities to class labels
y_submission_pred = np.argmax(y_submission_pred_probs, axis=1)
predicted_labels = label_encoder.inverse_transform(y_submission_pred)

# Check unique predicted labels
unique_labels = np.unique(predicted_labels)
print(f'Unique Predicted Labels: {unique_labels}')

# Create dataframe with sentence_id and predicted labels
df_submission_predicted = pd.DataFrame({'sentence_id': df_submission_final['sentence_id'], 'predicted_label': predicted_labels})

# Save dataframe to TSV file
df_submission_predicted.to_csv('submission_predictions_xlnet.tsv', sep='\t', index=False)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy on Test Set: 0.7860082304526749
Macro Average F1 Score on Test Set: 0.7857142857142857
Unique Predicted Labels: ['OBJ' 'SUBJ']


# DistilBERT

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score
import torch

# Load data
df_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/train_en.tsv", sep='\t')
df_test_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/dev_test_en.tsv", sep='\t')
df_submission_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/test_en.tsv", sep='\t')

# Tokenize text data using DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
X_train = tokenizer(df_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')
X_test = tokenizer(df_test_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')
X_submission = tokenizer(df_submission_final['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')

# Encode labels
label_encoder = LabelEncoder()
y_train = torch.tensor(label_encoder.fit_transform(df_final['label']), dtype=torch.long)
y_test = torch.tensor(label_encoder.transform(df_test_final['label']), dtype=torch.long)

# Create DataLoader for training and testing
train_dataset = TensorDataset(X_train['input_ids'], X_train['attention_mask'], y_train)
test_dataset = TensorDataset(X_test['input_ids'], X_test['attention_mask'], y_test)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Load pre-trained DistilBERT model for sequence classification
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label_encoder.classes_))

# Define optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()
for epoch in range(3):  # Adjust the number of epochs as needed
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    scheduler.step()

# Evaluate the model on the test set
model.eval()
y_pred = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        outputs = model(**inputs)
        logits = outputs.logits
        pred_labels = torch.argmax(logits, dim=1)
        y_pred.extend(pred_labels.cpu().numpy())

# Calculate accuracy
accuracy = (y_pred == y_test.cpu().numpy()).mean()
print(f'Accuracy on Test Set: {accuracy}')

# Calculate macro average F1 score
f1 = f1_score(y_test.cpu().numpy(), y_pred, average='macro')
print(f'Macro Average F1 Score on Test Set: {f1}')

# Predict probabilities for submission data
submission_dataset = TensorDataset(X_submission['input_ids'], X_submission['attention_mask'])
submission_dataloader = DataLoader(submission_dataset, batch_size=16, shuffle=False)

model.eval()
y_submission_pred_probs = []
with torch.no_grad():
    for batch in submission_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        y_submission_pred_probs.extend(probs.cpu().numpy())

# Convert probabilities to class labels
y_submission_pred = np.argmax(y_submission_pred_probs, axis=1)
predicted_labels = label_encoder.inverse_transform(y_submission_pred)

# Check unique predicted labels
unique_labels = np.unique(predicted_labels)
print(f'Unique Predicted Labels: {unique_labels}')

# Create dataframe with sentence_id and predicted labels
df_submission_predicted = pd.DataFrame({'sentence_id': df_submission_final['sentence_id'], 'predicted_label': predicted_labels})

# Save dataframe to TSV file
df_submission_predicted.to_csv('submission_predictions_distilbert.tsv', sep='\t', index=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy on Test Set: 0.7160493827160493
Macro Average F1 Score on Test Set: 0.7064241065956963
Unique Predicted Labels: ['OBJ' 'SUBJ']
