In [None]:
!git clone https://github.com/pooja-premnath/CheckThat-Task2-Subjectivity

Cloning into 'CheckThat-Task2-Subjectivity'...
remote: Enumerating objects: 19, done.[K
remote: Counting objects: 100% (19/19), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 19 (delta 7), reused 18 (delta 6), pack-reused 0[K
Receiving objects: 100% (19/19), 173.30 KiB | 5.78 MiB/s, done.
Resolving deltas: 100% (7/7), done.


# Basic LSTM and BiLSTM




In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import f1_score

# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
# are repeatable across "Restart Kernel -> Run All".
from tensorflow.keras.utils import set_random_seed

SEED = 42
set_random_seed(SEED)

# Load data: training split, labelled dev-test split, and the split that is only
# used to produce submission predictions.
df_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/train_en.tsv", sep='\t')
df_test_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/dev_test_en.tsv", sep='\t')
df_submission_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/test_en.tsv", sep='\t')

# Tokenize text data (the vocabulary is fitted on the training sentences only)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_final['sentence'])
X_train = tokenizer.texts_to_sequences(df_final['sentence'])
X_test = tokenizer.texts_to_sequences(df_test_final['sentence'])
X_submission = tokenizer.texts_to_sequences(df_submission_final['sentence'])

# Pad (or truncate) every sequence to a fixed length; index 0 is the pad value.
max_length = 100  # Choose an appropriate max length
X_train = pad_sequences(X_train, maxlen=max_length, padding='post')
X_test = pad_sequences(X_test, maxlen=max_length, padding='post')
X_submission = pad_sequences(X_submission, maxlen=max_length, padding='post')

# Encode labels as integers; the encoder is fitted on the training labels and
# reused for the dev-test labels and for decoding predictions.
label_encoder = LabelEncoder()
df_final['Encoded_Label'] = label_encoder.fit_transform(df_final['label'])
y_train = df_final['Encoded_Label'].to_numpy()
y_test = label_encoder.transform(df_test_final['label'])

# Hold out a stratified 10% of the training data for early stopping.
# `validation_split` would take the last 10% of rows as they appear in the file,
# without shuffling or preserving the class ratio.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, stratify=y_train, random_state=SEED
)

# Define RNN model
# mask_zero=True makes the LSTM skip the post-padding steps; without it the final
# hidden state is computed after a long run of pad tokens for every short sentence.
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100, input_length=max_length, mask_zero=True),
    LSTM(units=128),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define early stopping: stop after 3 epochs without val_loss improvement and
# roll back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model with early stopping on the held-out validation fold
history = model.fit(
    X_fit, y_fit,
    epochs=20,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
)

# Evaluate on test data
_, accuracy = model.evaluate(X_test, y_test)
print(f"Accuracy on test data: {accuracy}")

# Predict probabilities on submission data
y_submission_prob = model.predict(X_submission)

# Convert probabilities to class labels based on threshold (0.5), then map the
# integer ids back to the original label strings.
y_submission_pred = (y_submission_prob > 0.5).astype(int)
y_submission_pred = label_encoder.inverse_transform(y_submission_pred.flatten())

# Attach the decoded labels to the submission frame
df_submission_final['Predicted_Label'] = y_submission_pred

# Save submission DataFrame to TSV with sentence_id and original labels
df_submission_final[['sentence_id', 'Predicted_Label']].to_csv("submission_predictions.tsv", sep='\t', index=False)

# Calculate macro average F1 score on test set
y_test_prob = model.predict(X_test)
y_test_pred = (y_test_prob > 0.5).astype(int).flatten()
macro_f1 = f1_score(y_test, y_test_pred, average='macro')
print(f"Macro Average F1 Score on test set: {macro_f1}")


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Accuracy on test data: 0.49054276943206787
Macro Average F1 Score on test set: 0.3231197771587744


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import f1_score

# Local imports so this cell does not depend on names leaked from other cells.
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import set_random_seed

# Seed Python, NumPy and TensorFlow so runs are repeatable.
SEED = 42
set_random_seed(SEED)

# Load data: training split, labelled dev-test split, and the split that is only
# used to produce submission predictions.
df_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/train_en.tsv", sep='\t')
df_test_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/dev_test_en.tsv", sep='\t')
df_submission_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/test_en.tsv", sep='\t')

# Tokenize text data (the vocabulary is fitted on the training sentences only)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_final['sentence'])

# Encode text data to sequences
X_train = tokenizer.texts_to_sequences(df_final['sentence'])
X_test = tokenizer.texts_to_sequences(df_test_final['sentence'])
X_submission = tokenizer.texts_to_sequences(df_submission_final['sentence'])

# Pad sequences to the longest sequence found in any of the three splits, so
# nothing is truncated; index 0 is the pad value.
max_sequence_length = max(len(seq) for seq in X_train + X_test + X_submission)
X_train = pad_sequences(X_train, maxlen=max_sequence_length, padding='post')
X_test = pad_sequences(X_test, maxlen=max_sequence_length, padding='post')
X_submission = pad_sequences(X_submission, maxlen=max_sequence_length, padding='post')

# Encode labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_final['label'])
y_test = label_encoder.transform(df_test_final['label'])

# Hold out a stratified 10% of the training data for early stopping. The dev-test
# split is kept out of training entirely, so the scores printed below are not
# measured on the data that selected the restored checkpoint.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, stratify=y_train, random_state=SEED
)

# Build the stacked bidirectional LSTM model.
# mask_zero=True lets the recurrent layers skip the post-padding steps.
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100,
                    input_length=max_sequence_length, mask_zero=True))
model.add(Bidirectional(LSTM(units=512, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(units=256, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(units=128)))
model.add(Dropout(0.5))
model.add(Dense(units=64, activation='relu'))
model.add(Dropout(0.5))
# One softmax unit per label class; paired with sparse_categorical_crossentropy
# because the targets are integer class ids.
model.add(Dense(units=len(label_encoder.classes_), activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Define early stopping: stop after 3 epochs without val_loss improvement and
# roll back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model, validating on the held-out training fold
model.fit(
    X_fit, y_fit,
    epochs=30,
    batch_size=128,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
)

# Predict probabilities for test set
y_pred_probs = model.predict(X_test)

# Convert probabilities to class labels
y_pred = np.argmax(y_pred_probs, axis=1)

# Calculate accuracy
accuracy = (y_pred == y_test).mean()
print(f'Accuracy on Test Set: {accuracy}')

# Calculate macro average F1 score
f1 = f1_score(y_test, y_pred, average='macro')
print(f'Macro Average F1 Score on Test Set: {f1}')

# Predict probabilities for submission data
y_submission_pred_probs = model.predict(X_submission)

# Convert probabilities to class ids, then back to the original label strings
y_submission_pred = np.argmax(y_submission_pred_probs, axis=1)
predicted_labels = label_encoder.inverse_transform(y_submission_pred)

# Check unique predicted labels (a single value means the model collapsed to one class)
unique_labels = np.unique(predicted_labels)
print(f'Unique Predicted Labels: {unique_labels}')

# Create dataframe with sentence_id and predicted labels
df_submission_predicted = pd.DataFrame({'sentence_id': df_submission_final['sentence_id'], 'predicted_label': predicted_labels})

# Save dataframe to TSV file
df_submission_predicted.to_csv('submission_predictions_complex_lstm_updated.tsv', sep='\t', index=False)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Accuracy on Test Set: 0.4773662551440329
Macro Average F1 Score on Test Set: 0.3231197771587744
Unique Predicted Labels: ['OBJ' 'SUBJ']


# BiLSTM with Attention

In [None]:
!pip install tensorflow_addons

Collecting tensorflow_addons
  Downloading tensorflow_addons-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (611 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/611.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m307.2/611.8 kB[0m [31m8.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.8/611.8 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.7 (from tensorflow_addons)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow_addons
Successfully installed tensorflow_addons-0.23.0 typeguard-2.13.3


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import f1_score

# Define Attention layer
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Layer

class AttentionLayer(Layer):
    """Attention pooling that collapses the step axis (axis 1) of its input.

    Each step is scored with ``tanh(x . W + b)``, the scores are turned into
    weights with a softmax, and the layer returns the weighted sum of the steps.
    It is used here directly after a recurrent layer with
    ``return_sequences=True``, i.e. on a (batch, steps, features) tensor.

    The bias holds one entry per step, so the number of steps must be known
    when the layer is built.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the scoring weights from the step and feature dimensions."""
        step_count = input_shape[1]
        feature_count = input_shape[-1]
        # W projects every feature vector to a single score; b adds a per-step offset.
        self.W = self.add_weight(name="att_weight", shape=(feature_count, 1), initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(step_count, 1), initializer="zeros")
        super().build(input_shape)

    def call(self, x):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        projected = K.dot(x, self.W) + self.b
        # Drop the trailing singleton axis so the softmax runs across the steps.
        scores = K.squeeze(K.tanh(projected), axis=-1)
        weights = K.expand_dims(K.softmax(scores), axis=-1)
        return K.sum(x * weights, axis=1)

# Local imports so this cell does not depend on names leaked from other cells.
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import set_random_seed

# Seed Python, NumPy and TensorFlow so runs are repeatable.
SEED = 42
set_random_seed(SEED)

# Load data: training split, labelled dev-test split, and the split that is only
# used to produce submission predictions.
df_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/train_en.tsv", sep='\t')
df_test_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/dev_test_en.tsv", sep='\t')
df_submission_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/test_en.tsv", sep='\t')

# Tokenize text data (the vocabulary is fitted on the training sentences only)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_final['sentence'])

# Encode text data to sequences
X_train = tokenizer.texts_to_sequences(df_final['sentence'])
X_test = tokenizer.texts_to_sequences(df_test_final['sentence'])
X_submission = tokenizer.texts_to_sequences(df_submission_final['sentence'])

# Pad sequences to the longest sequence found in any of the three splits, so
# nothing is truncated.
max_sequence_length = max(len(seq) for seq in X_train + X_test + X_submission)
X_train = pad_sequences(X_train, maxlen=max_sequence_length, padding='post')
X_test = pad_sequences(X_test, maxlen=max_sequence_length, padding='post')
X_submission = pad_sequences(X_submission, maxlen=max_sequence_length, padding='post')

# Encode labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_final['label'])
y_test = label_encoder.transform(df_test_final['label'])

# Hold out a stratified 10% of the training data for early stopping. The dev-test
# split is kept out of training entirely, so the scores printed below are not
# measured on the data that selected the restored checkpoint.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, stratify=y_train, random_state=SEED
)

# Build LSTM model with Attention mechanism: the BiLSTM returns one vector per
# step and AttentionLayer pools them into a single vector for the classifier head.
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_sequence_length))
model.add(Bidirectional(LSTM(units=512, return_sequences=True)))
model.add(Dropout(0.5))
model.add(AttentionLayer())
model.add(Dense(units=64, activation='relu'))
model.add(Dropout(0.5))
# One softmax unit per label class; paired with sparse_categorical_crossentropy
# because the targets are integer class ids.
model.add(Dense(units=len(label_encoder.classes_), activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Define early stopping: stop after 3 epochs without val_loss improvement and
# roll back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model, validating on the held-out training fold
model.fit(
    X_fit, y_fit,
    epochs=30,
    batch_size=128,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
)

# Predict probabilities for test set
y_pred_probs = model.predict(X_test)

# Convert probabilities to class labels
y_pred = np.argmax(y_pred_probs, axis=1)

# Calculate accuracy
accuracy = (y_pred == y_test).mean()
print(f'Accuracy on Test Set: {accuracy}')

# Calculate macro average F1 score
f1 = f1_score(y_test, y_pred, average='macro')
print(f'Macro Average F1 Score on Test Set: {f1}')

# Predict probabilities for submission data
y_submission_pred_probs = model.predict(X_submission)

# Convert probabilities to class ids, then back to the original label strings
y_submission_pred = np.argmax(y_submission_pred_probs, axis=1)
predicted_labels = label_encoder.inverse_transform(y_submission_pred)

# Check unique predicted labels (a single value means the model collapsed to one class)
unique_labels = np.unique(predicted_labels)
print(f'Unique Predicted Labels: {unique_labels}')

# Create dataframe with sentence_id and predicted labels
df_submission_predicted = pd.DataFrame({'sentence_id': df_submission_final['sentence_id'], 'predicted_label': predicted_labels})

# Save dataframe to TSV file
df_submission_predicted.to_csv('submission_predictions_lstm_attention_custom.tsv', sep='\t', index=False)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Accuracy on Test Set: 0.4773662551440329
Macro Average F1 Score on Test Set: 0.3231197771587744
Unique Predicted Labels: ['OBJ']


In [None]:
# Show how many training sentences carry each label (class balance check)
label_column = df_final['label']
class_counts = label_column.value_counts()
print(class_counts)


label
OBJ     532
SUBJ    298
Name: count, dtype: int64


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight

# Define Attention layer
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Layer

class AttentionLayer(Layer):
    """Attention pooling that collapses the step axis (axis 1) of its input.

    Each step is scored with ``tanh(x . W + b)``, the scores are turned into
    weights with a softmax, and the layer returns the weighted sum of the steps.
    It is used here directly after a recurrent layer with
    ``return_sequences=True``, i.e. on a (batch, steps, features) tensor.

    The bias holds one entry per step, so the number of steps must be known
    when the layer is built.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the scoring weights from the step and feature dimensions."""
        step_count = input_shape[1]
        feature_count = input_shape[-1]
        # W projects every feature vector to a single score; b adds a per-step offset.
        self.W = self.add_weight(name="att_weight", shape=(feature_count, 1), initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(step_count, 1), initializer="zeros")
        super().build(input_shape)

    def call(self, x):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        projected = K.dot(x, self.W) + self.b
        # Drop the trailing singleton axis so the softmax runs across the steps.
        scores = K.squeeze(K.tanh(projected), axis=-1)
        weights = K.expand_dims(K.softmax(scores), axis=-1)
        return K.sum(x * weights, axis=1)

# Local imports so this cell does not depend on names leaked from other cells.
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import set_random_seed

# Seed Python, NumPy and TensorFlow so runs are repeatable.
SEED = 42
set_random_seed(SEED)

# Load data: training split, labelled dev-test split, and the split that is only
# used to produce submission predictions.
df_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/train_en.tsv", sep='\t')
df_test_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/dev_test_en.tsv", sep='\t')
df_submission_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/test_en.tsv", sep='\t')

# Tokenize text data (the vocabulary is fitted on the training sentences only)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_final['sentence'])

# Encode text data to sequences
X_train = tokenizer.texts_to_sequences(df_final['sentence'])
X_test = tokenizer.texts_to_sequences(df_test_final['sentence'])
X_submission = tokenizer.texts_to_sequences(df_submission_final['sentence'])

# Pad sequences to the longest sequence found in any of the three splits, so
# nothing is truncated.
max_sequence_length = max(len(seq) for seq in X_train + X_test + X_submission)
X_train = pad_sequences(X_train, maxlen=max_sequence_length, padding='post')
X_test = pad_sequences(X_test, maxlen=max_sequence_length, padding='post')
X_submission = pad_sequences(X_submission, maxlen=max_sequence_length, padding='post')

# Encode labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_final['label'])
y_test = label_encoder.transform(df_test_final['label'])

# Hold out a stratified 10% of the training data for early stopping. The dev-test
# split is kept out of training entirely, so the scores printed below are not
# measured on the data that selected the restored checkpoint.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, stratify=y_train, random_state=SEED
)

# Balanced class weights, computed on the fold the model is actually fitted on and
# keyed by the encoded integer ids that Keras sees in y, rather than by the
# position of the string labels.
encoded_classes = np.unique(y_fit)
class_weights = compute_class_weight('balanced', classes=encoded_classes, y=y_fit)
class_weights_dict = {
    int(class_id): float(weight)
    for class_id, weight in zip(encoded_classes, class_weights)
}

# Build more complex LSTM model with Attention mechanism: two stacked BiLSTMs
# return one vector per step and AttentionLayer pools them for the classifier head.
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_sequence_length))
model.add(Bidirectional(LSTM(units=512, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(units=256, return_sequences=True)))
model.add(Dropout(0.5))
model.add(AttentionLayer())
model.add(Dense(units=128, activation='relu'))
model.add(Dropout(0.5))
# One softmax unit per label class; paired with sparse_categorical_crossentropy
# because the targets are integer class ids.
model.add(Dense(units=len(label_encoder.classes_), activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Define early stopping: stop after 3 epochs without val_loss improvement and
# roll back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model with class weights, validating on the held-out training fold
model.fit(
    X_fit, y_fit,
    epochs=30,
    batch_size=128,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
    class_weight=class_weights_dict,
)

# Predict probabilities for test set
y_pred_probs = model.predict(X_test)

# Convert probabilities to class labels
y_pred = np.argmax(y_pred_probs, axis=1)

# Calculate accuracy
accuracy = (y_pred == y_test).mean()
print(f'Accuracy on Test Set: {accuracy}')

# Calculate macro average F1 score
f1 = f1_score(y_test, y_pred, average='macro')
print(f'Macro Average F1 Score on Test Set: {f1}')

# Predict probabilities for submission data
y_submission_pred_probs = model.predict(X_submission)

# Convert probabilities to class ids, then back to the original label strings
y_submission_pred = np.argmax(y_submission_pred_probs, axis=1)
predicted_labels = label_encoder.inverse_transform(y_submission_pred)

# Check unique predicted labels (a single value means the model collapsed to one class)
unique_labels = np.unique(predicted_labels)
print(f'Unique Predicted Labels: {unique_labels}')

# Create dataframe with sentence_id and predicted labels
df_submission_predicted = pd.DataFrame({'sentence_id': df_submission_final['sentence_id'], 'predicted_label': predicted_labels})

# Save dataframe to TSV file
df_submission_predicted.to_csv('submission_predictions_lstm_attention_updated.tsv', sep='\t', index=False)


Epoch 1/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Accuracy on Test Set: 0.5020576131687243
Macro Average F1 Score on Test Set: 0.34146341463414637
Unique Predicted Labels: ['OBJ' 'SUBJ']


# Stacked Bidirectional GRU


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import f1_score

# Local imports so this cell does not depend on names leaked from other cells.
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import set_random_seed

# Seed Python, NumPy and TensorFlow so runs are repeatable.
SEED = 42
set_random_seed(SEED)

# Load data: training split, labelled dev-test split, and the split that is only
# used to produce submission predictions.
df_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/train_en.tsv", sep='\t')
df_test_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/dev_test_en.tsv", sep='\t')
df_submission_final = pd.read_csv("/content/CheckThat-Task2-Subjectivity/test_en.tsv", sep='\t')

# Tokenize text data (the vocabulary is fitted on the training sentences only)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_final['sentence'])

# Encode text data to sequences
X_train = tokenizer.texts_to_sequences(df_final['sentence'])
X_test = tokenizer.texts_to_sequences(df_test_final['sentence'])
X_submission = tokenizer.texts_to_sequences(df_submission_final['sentence'])

# Pad sequences to the longest sequence found in any of the three splits, so
# nothing is truncated; index 0 is the pad value.
max_sequence_length = max(len(seq) for seq in X_train + X_test + X_submission)
X_train = pad_sequences(X_train, maxlen=max_sequence_length, padding='post')
X_test = pad_sequences(X_test, maxlen=max_sequence_length, padding='post')
X_submission = pad_sequences(X_submission, maxlen=max_sequence_length, padding='post')

# Encode labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_final['label'])
y_test = label_encoder.transform(df_test_final['label'])

# Hold out a stratified 10% of the training data for early stopping. The dev-test
# split is kept out of training entirely, so the scores printed below are not
# measured on the data that selected the restored checkpoint.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, stratify=y_train, random_state=SEED
)

# Build a more complex GRU model: two bidirectional GRUs followed by a
# unidirectional GRU that reduces the sequence to a single vector.
# mask_zero=True lets the recurrent layers skip the post-padding steps.
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100,
                    input_length=max_sequence_length, mask_zero=True))
model.add(Bidirectional(GRU(units=512, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(GRU(units=256, return_sequences=True)))
model.add(Dropout(0.5))
model.add(GRU(units=128))
model.add(Dropout(0.5))
model.add(Dense(units=64, activation='relu'))
model.add(Dropout(0.5))
# One softmax unit per label class; paired with sparse_categorical_crossentropy
# because the targets are integer class ids.
model.add(Dense(units=len(label_encoder.classes_), activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Define early stopping: stop after 3 epochs without val_loss improvement and
# roll back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model, validating on the held-out training fold
model.fit(
    X_fit, y_fit,
    epochs=30,
    batch_size=128,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
)

# Predict probabilities for test set
y_pred_probs = model.predict(X_test)

# Convert probabilities to class labels
y_pred = np.argmax(y_pred_probs, axis=1)

# Calculate accuracy
accuracy = (y_pred == y_test).mean()
print(f'Accuracy on Test Set: {accuracy}')

# Calculate macro average F1 score
f1 = f1_score(y_test, y_pred, average='macro')
print(f'Macro Average F1 Score on Test Set: {f1}')

# Predict probabilities for submission data
y_submission_pred_probs = model.predict(X_submission)

# Convert probabilities to class ids, then back to the original label strings
y_submission_pred = np.argmax(y_submission_pred_probs, axis=1)
predicted_labels = label_encoder.inverse_transform(y_submission_pred)

# Check unique predicted labels (a single value means the model collapsed to one class)
unique_labels = np.unique(predicted_labels)
print(f'Unique Predicted Labels: {unique_labels}')

# Create dataframe with sentence_id and predicted labels
df_submission_predicted = pd.DataFrame({'sentence_id': df_submission_final['sentence_id'], 'predicted_label': predicted_labels})

# Save dataframe to TSV file
df_submission_predicted.to_csv('submission_predictions_gru_updated.tsv', sep='\t', index=False)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Accuracy on Test Set: 0.4773662551440329
Macro Average F1 Score on Test Set: 0.3231197771587744
Unique Predicted Labels: ['OBJ']
