In [None]:

# Install Keras Tuner into the kernel's own environment (%pip rather than !pip),
# pinned to the version this notebook was last run with so re-runs are reproducible.
%pip install -q keras-tuner==1.4.7

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import re
import keras_tuner as kt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score

In [None]:
# Read the train / dev / unlabeled-test CSV splits into DataFrames
train_data, dev_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/sarcasm_tam_train.csv',
        '/content/sarcasm_tam_dev.csv',
        '/content/sarcasm_tam_test_without_labels.csv',
    )
)



In [None]:
# Clean and tokenize text data
def clean_text(text):
    """Normalise one raw text value before tokenization.

    Strips URLs, replaces every run of characters that are not ASCII letters
    or whitespace with a single space, and lower-cases the result.

    Args:
        text: The raw text. Any non-string value (for example the NaN that
            pandas typically yields for an empty CSV cell) is treated as
            empty text instead of raising a TypeError inside ``re.sub``.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    # NOTE(review): only ASCII letters survive this step, so any text written
    # in a non-Latin script is replaced by spaces -- confirm that is intended
    # for this dataset.
    text = re.sub(r'[^A-Za-z\s]+', ' ', text)  # Remove special characters and digits
    return text.lower()  # Convert to lower case

# Add a cleaned copy of the raw text to every split
for split in (train_data, dev_data, test_data):
    split['cleaned_text'] = split['Text'].apply(clean_text)

# Encode labels
label_mapping = {'Non-sarcastic': 0, 'Sarcastic': 1}


def encode_labels(labels):
    """Map label names to their integer ids, safely and idempotently.

    Values that are already integer ids are passed through unchanged, so
    re-running this cell does not turn every label into NaN. Anything that is
    neither a known label name nor a known id raises instead of silently
    becoming NaN.

    Args:
        labels: A pandas Series of label names (or already-encoded ids).

    Returns:
        An integer Series containing only the ids in ``label_mapping``.

    Raises:
        ValueError: If the Series contains a value outside ``label_mapping``.
    """
    encoded = labels.map(lambda value: label_mapping.get(value, value))
    unexpected = set(encoded.unique()) - set(label_mapping.values())
    if unexpected:
        raise ValueError(f"Unexpected label values: {sorted(map(str, unexpected))}")
    return encoded.astype(int)


train_data['labels'] = encode_labels(train_data['labels'])
dev_data['labels'] = encode_labels(dev_data['labels'])

# Tokenize and pad sequences; the vocabulary is fitted on the training text only
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_data['cleaned_text'])

X_train = tokenizer.texts_to_sequences(train_data['cleaned_text'])
X_dev = tokenizer.texts_to_sequences(dev_data['cleaned_text'])
X_test = tokenizer.texts_to_sequences(test_data['cleaned_text'])

max_sequence_length = 100
X_train_padded = pad_sequences(X_train, maxlen=max_sequence_length)
X_dev_padded = pad_sequences(X_dev, maxlen=max_sequence_length)
X_test_padded = pad_sequences(X_test, maxlen=max_sequence_length)

# One-hot encode the integer labels to match the 2-unit softmax output
y_train = to_categorical(train_data['labels'])
y_dev = to_categorical(dev_data['labels'])

# Define the hypermodel function for Keras Tuner
def build_model(hp):
    """Build and compile the Embedding -> stacked LSTM -> softmax model for one trial.

    Tuned hyperparameters: embedding size, number of LSTM layers (1-3), the
    units and dropout rate of each LSTM layer, and the optimizer.

    Args:
        hp: The Keras Tuner hyperparameter container for the current trial.

    Returns:
        A compiled ``Sequential`` model with a 2-unit softmax output, trained
        with categorical cross-entropy and reporting accuracy.
    """
    max_layers = 3

    model = Sequential()
    # The embedding vocabulary is tied to the tokenizer so the two cannot drift apart.
    model.add(Embedding(input_dim=tokenizer.num_words,
                        output_dim=hp.Int('embedding_dim', min_value=32, max_value=256, step=32),
                        input_length=max_sequence_length))

    # Ask for the layer count once and reuse it below.
    num_layers = hp.Int('num_layers', 1, max_layers)
    for i in range(num_layers):
        # Layer i only exists when num_layers > i; registering its hyperparameters
        # conditionally keeps the tuner from searching over values that have no
        # effect on the model being built.
        with hp.conditional_scope('num_layers', list(range(i + 1, max_layers + 1))):
            units = hp.Int(f'lstm_units_{i}', min_value=32, max_value=256, step=32)
            dropout_rate = hp.Float(f'dropout_{i}', min_value=0.1, max_value=0.5, step=0.1)
        # Every LSTM except the last must emit the full sequence for the next LSTM.
        model.add(LSTM(units=units, return_sequences=(i < num_layers - 1)))
        model.add(Dropout(rate=dropout_rate))

    model.add(Dense(2, activation='softmax'))

    model.compile(optimizer=hp.Choice('optimizer', ['adam', 'rmsprop']),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

# Initialize and run the Hyperparameter Tuner.
# A fixed seed makes the sequence of sampled hyperparameter configurations
# repeatable between runs (weight initialisation is seeded separately).
tuner = kt.Hyperband(
    build_model,
    objective='val_accuracy',
    max_epochs=10,
    factor=3,
    seed=42,
    directory='my_dir',
    project_name='intro_to_kt',
)

# The dev split is used as validation data for the 'val_accuracy' objective.
tuner.search(
    X_train_padded,
    y_train,
    epochs=10,
    validation_data=(X_dev_padded, y_dev),
)



Trial 30 Complete [00h 03m 01s]
val_accuracy: 0.7929292917251587

Best val_accuracy So Far: 0.8014520406723022
Total elapsed time: 00h 44m 41s


In [None]:
# Get the best model and hyperparameters
best_model = tuner.get_best_models(num_models=1)[0]
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print("Best hyperparameters:")
print(best_hps.values)

# Evaluate the best model on the development set
y_dev_pred = best_model.predict(X_dev_padded)
y_dev_pred_classes = np.argmax(y_dev_pred, axis=1)  # class with the highest softmax score
y_dev_true = np.argmax(y_dev, axis=1)  # undo the one-hot encoding

print("Validation Accuracy:", accuracy_score(y_dev_true, y_dev_pred_classes))
print("Classification Report:\n", classification_report(y_dev_true, y_dev_pred_classes))

# Compute Macro F1 Score for development set
macro_f1_dev = f1_score(y_dev_true, y_dev_pred_classes, average='macro')
print("Development Set Macro F1 Score:", macro_f1_dev)

# Predict on the test set
test_predictions = best_model.predict(X_test_padded)
test_predictions_classes = np.argmax(test_predictions, axis=1)

# Decode class ids back to label names using the inverse of the training mapping,
# so the export always agrees with how the labels were encoded.
id_to_label = {class_id: label for label, class_id in label_mapping.items()}

# Save the predictions to a CSV file
test_results = test_data[['ID']].copy()
test_results['Predicted_Label'] = test_predictions_classes
test_results['Predicted_Label'] = test_results['Predicted_Label'].map(id_to_label)

predictions_path = '/content/sarcasm_tam_test_predictions_lstm_hp_id.csv'
test_results.to_csv(predictions_path, index=False)
# Report the path that was actually written (the old message named a different file).
print(f"Test predictions saved to '{predictions_path}'")

Best hyperparameters:
{'embedding_dim': 224, 'num_layers': 1, 'lstm_units_0': 64, 'dropout_0': 0.1, 'optimizer': 'adam', 'lstm_units_1': 192, 'dropout_1': 0.2, 'lstm_units_2': 192, 'dropout_2': 0.4, 'tuner/epochs': 2, 'tuner/initial_epoch': 0, 'tuner/bracket': 2, 'tuner/round': 0}
Validation Accuracy: 0.8014520202020202
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.92      0.87      4630
           1       0.70      0.47      0.56      1706

    accuracy                           0.80      6336
   macro avg       0.76      0.70      0.72      6336
weighted avg       0.79      0.80      0.79      6336

Development Set Macro F1 Score: 0.7152583352088222
Test predictions saved to 'sarcasm_tam_test_predictions_lstm.csv'
