In [None]:
import pandas as pd

# Read the training CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='sent_train.csv')
print(df.head(n=5))

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by the cells below: the stop-word list, the
# tokenizer models behind word_tokenize, and the WordNet data for the
# lemmatizer. Recent NLTK releases look up 'punkt_tab' for word_tokenize while
# older ones use 'punkt', so both are requested; nltk.download returns False
# (it does not raise) for a resource the installed release does not know.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

# Compiled once at cell level so every row reuses the same pattern object.
_URL_PATTERN = re.compile(r'http[s]?://\S+')


def remove_urls(text):
    """Strip every http:// or https:// URL from ``text``.

    A URL is the scheme followed by everything up to the next whitespace
    character. Only the URL itself is deleted, so the whitespace that
    surrounded it is kept.

    Args:
        text: The string to clean. Non-string values (for example the NaN
            pandas uses for a missing cell, or None) are returned unchanged
            instead of raising TypeError inside the regex engine.

    Returns:
        The input with all matched URLs removed, or the original object when
        it is not a string.
    """
    if not isinstance(text, str):
        return text
    return _URL_PATTERN.sub('', text)

# Report missing values per column before any cleaning, so gaps in the data
# are visible before the text column is filled below.
print(df.isnull().sum())

# Replace missing text with an empty string so URL stripping (and the later
# tokenisation) always receives a str, then remove URLs from every row.
df['text'] = df['text'].fillna('').apply(remove_urls)

In [None]:
# Resources shared by preprocess_text, created once at cell level:
# a WordNet lemmatizer and the English stop-word list as a set.
lemmatizer = WordNetLemmatizer()
stop_words = {word for word in stopwords.words('english')}

def preprocess_text(text):
    """Tokenise ``text`` and lemmatise every token that is not a stop word.

    The stop-word test is made on the lower-cased token, while the lemmatizer
    receives the token in its original casing.

    Args:
        text: The string to tokenise with ``word_tokenize``.

    Returns:
        A list with one lemma per retained token, in input order.
    """
    kept = []
    for token in word_tokenize(text):
        if token.lower() in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return kept

# Store the lemmatised token list of each row in a new 'tokens' column, then
# preview the raw text next to its tokens for the first five rows.
df['tokens'] = df['text'].map(preprocess_text)
print(df.head()[['text', 'tokens']])

In [None]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

# Assuming 'label' is your target variable
Y = df['label']

# Re-join each token list into one space-separated string; the Keras
# tokenizer used later works on strings, not on token lists.
X = df['tokens'].apply(' '.join)

# Train-test split (70% training, 30% testing). The split is stratified on the
# label so both parts keep the class proportions of the full dataset, which
# matters because the classes are treated as imbalanced just below.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42, stratify=Y
)

# Handle class imbalance on the training part only. RandomOverSampler needs a
# 2-D feature array, hence the reshape to a single column of strings.
# X_resampled and Y_resampled belong together row-for-row and are longer than
# X_train / Y_train whenever any class had to be oversampled.
ros = RandomOverSampler(random_state=42)
X_resampled, Y_resampled = ros.fit_resample(X_train.values.reshape(-1, 1), Y_train)
assert len(X_resampled) == len(Y_resampled), "resampled features and labels differ in length"

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Vocabulary cap passed to the tokenizer, and the fixed length every sequence
# is padded or truncated to.
MAX_NB_WORDS = 50000
MAX_SEQUENCE_LENGTH = 200

# The vocabulary is fitted on the oversampled training texts only; the test
# texts are merely encoded with it.
train_texts = X_resampled.flatten()
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(train_texts)

# Encode both splits as integer sequences ...
X_train_seq = tokenizer.texts_to_sequences(train_texts)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# ... and bring them all to MAX_SEQUENCE_LENGTH.
X_train_padded = pad_sequences(X_train_seq, maxlen=MAX_SEQUENCE_LENGTH)
X_test_padded = pad_sequences(X_test_seq, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

def build_lstm_model(max_nb_words, max_sequence_length, num_classes=1):
    """Build and compile a two-layer stacked LSTM text classifier.

    Layers: Embedding(128) -> LSTM(128, returning sequences) -> LSTM(64)
    -> Dense(32, relu) -> output layer.

    Args:
        max_nb_words: Vocabulary size used as the embedding's ``input_dim``.
        max_sequence_length: Length of the padded input sequences. It is
            declared through an explicit ``Input`` so the model is built on
            creation and ``summary()`` can report shapes and parameter counts.
        num_classes: Number of target classes. The default of 1 keeps the
            single sigmoid unit with binary cross-entropy (targets must be
            0/1). Any larger value builds a softmax layer of that width with
            sparse categorical cross-entropy, which expects integer labels in
            ``0 .. num_classes - 1``.

    Returns:
        The compiled Keras model (Adam optimizer, accuracy metric).
    """
    from keras.layers import Input  # local import: the cell does not import Input

    model = Sequential()
    model.add(Input(shape=(max_sequence_length,)))
    model.add(Embedding(input_dim=max_nb_words, output_dim=128))
    model.add(LSTM(128, return_sequences=True))
    model.add(LSTM(64))
    model.add(Dense(32, activation='relu'))

    if num_classes > 1:
        model.add(Dense(num_classes, activation='softmax'))
        loss = 'sparse_categorical_crossentropy'
    else:
        model.add(Dense(1, activation='sigmoid'))
        loss = 'binary_crossentropy'

    model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])
    return model

# Instantiate the stacked-LSTM classifier and print its layer table.
model_lstm = build_lstm_model(
    max_nb_words=MAX_NB_WORDS,
    max_sequence_length=MAX_SEQUENCE_LENGTH,
)
model_lstm.summary()

In [None]:
from keras.layers import Conv1D, MaxPooling1D

def build_cnn_lstm_model(max_nb_words, max_sequence_length, num_classes=1):
    """Build and compile a convolution + LSTM text classifier.

    Layers: Embedding(128) -> Conv1D(64 filters, kernel 3, 'same' padding,
    relu) -> MaxPooling1D(2) -> LSTM(64) -> Dense(32, relu) -> output layer.

    Args:
        max_nb_words: Vocabulary size used as the embedding's ``input_dim``.
        max_sequence_length: Length of the padded input sequences. It is
            declared through an explicit ``Input`` so the model is built on
            creation and ``summary()`` can report shapes and parameter counts.
        num_classes: Number of target classes. The default of 1 keeps the
            single sigmoid unit with binary cross-entropy (targets must be
            0/1). Any larger value builds a softmax layer of that width with
            sparse categorical cross-entropy, which expects integer labels in
            ``0 .. num_classes - 1``.

    Returns:
        The compiled Keras model (Adam optimizer, accuracy metric).
    """
    from keras.layers import Input  # local import: the cell does not import Input

    model = Sequential()
    model.add(Input(shape=(max_sequence_length,)))
    model.add(Embedding(input_dim=max_nb_words, output_dim=128))
    model.add(Conv1D(64, kernel_size=3, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(64))
    model.add(Dense(32, activation='relu'))

    if num_classes > 1:
        model.add(Dense(num_classes, activation='softmax'))
        loss = 'sparse_categorical_crossentropy'
    else:
        model.add(Dense(1, activation='sigmoid'))
        loss = 'binary_crossentropy'

    model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])
    return model

# Instantiate the CNN + LSTM classifier and print its layer table.
model_cnn_lstm = build_cnn_lstm_model(
    max_nb_words=MAX_NB_WORDS,
    max_sequence_length=MAX_SEQUENCE_LENGTH,
)
model_cnn_lstm.summary()

In [None]:
from keras.layers import GRU

def build_gru_lstm_model(max_nb_words, max_sequence_length, num_classes=1):
    """Build and compile a two-layer stacked GRU text classifier.

    Despite the name, the stack contains GRU layers only:
    Embedding(128) -> GRU(64, returning sequences) -> GRU(32)
    -> Dense(32, relu) -> output layer.

    Args:
        max_nb_words: Vocabulary size used as the embedding's ``input_dim``.
        max_sequence_length: Length of the padded input sequences. It is
            declared through an explicit ``Input`` so the model is built on
            creation and ``summary()`` can report shapes and parameter counts.
        num_classes: Number of target classes. The default of 1 keeps the
            single sigmoid unit with binary cross-entropy (targets must be
            0/1). Any larger value builds a softmax layer of that width with
            sparse categorical cross-entropy, which expects integer labels in
            ``0 .. num_classes - 1``.

    Returns:
        The compiled Keras model (Adam optimizer, accuracy metric).
    """
    from keras.layers import Input  # local import: the cell does not import Input

    model = Sequential()
    model.add(Input(shape=(max_sequence_length,)))
    model.add(Embedding(input_dim=max_nb_words, output_dim=128))
    model.add(GRU(64, return_sequences=True))
    model.add(GRU(32))
    model.add(Dense(32, activation='relu'))

    if num_classes > 1:
        model.add(Dense(num_classes, activation='softmax'))
        loss = 'sparse_categorical_crossentropy'
    else:
        model.add(Dense(1, activation='sigmoid'))
        loss = 'binary_crossentropy'

    model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])
    return model

# Instantiate the stacked-GRU classifier and print its layer table.
model_gru_lstm = build_gru_lstm_model(
    max_nb_words=MAX_NB_WORDS,
    max_sequence_length=MAX_SEQUENCE_LENGTH,
)
model_gru_lstm.summary()


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, Dense, Dropout, Embedding, GlobalAveragePooling1D

def transformer_block(inputs, head_size, num_heads, ff_dim, dropout=0):
    """Apply self-attention followed by one dense layer to ``inputs``.

    Steps, in order: multi-head self-attention (query and value are both
    ``inputs``), dropout, layer normalisation, addition of ``inputs`` as a
    residual, a relu Dense layer of width ``ff_dim``, dropout, and a final
    layer normalisation.

    NOTE(review): the residual is added after the first normalisation and the
    dense stage has no residual of its own, which differs from the usual
    transformer encoder layout -- confirm this is intended.

    Args:
        inputs: Tensor fed to the attention layer and reused as the residual.
        head_size: ``key_dim`` of each attention head.
        num_heads: Number of attention heads.
        ff_dim: Width of the dense layer, and so of the returned tensor's
            last axis.
        dropout: Rate used by both Dropout layers (0 disables them).

    Returns:
        The tensor produced by the final layer normalisation.
    """
    attended = MultiHeadAttention(key_dim=head_size, num_heads=num_heads)(inputs, inputs)
    attended = Dropout(dropout)(attended)
    attended = LayerNormalization(epsilon=1e-6)(attended)

    # Residual connection around the attention stage.
    skip_sum = attended + inputs

    projected = Dense(ff_dim, activation='relu')(skip_sum)
    projected = Dropout(dropout)(projected)
    return LayerNormalization(epsilon=1e-6)(projected)

def build_transformer_model(max_nb_words, max_sequence_length, num_classes=1):
    """Build and compile a single-block transformer text classifier.

    Layers: Input -> Embedding(128) -> one ``transformer_block`` (8 heads,
    head size 128, dense width 128) -> GlobalAveragePooling1D
    -> Dense(32, relu) -> output layer.

    Args:
        max_nb_words: Vocabulary size used as the embedding's ``input_dim``.
        max_sequence_length: Length of the padded input sequences; fixes the
            shape of the input layer.
        num_classes: Number of target classes. The default of 1 keeps the
            single sigmoid unit with binary cross-entropy (targets must be
            0/1). Any larger value builds a softmax layer of that width with
            sparse categorical cross-entropy, which expects integer labels in
            ``0 .. num_classes - 1``.

    Returns:
        The compiled Keras functional model (Adam optimizer, accuracy metric).
    """
    input_layer = tf.keras.layers.Input(shape=(max_sequence_length,))
    x = Embedding(input_dim=max_nb_words, output_dim=128)(input_layer)
    x = transformer_block(x, head_size=128, num_heads=8, ff_dim=128)
    x = GlobalAveragePooling1D()(x)
    x = Dense(32, activation='relu')(x)

    if num_classes > 1:
        outputs = Dense(num_classes, activation='softmax')(x)
        loss = 'sparse_categorical_crossentropy'
    else:
        outputs = Dense(1, activation='sigmoid')(x)  # single probability for 0/1 targets
        loss = 'binary_crossentropy'

    model = tf.keras.models.Model(inputs=input_layer, outputs=outputs)
    model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])
    return model

# Instantiate the transformer classifier and print its layer table.
model_transformer = build_transformer_model(
    max_nb_words=MAX_NB_WORDS,
    max_sequence_length=MAX_SEQUENCE_LENGTH,
)
model_transformer.summary()

In [None]:
# X_train_padded is tokenised from X_resampled (the oversampled training
# texts), so the targets that line up with it row-for-row are Y_resampled.
# Training against Y_train and slicing the inputs down to its length keeps
# only the leading rows and discards the rest of the oversampled set.
train_labels = Y_resampled

print("Shape of X_train_padded:", X_train_padded.shape)
print("Shape of Y_resampled:", train_labels.shape)

# Fail loudly on a mismatch instead of truncating the inputs to fit: a
# mismatch here means the cells above were run out of order or X_train_padded
# was modified after tokenisation.
assert X_train_padded.shape[0] == len(train_labels), (
    f"X_train_padded has {X_train_padded.shape[0]} rows but Y_resampled has "
    f"{len(train_labels)}; re-run the resampling and tokenisation cells."
)

# NOTE(review): the training log recorded for this cell shows a negative
# binary cross-entropy with accuracy stuck near 0.20. That loss cannot go
# below zero for 0/1 targets, so 'label' presumably holds more than two
# classes -- confirm, and if so give the models a softmax output sized to the
# class count with sparse_categorical_crossentropy.
# NOTE(review): the test split doubles as validation data here, so validation
# metrics are not an independent estimate once they guide any decision.

# Define the number of epochs and batch size
epochs = 10  # You can adjust this value as needed
batch_size = 32  # You can adjust this value as needed

# Settings shared by all four training runs.
fit_kwargs = dict(
    validation_data=(X_test_padded, Y_test),
    epochs=epochs,
    batch_size=batch_size,
)

# Train LSTM model
history_lstm = model_lstm.fit(X_train_padded, train_labels, **fit_kwargs)

# Train CNN-LSTM model
history_cnn_lstm = model_cnn_lstm.fit(X_train_padded, train_labels, **fit_kwargs)

# Train GRU-LSTM model
history_gru_lstm = model_gru_lstm.fit(X_train_padded, train_labels, **fit_kwargs)

# Train Transformer model
history_transformer = model_transformer.fit(X_train_padded, train_labels, **fit_kwargs)

Shape of X_train_padded: (6680, 200)
Shape of Y_train: (6680,)
Epoch 1/10
[1m209/209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 577ms/step - accuracy: 0.2036 - loss: -11.0709 - val_accuracy: 0.2029 - val_loss: -63.3043
Epoch 2/10
[1m209/209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 569ms/step - accuracy: 0.2066 - loss: -94.6815 - val_accuracy: 0.2029 - val_loss: -220.0691
Epoch 3/10
[1m209/209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 614ms/step - accuracy: 0.2038 - loss: -275.4050 - val_accuracy: 0.2029 - val_loss: -474.6429
Epoch 4/10
[1m209/209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 617ms/step - accuracy: 0.2049 - loss: -563.5254 - val_accuracy: 0.2029 - val_loss: -818.9836
Epoch 5/10
[1m209/209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 567ms/step - accuracy: 0.2067 - loss: -938.6503 - val_accuracy: 0.2029 - val_loss: -1247.2324
Epoch 6/10
[1m209/209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 

In [None]:
# Report how many rows the padded inputs and the Y_train targets have.
print("Shape of X_train_padded:", X_train_padded.shape)
print("Shape of Y_train:", Y_train.shape)

# On a row-count mismatch, warn and keep only as many leading input rows as
# there are targets.
# NOTE(review): this assumes the first len(Y_train) rows of X_train_padded are
# exactly the rows Y_train describes -- confirm against the oversampling step
# that produced the inputs.
if not X_train_padded.shape[0] == Y_train.shape[0]:
    print(f"WARNING: Input and target data have different number of samples.")
    print(f"X_train_padded samples: {X_train_padded.shape[0]}")
    print(f"Y_train samples: {Y_train.shape[0]}")
    X_train_padded = X_train_padded[:len(Y_train)]


In [None]:
# # Check the shapes of X_train_padded and Y_train
# print("Shape of X_train_padded:", X_train_padded.shape)
# print("Shape of Y_train:", Y_train.shape)

# # Instead of assertion, investigate the issue
# if X_train_padded.shape[0] != Y_train.shape[0]:
#     print(f"WARNING: Input and target data have different number of samples.")
#     print(f"X_train_padded samples: {X_train_padded.shape[0]}")
#     print(f"Y_train samples: {Y_train.shape[0]}")
#     # Add code here to investigate and fix the mismatch.
#     # This may involve revisiting data loading/preprocessing steps.
#     # For example, check if data splitting was done correctly.

#     # Assuming X_train_padded contains duplicated data,
#     # take only the first half:
#     X_train_padded = X_train_padded[:Y_train.shape[0]]
#     print("Shape of X_train_padded after truncation:", X_train_padded.shape)

# # Train LSTM model
# history_lstm = model_lstm.fit(X_train_padded, Y_train, validation_data=(X_test_padded, Y_test), epochs=epochs, batch_size=batch_size)

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Function to evaluate the model
# NOTE(review): an identical evaluate_model is defined again in the next cell;
# one of the two cells can be removed.
def evaluate_model(model, X_test, Y_test, model_name):
    """Print accuracy and weighted precision / recall / F1 for ``model``.

    Predictions are turned into class labels according to the output shape:
    a single output column is thresholded at 0.5, while an output with
    several columns (one score per class) is reduced with argmax. The labels
    are always handed to scikit-learn as a 1-D array.

    Args:
        model: Object whose ``predict(X_test)`` returns per-sample scores.
        X_test: Inputs passed to ``model.predict``.
        Y_test: True labels, compared against the predicted labels.
        model_name: Name shown in the printed report.

    Returns:
        None. The four metrics are printed with four decimals.
    """
    scores = np.asarray(model.predict(X_test))
    if scores.ndim > 1 and scores.shape[-1] > 1:
        # One score per class: the predicted label is the best-scoring column.
        y_pred = scores.argmax(axis=-1)
    else:
        # Single probability per sample: 0.5 decision threshold.
        y_pred = (scores > 0.5).astype(int).ravel()

    accuracy = accuracy_score(Y_test, y_pred)
    precision = precision_score(Y_test, y_pred, average='weighted')
    recall = recall_score(Y_test, y_pred, average='weighted')
    f1 = f1_score(Y_test, y_pred, average='weighted')

    print(f"Evaluation metrics for {model_name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    print("\n")

# Evaluate each model on the padded test set, in the order they were trained.
for fitted_model, display_name in (
    (model_lstm, "LSTM"),
    (model_cnn_lstm, "CNN-LSTM"),
    (model_gru_lstm, "GRU-LSTM"),
    (model_transformer, "Transformer"),
):
    evaluate_model(fitted_model, X_test_padded, Y_test, display_name)

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Function to evaluate the model
# NOTE(review): this cell repeats the evaluation cell directly above it and
# silently replaces that definition; one of the two cells can be removed.
def evaluate_model(model, X_test, Y_test, model_name):
    """Print accuracy and weighted precision / recall / F1 for ``model``.

    Predictions are turned into class labels according to the output shape:
    a single output column is thresholded at 0.5, while an output with
    several columns (one score per class) is reduced with argmax. The labels
    are always handed to scikit-learn as a 1-D array.

    Args:
        model: Object whose ``predict(X_test)`` returns per-sample scores.
        X_test: Inputs passed to ``model.predict``.
        Y_test: True labels, compared against the predicted labels.
        model_name: Name shown in the printed report.

    Returns:
        None. The four metrics are printed with four decimals.
    """
    scores = np.asarray(model.predict(X_test))
    if scores.ndim > 1 and scores.shape[-1] > 1:
        # One score per class: the predicted label is the best-scoring column.
        y_pred = scores.argmax(axis=-1)
    else:
        # Single probability per sample: 0.5 decision threshold.
        y_pred = (scores > 0.5).astype(int).ravel()

    accuracy = accuracy_score(Y_test, y_pred)
    precision = precision_score(Y_test, y_pred, average='weighted')
    recall = recall_score(Y_test, y_pred, average='weighted')
    f1 = f1_score(Y_test, y_pred, average='weighted')

    print(f"Evaluation metrics for {model_name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    print("\n")

# Evaluate each model on the padded test set, in the order they were trained.
for fitted_model, display_name in (
    (model_lstm, "LSTM"),
    (model_cnn_lstm, "CNN-LSTM"),
    (model_gru_lstm, "GRU-LSTM"),
    (model_transformer, "Transformer"),
):
    evaluate_model(fitted_model, X_test_padded, Y_test, display_name)

In [None]:
def plot_training_history(history, model_name):
    """Plot training and validation accuracy and loss, side by side.

    Args:
        history: Object whose ``history`` attribute is a mapping with the
            keys 'accuracy', 'val_accuracy', 'loss' and 'val_loss', each
            holding one value per epoch.
        model_name: Name used as the prefix of both panel titles.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Imported here because no cell visible in this notebook imports
    # matplotlib, which left `plt` undefined on a fresh kernel.
    import matplotlib.pyplot as plt

    fig, (ax_accuracy, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

    # Left panel: accuracy; right panel: loss. Same layout for both.
    for ax, metric, label in (
        (ax_accuracy, 'accuracy', 'Accuracy'),
        (ax_loss, 'loss', 'Loss'),
    ):
        ax.plot(history.history[metric], label=f'Train {label}')
        ax.plot(history.history[f'val_{metric}'], label=f'Validation {label}')
        ax.set_title(f'{model_name} {label}')
        ax.set_ylabel(label)
        ax.set_xlabel('Epoch')
        ax.legend()

    fig.tight_layout()
    plt.show()

# Draw the accuracy/loss curves of every training run, one figure per model.
for run_history, run_name in (
    (history_lstm, "LSTM"),
    (history_cnn_lstm, "CNN-LSTM"),
    (history_gru_lstm, "GRU-LSTM"),
    (history_transformer, "Transformer"),
):
    plot_training_history(run_history, run_name)

In [None]:
# Write each trained model to its own HDF5 file in the working directory.
for trained_model, target_path in (
    (model_lstm, 'model_lstm.h5'),
    (model_cnn_lstm, 'model_cnn_lstm.h5'),
    (model_gru_lstm, 'model_gru_lstm.h5'),
    (model_transformer, 'model_transformer.h5'),
):
    trained_model.save(target_path)

print("Models saved successfully!")

In [None]:
from tensorflow.keras.models import load_model

# Read the four saved models back from disk, in the order they were saved.
(
    loaded_model_lstm,
    loaded_model_cnn_lstm,
    loaded_model_gru_lstm,
    loaded_model_transformer,
) = [
    load_model(saved_path)
    for saved_path in (
        'model_lstm.h5',
        'model_cnn_lstm.h5',
        'model_gru_lstm.h5',
        'model_transformer.h5',
    )
]

print("Models loaded successfully!")

In [None]:
from kerastuner import HyperModel, RandomSearch

class LSTMHyperModel(HyperModel):
    """Search space for the two-layer stacked LSTM classifier.

    Tuned hyperparameters:
      * 'embedding_dim': embedding width, 64 to 256 in steps of 64
      * 'lstm_units':    units of the first LSTM, 32 to 128 in steps of 32
      * 'lstm_units_2':  units of the second LSTM, 32 to 128 in steps of 32

    The vocabulary size comes from the notebook-level MAX_NB_WORDS. The head
    (Dense(32, relu) then one sigmoid unit) and the compile settings are fixed.
    """

    def build(self, hp):
        """Return a compiled model for the hyperparameter values in ``hp``."""
        embedding_dim = hp.Int('embedding_dim', 64, 256, step=64)
        first_lstm_units = hp.Int('lstm_units', 32, 128, step=32)
        second_lstm_units = hp.Int('lstm_units_2', 32, 128, step=32)

        model = Sequential([
            Embedding(input_dim=MAX_NB_WORDS, output_dim=embedding_dim),
            LSTM(first_lstm_units, return_sequences=True),
            LSTM(second_lstm_units),
            Dense(32, activation='relu'),
            Dense(1, activation='sigmoid'),
        ])
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

# Hypermodel that describes the LSTM search space.
hypermodel = LSTMHyperModel()

# Random search over that space: five trials, one training run per trial,
# ranked by validation accuracy. Tuner state is written to my_dir/lstm_tuning.
# NOTE(review): presumably an existing my_dir/lstm_tuning from an earlier run
# is picked up again -- confirm whether a fresh search is wanted.
search_settings = dict(
    objective='val_accuracy',
    max_trials=5,
    executions_per_trial=1,
    directory='my_dir',
    project_name='lstm_tuning',
)
tuner = RandomSearch(hypermodel, **search_settings)

# Run the search.
# NOTE(review): the test split is used both to pick the best trial here and to
# score it below, so the final metrics are likely optimistic.
tuner.search(
    X_train_padded,
    Y_train,
    epochs=10,
    validation_data=(X_test_padded, Y_test),
    batch_size=64,
)

# Take the top-ranked model from the search and show its architecture.
top_models = tuner.get_best_models(num_models=1)
best_model = top_models[0]
best_model.summary()

# Report test-set metrics for that model.
evaluate_model(best_model, X_test_padded, Y_test, "Best LSTM Hyperparameter Model")