LSTM, BiLSTM, RNN with Word2Vec:

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, SimpleRNN, Bidirectional
from tensorflow.keras import backend as K
from gensim.models import Word2Vec

# Google Colab Setup: Uncomment if running on Colab
# from google.colab import drive
# drive.mount('/content/drive')
# data_path = '/content/drive/MyDrive/Airport feedback.xlsx'

# Name of the free-text column to model; change it if your sheet uses a different header.
feedback_column = 'Airport Service Freeform Feedback'

# Read the feedback workbook (swap in `data_path` when running on Colab).
data = pd.read_excel('Airport feedback.xlsx')

# Show the headers exactly as stored in the file, before any clean-up.
print("Columns in the dataset:", data.columns)

# Trim stray whitespace around header names so the column lookup is exact.
data = data.rename(columns=lambda header: header.strip())

print(f"Using column '{feedback_column}' for processing.")

# Preprocessing the dataset
def preprocess_text(text):
    """Normalise one feedback entry for tokenisation.

    Missing values (``None`` / ``NaN``) become an empty string. Any other
    value is converted to ``str``, lower-cased, and stripped of every
    character that is neither an ASCII letter nor whitespace.

    Args:
        text: Raw cell value from the feedback column.

    Returns:
        The cleaned, lower-case string (whitespace is preserved).
    """
    if pd.isna(text):  # Handle NaN values
        return ""
    text = str(text).lower()  # Convert to lowercase
    # The class must use a single backslash: inside a raw string, '\\s' is a
    # literal backslash plus the letter 's', which made the old pattern
    # delete every space and glue all words of a feedback into one token.
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers
    return text

# Normalise every feedback entry before any modelling step.
data[feedback_column] = data[feedback_column].apply(preprocess_text)

# Features are the cleaned feedback strings.
X = data[feedback_column]

# Placeholder targets: the labels are drawn at random, so the models have
# nothing to learn and accuracy is expected to hover around 50% until real
# labels are supplied.
y = np.random.randint(2, size=len(X))

# 80/20 train/test split with a fixed seed for the shuffling.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The Keras tokenizer and gensim both want plain lists of strings.
X_train, X_test = X_train.tolist(), X_test.tolist()

# Build the vocabulary from the training split only; index 0 is reserved
# for padding, hence the extra slot in `vocab_size`.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
vocab_size = 1 + len(tokenizer.word_index)

# Map each text to its sequence of word indices.
X_train_seq, X_test_seq = map(tokenizer.texts_to_sequences, (X_train, X_test))

# Pad (at the end) or truncate every sequence to a fixed length.
maxlen = 100
X_train_pad, X_test_pad = (
    pad_sequences(sequences, padding='post', maxlen=maxlen)
    for sequences in (X_train_seq, X_test_seq)
)

# Train Word2Vec on the non-empty training texts, split on whitespace.
word2vec_model = Word2Vec(
    sentences=[text.split() for text in X_train if text],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Row i of the matrix holds the vector of the word with tokenizer index i;
# words unknown to Word2Vec keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in tokenizer.word_index.items():
    if word not in word2vec_model.wv:
        continue
    embedding_matrix[i] = word2vec_model.wv[word]

# Define RNN Model
def create_rnn_model():
    """Build and compile a SimpleRNN binary classifier on frozen embeddings.

    Reads the module-level ``vocab_size`` and ``embedding_matrix``.

    Returns:
        A compiled ``Sequential`` model: Embedding -> SimpleRNN(128, relu) ->
        Dense(1, sigmoid), using Adam and binary cross-entropy and reporting
        accuracy.
    """
    model = Sequential()
    model.add(Embedding(
        input_dim=vocab_size,
        # Take the width from the matrix itself so the layer can never
        # disagree with the vectors it is initialised from.
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        # Inputs are post-padded with index 0; masking it keeps the recurrent
        # layer from stepping through the padding before its final state.
        mask_zero=True,
        # The deprecated `input_length` argument is intentionally not passed.
        trainable=False,
    ))
    model.add(SimpleRNN(128, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Define LSTM Model
def create_lstm_model():
    """Build and compile an LSTM binary classifier on frozen embeddings.

    Reads the module-level ``vocab_size`` and ``embedding_matrix``.

    Returns:
        A compiled ``Sequential`` model: Embedding -> LSTM(128, relu) ->
        Dense(1, sigmoid), using Adam and binary cross-entropy and reporting
        accuracy.
    """
    model = Sequential()
    model.add(Embedding(
        input_dim=vocab_size,
        # Take the width from the matrix itself so the layer can never
        # disagree with the vectors it is initialised from.
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        # Inputs are post-padded with index 0; masking it keeps the recurrent
        # layer from stepping through the padding before its final state.
        mask_zero=True,
        # The deprecated `input_length` argument is intentionally not passed.
        trainable=False,
    ))
    model.add(LSTM(128, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Define BiLSTM Model
def create_bilstm_model():
    """Build and compile a bidirectional-LSTM binary classifier on frozen embeddings.

    Reads the module-level ``vocab_size`` and ``embedding_matrix``.

    Returns:
        A compiled ``Sequential`` model: Embedding ->
        Bidirectional(LSTM(128, relu)) -> Dense(1, sigmoid), using Adam and
        binary cross-entropy and reporting accuracy.
    """
    model = Sequential()
    model.add(Embedding(
        input_dim=vocab_size,
        # Take the width from the matrix itself so the layer can never
        # disagree with the vectors it is initialised from.
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        # Inputs are post-padded with index 0; masking it keeps the recurrent
        # layers from stepping through the padding.
        mask_zero=True,
        # The deprecated `input_length` argument is intentionally not passed.
        trainable=False,
    ))
    model.add(Bidirectional(LSTM(128, activation='relu')))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Fit the three architectures one after another and score each on the held-out split.

# --- SimpleRNN ---
rnn_model = create_rnn_model()
print("Training RNN Model...")
rnn_model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_test_pad, y_test),
)

# evaluate() yields the loss followed by the compiled metric; keep the accuracy.
rnn_accuracy = rnn_model.evaluate(x=X_test_pad, y=y_test)[1]
print(f"RNN Model Test Accuracy: {rnn_accuracy * 100:.2f}%")

# --- LSTM ---
lstm_model = create_lstm_model()
print("Training LSTM Model...")
lstm_model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_test_pad, y_test),
)

lstm_accuracy = lstm_model.evaluate(x=X_test_pad, y=y_test)[1]
print(f"LSTM Model Test Accuracy: {lstm_accuracy * 100:.2f}%")

# --- Bidirectional LSTM ---
bilstm_model = create_bilstm_model()
print("Training BiLSTM Model...")
bilstm_model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_test_pad, y_test),
)

bilstm_accuracy = bilstm_model.evaluate(x=X_test_pad, y=y_test)[1]
print(f"BiLSTM Model Test Accuracy: {bilstm_accuracy * 100:.2f}%")

# Side-by-side recap of the three test accuracies.
print("\nTotal Accuracy of Each Model:")
print(f"RNN Accuracy: {rnn_accuracy * 100:.2f}%")
print(f"LSTM Accuracy: {lstm_accuracy * 100:.2f}%")
print(f"BiLSTM Accuracy: {bilstm_accuracy * 100:.2f}%")


Columns in the dataset: Index(['Airport Service Freeform Feedback'], dtype='object')
Using column 'Airport Service Freeform Feedback' for processing.
Training RNN Model...
Epoch 1/5




[1m1678/1678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 41ms/step - accuracy: 0.4971 - loss: 0.6932 - val_accuracy: 0.4996 - val_loss: 0.6933
Epoch 2/5
[1m1678/1678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 38ms/step - accuracy: 0.4987 - loss: 0.6932 - val_accuracy: 0.4996 - val_loss: 0.6932
Epoch 3/5
[1m1678/1678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 39ms/step - accuracy: 0.5018 - loss: 0.6932 - val_accuracy: 0.5004 - val_loss: 0.6932
Epoch 4/5
[1m1678/1678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 38ms/step - accuracy: 0.4988 - loss: 0.6932 - val_accuracy: 0.4996 - val_loss: 0.6932
Epoch 5/5
[1m1678/1678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 39ms/step - accuracy: 0.5001 - loss: 0.6931 - val_accuracy: 0.4996 - val_loss: 0.6932
[1m420/420[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.5035 - loss: 0.6931
RNN Model Test Accuracy: 49.96%
Training LSTM Model...
Epoch 1/5
[1m1678/1

LSTM, BiLSTM, RNN with GloVe:

In [5]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, SimpleRNN, Bidirectional
from tensorflow.keras import backend as K
from gensim.models import Word2Vec

# Google Colab Setup: Uncomment if running on Colab
# from google.colab import drive
# drive.mount('/content/drive')
# data_path = '/content/drive/MyDrive/Airport feedback.xlsx'

# Name of the free-text column to model; change it if your sheet uses a different header.
feedback_column = 'Airport Service Freeform Feedback'

# Read the feedback workbook (swap in `data_path` when running on Colab).
data = pd.read_excel('Airport feedback.xlsx')

# Show the headers exactly as stored in the file, before any clean-up.
print("Columns in the dataset:", data.columns)

# Trim stray whitespace around header names so the column lookup is exact.
data = data.rename(columns=lambda header: header.strip())

print(f"Using column '{feedback_column}' for processing.")

# Preprocessing the dataset
def preprocess_text(text):
    """Normalise one feedback entry for tokenisation.

    Missing values (``None`` / ``NaN``) become an empty string. Any other
    value is converted to ``str``, lower-cased, and stripped of every
    character that is neither an ASCII letter nor whitespace.

    Args:
        text: Raw cell value from the feedback column.

    Returns:
        The cleaned, lower-case string (whitespace is preserved).
    """
    if pd.isna(text):  # Handle NaN values
        return ""
    text = str(text).lower()  # Convert to lowercase
    # The class must use a single backslash: inside a raw string, '\\s' is a
    # literal backslash plus the letter 's', which made the old pattern
    # delete every space and glue all words of a feedback into one token.
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers
    return text

# Normalise every feedback entry before any modelling step.
data[feedback_column] = data[feedback_column].apply(preprocess_text)

# Features are the cleaned feedback strings.
X = data[feedback_column]

# Placeholder targets: the labels are drawn at random, so the models have
# nothing to learn and accuracy is expected to hover around 50% until real
# labels are supplied.
y = np.random.randint(2, size=len(X))

# 80/20 train/test split with a fixed seed for the shuffling.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The Keras tokenizer wants plain lists of strings.
X_train, X_test = X_train.tolist(), X_test.tolist()

# Build the vocabulary from the training split only; index 0 is reserved
# for padding, hence the extra slot in `vocab_size`.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
vocab_size = 1 + len(tokenizer.word_index)

# Map each text to its sequence of word indices.
X_train_seq, X_test_seq = map(tokenizer.texts_to_sequences, (X_train, X_test))

# Pad (at the end) or truncate every sequence to a fixed length.
maxlen = 100
X_train_pad, X_test_pad = (
    pad_sequences(sequences, padding='post', maxlen=maxlen)
    for sequences in (X_train_seq, X_test_seq)
)

# GloVe Embedding (load GloVe vectors from file)
def load_glove_embeddings(glove_file_path, tokenizer, embedding_dim=100):
    """Build an embedding matrix for the tokenizer's vocabulary from a GloVe text file.

    Each line of the file is expected to be ``word v1 v2 ... vN`` separated by
    whitespace. Lines that do not carry exactly ``embedding_dim`` values
    (blank lines, a truncated final line of a partial download, vectors of a
    different width) are skipped instead of crashing the load.

    Args:
        glove_file_path: Path to the GloVe ``.txt`` file (read as UTF-8).
        tokenizer: Object exposing ``word_index``, a ``{word: index}`` mapping.
        embedding_dim: Expected vector width; defaults to 100.

    Returns:
        A ``(len(tokenizer.word_index) + 1, embedding_dim)`` array whose row
        ``i`` is the vector of the word with index ``i``. Row 0 and the rows
        of words missing from the file stay all-zero.
    """
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # One token for the word plus `embedding_dim` coefficients;
            # anything else is malformed and would break the assignment below.
            if len(values) != embedding_dim + 1:
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    print(f'Found {len(embeddings_index)} word vectors.')

    # Create the embedding matrix
    embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
    for word, i in tokenizer.word_index.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[i] = vector
    return embedding_matrix

# Location of the pre-trained GloVe text file; adjust to wherever it was downloaded.
glove_file_path = '/content/glove.6B.100d.txt'

# Embedding matrix aligned with the tokenizer's word indices.
embedding_matrix = load_glove_embeddings(
    glove_file_path=glove_file_path,
    tokenizer=tokenizer,
)

# Define RNN Model
def create_rnn_model():
    """Build and compile a SimpleRNN binary classifier on frozen embeddings.

    Reads the module-level ``vocab_size`` and ``embedding_matrix``.

    Returns:
        A compiled ``Sequential`` model: Embedding -> SimpleRNN(128, relu) ->
        Dense(1, sigmoid), using Adam and binary cross-entropy and reporting
        accuracy.
    """
    model = Sequential()
    model.add(Embedding(
        input_dim=vocab_size,
        # Take the width from the matrix itself so the layer can never
        # disagree with the vectors it is initialised from.
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        # Inputs are post-padded with index 0; masking it keeps the recurrent
        # layer from stepping through the padding before its final state.
        mask_zero=True,
        # The deprecated `input_length` argument is intentionally not passed.
        trainable=False,
    ))
    model.add(SimpleRNN(128, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Define LSTM Model
def create_lstm_model():
    """Build and compile an LSTM binary classifier on frozen embeddings.

    Reads the module-level ``vocab_size`` and ``embedding_matrix``.

    Returns:
        A compiled ``Sequential`` model: Embedding -> LSTM(128, relu) ->
        Dense(1, sigmoid), using Adam and binary cross-entropy and reporting
        accuracy.
    """
    model = Sequential()
    model.add(Embedding(
        input_dim=vocab_size,
        # Take the width from the matrix itself so the layer can never
        # disagree with the vectors it is initialised from.
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        # Inputs are post-padded with index 0; masking it keeps the recurrent
        # layer from stepping through the padding before its final state.
        mask_zero=True,
        # The deprecated `input_length` argument is intentionally not passed.
        trainable=False,
    ))
    model.add(LSTM(128, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Define BiLSTM Model
def create_bilstm_model():
    """Build and compile a bidirectional-LSTM binary classifier on frozen embeddings.

    Reads the module-level ``vocab_size`` and ``embedding_matrix``.

    Returns:
        A compiled ``Sequential`` model: Embedding ->
        Bidirectional(LSTM(128, relu)) -> Dense(1, sigmoid), using Adam and
        binary cross-entropy and reporting accuracy.
    """
    model = Sequential()
    model.add(Embedding(
        input_dim=vocab_size,
        # Take the width from the matrix itself so the layer can never
        # disagree with the vectors it is initialised from.
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        # Inputs are post-padded with index 0; masking it keeps the recurrent
        # layers from stepping through the padding.
        mask_zero=True,
        # The deprecated `input_length` argument is intentionally not passed.
        trainable=False,
    ))
    model.add(Bidirectional(LSTM(128, activation='relu')))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Fit the three architectures one after another and score each on the held-out split.

# --- SimpleRNN ---
rnn_model = create_rnn_model()
print("Training RNN Model...")
rnn_model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_test_pad, y_test),
)

# evaluate() yields the loss followed by the compiled metric; keep the accuracy.
rnn_accuracy = rnn_model.evaluate(x=X_test_pad, y=y_test)[1]
print(f"RNN Model Test Accuracy: {rnn_accuracy * 100:.2f}%")

# --- LSTM ---
lstm_model = create_lstm_model()
print("Training LSTM Model...")
lstm_model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_test_pad, y_test),
)

lstm_accuracy = lstm_model.evaluate(x=X_test_pad, y=y_test)[1]
print(f"LSTM Model Test Accuracy: {lstm_accuracy * 100:.2f}%")

# --- Bidirectional LSTM ---
bilstm_model = create_bilstm_model()
print("Training BiLSTM Model...")
bilstm_model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_test_pad, y_test),
)

bilstm_accuracy = bilstm_model.evaluate(x=X_test_pad, y=y_test)[1]
print(f"BiLSTM Model Test Accuracy: {bilstm_accuracy * 100:.2f}%")

# Side-by-side recap of the three test accuracies.
print("\nTotal Accuracy of Each Model:")
print(f"RNN Accuracy: {rnn_accuracy * 100:.2f}%")
print(f"LSTM Accuracy: {lstm_accuracy * 100:.2f}%")
print(f"BiLSTM Accuracy: {bilstm_accuracy * 100:.2f}%")


Columns in the dataset: Index(['Airport Service Freeform Feedback'], dtype='object')
Using column 'Airport Service Freeform Feedback' for processing.
Found 30555 word vectors.
Training RNN Model...
Epoch 1/5




[1m1678/1678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 45ms/step - accuracy: 0.5031 - loss: 0.6932 - val_accuracy: 0.4980 - val_loss: 0.6932
Epoch 2/5
[1m1678/1678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 45ms/step - accuracy: 0.4975 - loss: 0.6932 - val_accuracy: 0.5020 - val_loss: 0.6931
Epoch 3/5
[1m1678/1678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 44ms/step - accuracy: 0.5014 - loss: 0.6932 - val_accuracy: 0.4980 - val_loss: 0.6932
Epoch 4/5
[1m1678/1678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 41ms/step - accuracy: 0.5000 - loss: 0.6932 - val_accuracy: 0.4980 - val_loss: 0.6932
Epoch 5/5
[1m1678/1678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 45ms/step - accuracy: 0.4987 - loss: 0.6932 - val_accuracy: 0.4980 - val_loss: 0.6932
[1m420/420[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - accuracy: 0.5012 - loss: 0.6931
RNN Model Test Accuracy: 49.80%
Training LSTM Model...
Epoch 1/5
[1m1678/1

LSTM, BiLSTM, RNN with FastText:

In [3]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp310-cp310-linux_x86_64.whl size=4296228 sha256=f7a568bba023db7f16e801bbfb55fdb691682a6c3f15f53a9a97a95f1ee5a8d8
  Stored in directory: /root/.cache/pip/wheels/0d/a2/00/81db54d3e6a8199b829d58

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, SimpleRNN, Bidirectional
from tensorflow.keras import backend as K
import fasttext  # Importing the FastText library

# Google Colab Setup: Uncomment if running on Colab
# from google.colab import drive
# drive.mount('/content/drive')
# data_path = '/content/drive/MyDrive/Airport feedback.xlsx'

# Name of the free-text column to model; change it if your sheet uses a different header.
feedback_column = 'Airport Service Freeform Feedback'

# Read the feedback workbook (swap in `data_path` when running on Colab).
data = pd.read_excel('Airport feedback.xlsx')

# Show the headers exactly as stored in the file, before any clean-up.
print("Columns in the dataset:", data.columns)

# Trim stray whitespace around header names so the column lookup is exact.
data = data.rename(columns=lambda header: header.strip())

print(f"Using column '{feedback_column}' for processing.")

# Preprocessing the dataset
def preprocess_text(text):
    """Normalise one feedback entry for tokenisation.

    Missing values (``None`` / ``NaN``) become an empty string. Any other
    value is converted to ``str``, lower-cased, and stripped of every
    character that is neither an ASCII letter nor whitespace.

    Args:
        text: Raw cell value from the feedback column.

    Returns:
        The cleaned, lower-case string (whitespace is preserved).
    """
    if pd.isna(text):  # Handle NaN values
        return ""
    text = str(text).lower()  # Convert to lowercase
    # The class must use a single backslash: inside a raw string, '\\s' is a
    # literal backslash plus the letter 's', which made the old pattern
    # delete every space and glue all words of a feedback into one token.
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers
    return text

# Normalise every feedback entry before any modelling step.
data[feedback_column] = data[feedback_column].apply(preprocess_text)

# Features are the cleaned feedback strings.
X = data[feedback_column]

# Placeholder targets: the labels are drawn at random, so the models have
# nothing to learn until real labels are supplied.
y = np.random.randint(2, size=len(X))

# 80/20 train/test split with a fixed seed for the shuffling.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The Keras tokenizer wants plain lists of strings.
X_train, X_test = X_train.tolist(), X_test.tolist()

# Build the vocabulary from the training split only; index 0 is reserved
# for padding, hence the extra slot in `vocab_size`.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
vocab_size = 1 + len(tokenizer.word_index)

# Map each text to its sequence of word indices.
X_train_seq, X_test_seq = map(tokenizer.texts_to_sequences, (X_train, X_test))

# Pad (at the end) or truncate every sequence to a fixed length.
maxlen = 100
X_train_pad, X_test_pad = (
    pad_sequences(sequences, padding='post', maxlen=maxlen)
    for sequences in (X_train_seq, X_test_seq)
)

# FastText Embedding (load FastText vectors from file)
def load_fasttext_embeddings(fasttext_file_path, tokenizer, embedding_dim=100):
    """Build an embedding matrix for the tokenizer's vocabulary from a FastText model.

    The binary model is loaded and, when its vectors are wider than
    ``embedding_dim``, reduced in place to that width so every row fits the
    matrix. Previously a 300-d model was written into a 100-column matrix,
    which fails on the first assignment.

    Args:
        fasttext_file_path: Path to a FastText ``.bin`` model.
        tokenizer: Object exposing ``word_index``, a ``{word: index}`` mapping.
        embedding_dim: Width of the returned matrix; defaults to 100.

    Returns:
        A ``(len(tokenizer.word_index) + 1, embedding_dim)`` array whose row
        ``i`` is the FastText vector of the word with index ``i``; row 0
        stays all-zero.

    Raises:
        ValueError: If the model's vectors are narrower than ``embedding_dim``.
    """
    # Local import: the file's import cell only pulls in the top-level package,
    # and `reduce_model` lives in the `fasttext.util` submodule.
    import fasttext.util

    # Loading pre-trained FastText model
    model = fasttext.load_model(fasttext_file_path)  # FastText model
    print('Loaded FastText model.')

    model_dim = model.get_dimension()
    if model_dim < embedding_dim:
        raise ValueError(
            f"FastText model has {model_dim}-dimensional vectors, "
            f"which is fewer than the requested {embedding_dim}."
        )
    if model_dim > embedding_dim:
        # Shrinks the loaded model's vectors to `embedding_dim` in place.
        fasttext.util.reduce_model(model, embedding_dim)

    # Create the embedding matrix
    embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
    for word, i in tokenizer.word_index.items():
        # Get word vector from FastText model
        embedding_matrix[i] = model.get_word_vector(word)
    return embedding_matrix

# Location of the pre-trained FastText binary model; adjust to wherever it was downloaded.
# NOTE(review): the file name suggests 300-dimensional vectors while the rest of
# this cell works with 100 — confirm the loader copes with that.
fasttext_file_path = 'cc.en.300.bin'

# Embedding matrix aligned with the tokenizer's word indices.
embedding_matrix = load_fasttext_embeddings(
    fasttext_file_path=fasttext_file_path,
    tokenizer=tokenizer,
)

# Define RNN Model
def create_rnn_model():
    """Build and compile a SimpleRNN binary classifier on frozen embeddings.

    Reads the module-level ``vocab_size`` and ``embedding_matrix``.

    Returns:
        A compiled ``Sequential`` model: Embedding -> SimpleRNN(128, relu) ->
        Dense(1, sigmoid), using Adam and binary cross-entropy and reporting
        accuracy.
    """
    model = Sequential()
    model.add(Embedding(
        input_dim=vocab_size,
        # Take the width from the matrix itself so the layer can never
        # disagree with the vectors it is initialised from.
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        # Inputs are post-padded with index 0; masking it keeps the recurrent
        # layer from stepping through the padding before its final state.
        mask_zero=True,
        # The deprecated `input_length` argument is intentionally not passed.
        trainable=False,
    ))
    model.add(SimpleRNN(128, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Define LSTM Model
def create_lstm_model():
    """Build and compile an LSTM binary classifier on frozen embeddings.

    Reads the module-level ``vocab_size`` and ``embedding_matrix``.

    Returns:
        A compiled ``Sequential`` model: Embedding -> LSTM(128, relu) ->
        Dense(1, sigmoid), using Adam and binary cross-entropy and reporting
        accuracy.
    """
    model = Sequential()
    model.add(Embedding(
        input_dim=vocab_size,
        # Take the width from the matrix itself so the layer can never
        # disagree with the vectors it is initialised from.
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        # Inputs are post-padded with index 0; masking it keeps the recurrent
        # layer from stepping through the padding before its final state.
        mask_zero=True,
        # The deprecated `input_length` argument is intentionally not passed.
        trainable=False,
    ))
    model.add(LSTM(128, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Define BiLSTM Model
def create_bilstm_model():
    """Build and compile a bidirectional-LSTM binary classifier on frozen embeddings.

    Reads the module-level ``vocab_size`` and ``embedding_matrix``.

    Returns:
        A compiled ``Sequential`` model: Embedding ->
        Bidirectional(LSTM(128, relu)) -> Dense(1, sigmoid), using Adam and
        binary cross-entropy and reporting accuracy.
    """
    model = Sequential()
    model.add(Embedding(
        input_dim=vocab_size,
        # Take the width from the matrix itself so the layer can never
        # disagree with the vectors it is initialised from.
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        # Inputs are post-padded with index 0; masking it keeps the recurrent
        # layers from stepping through the padding.
        mask_zero=True,
        # The deprecated `input_length` argument is intentionally not passed.
        trainable=False,
    ))
    model.add(Bidirectional(LSTM(128, activation='relu')))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Fit the three architectures one after another and score each on the held-out split.

# --- SimpleRNN ---
rnn_model = create_rnn_model()
print("Training RNN Model...")
rnn_model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_test_pad, y_test),
)

# evaluate() yields the loss followed by the compiled metric; keep the accuracy.
rnn_accuracy = rnn_model.evaluate(x=X_test_pad, y=y_test)[1]
print(f"RNN Model Test Accuracy: {rnn_accuracy * 100:.2f}%")

# --- LSTM ---
lstm_model = create_lstm_model()
print("Training LSTM Model...")
lstm_model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_test_pad, y_test),
)

lstm_accuracy = lstm_model.evaluate(x=X_test_pad, y=y_test)[1]
print(f"LSTM Model Test Accuracy: {lstm_accuracy * 100:.2f}%")

# --- Bidirectional LSTM ---
bilstm_model = create_bilstm_model()
print("Training BiLSTM Model...")
bilstm_model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_test_pad, y_test),
)

bilstm_accuracy = bilstm_model.evaluate(x=X_test_pad, y=y_test)[1]
print(f"BiLSTM Model Test Accuracy: {bilstm_accuracy * 100:.2f}%")

# Side-by-side recap of the three test accuracies.
print("\nTotal Accuracy of Each Model:")
print(f"RNN Accuracy: {rnn_accuracy * 100:.2f}%")
print(f"LSTM Accuracy: {lstm_accuracy * 100:.2f}%")
print(f"BiLSTM Accuracy: {bilstm_accuracy * 100:.2f}%")
