In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import nltk

In [None]:
# Download the NLTK resources used by the text-cleaning step:
#   punkt / punkt_tab -> tokenizer models behind word_tokenize
#   stopwords         -> English stop-word list
#   wordnet           -> lexicon behind WordNetLemmatizer
# 'punkt_tab' is the resource word_tokenize looks up on NLTK >= 3.8.2, where it
# replaced the pickled 'punkt' models; 'punkt' is still fetched for older installs.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# google.colab is only importable inside a Colab runtime, so this import stays
# with the cell that needs it.
from google.colab import drive

# Mount Google Drive at /content/drive; the dataset is read from under it below.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the review dataset on the mounted Drive.
reviews_csv_path = '/content/drive/MyDrive/singapore_airlines_reviews.csv'
data = pd.read_csv(reviews_csv_path)

In [None]:
# Module-level WordNet lemmatizer and English stop-word set, built once per kernel.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [None]:
# Function for data cleaning and lemmatization
def clean_and_lemmatize(text):
    """Turn one raw review into a space-separated string of lemmatized tokens.

    Pipeline: strip ``string.punctuation`` characters, lowercase, tokenize with
    ``word_tokenize``, keep purely alphabetic tokens that are not stop words,
    and lemmatize what remains.

    The module-level ``stop_words`` set and ``lemmatizer`` (created in the cell
    before this one) are reused, so neither is rebuilt for every row.

    Args:
        text: Raw review text. Anything that is not a ``str`` (for example the
            float NaN pandas stores for a missing cell) is treated as empty.

    Returns:
        str: The kept lemmas joined by single spaces; ``''`` if none survive.
    """
    # Missing / non-text cells would otherwise crash on .translate().
    if not isinstance(text, str):
        return ''

    # Punctuation is deleted outright, not replaced by a space
    # (so "don't" becomes "dont" before tokenization).
    without_punct = text.translate(str.maketrans('', '', string.punctuation))

    # Lowercase first so stop-word matching is case-insensitive.
    tokens = word_tokenize(without_punct.lower())

    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    )


In [None]:
# Run the cleaner on every value of the raw 'text' column and store the result
# in a new 'cleaned_text' column (the original column is left as-is).
data['cleaned_text'] = data['text'].apply(clean_and_lemmatize)

In [None]:
# Preview the first rows instead of printing the whole DataFrame; as the last
# expression of the cell it is rendered with the notebook's rich table display.
data.head()

                 published_date published_platform  rating    type  \
0     2024-03-12T14:41:14-04:00            Desktop       3  review   
1     2024-03-11T19:39:13-04:00            Desktop       5  review   
2     2024-03-11T12:20:23-04:00            Desktop       1  review   
3     2024-03-11T07:12:27-04:00            Desktop       5  review   
4     2024-03-10T05:34:18-04:00            Desktop       2  review   
...                         ...                ...     ...     ...   
9995  2018-08-06T03:48:21-04:00            Desktop       5  review   
9996  2018-08-05T22:50:29-04:00             Mobile       5  review   
9997  2018-08-05T22:47:06-04:00            Desktop       5  review   
9998  2018-08-05T20:32:03-04:00            Desktop       4  review   
9999  2018-08-05T20:19:51-04:00            Desktop       4  review   

                                                   text  \
0     We used this airline to go from Singapore to L...   
1     The service on Singapore Airlines S

In [None]:
# Model input: the cleaned review text. Target: the 'rating' column.
X = data['cleaned_text']
y = data['rating']

In [None]:
# Text-vectorisation hyperparameters.
max_words = 10000      # vocabulary cap: Tokenizer(num_words=...) and Embedding input size
max_length = 100       # length every sequence is padded/truncated to
embedding_dim = 100    # size of each embedding vector

# Fit the vocabulary on training rows only, so no test-set text influences the
# word index. Splitting the row positions with the same test_size and
# random_state as the train_test_split cell further down selects the same rows;
# keep the two calls in sync if either value changes.
train_rows, test_rows = train_test_split(np.arange(len(X)), test_size=0.2, random_state=42)

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X.iloc[train_rows])

# Every review (train and test) is still encoded, using the train-only vocabulary.
sequences = tokenizer.texts_to_sequences(X)

In [None]:
# Bring every token-id sequence to max_length entries.
# Note: this rebinds X from the text Series to the padded matrix, so the
# tokenizer cell above cannot be re-run after this one without re-creating X.
X = pad_sequences(sequences, maxlen=max_length)

In [None]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# RNN Model

In [None]:
# Two stacked LSTM layers over learned embeddings, ending in a 6-way softmax.
# NOTE(review): 6 output units with labels taken from data['rating'] — presumably
# ratings run 1-5, leaving index 0 unused; confirm before changing the width.
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=max_length))
model.add(LSTM(64, return_sequences=True))  # full sequence out, consumed by the next LSTM
model.add(Dropout(0.2))
model.add(LSTM(64))
model.add(Dropout(0.2))
model.add(Dense(6, activation='softmax'))

In [None]:
# Integer class labels -> sparse categorical cross-entropy; track accuracy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Stop once validation loss has not improved for 3 epochs and roll back to the
# best weights seen; 20% of the training rows are held out for validation.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


In [None]:
# Score the LSTM model on the held-out split. evaluate() is unpacked into the
# loss and the single metric requested at compile time (accuracy).
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.6485000252723694


# CNN Model

In [None]:
# 1-D convolutional classifier over the same embeddings and inputs as the LSTM
# model, ending in the same 6-way softmax.
model1 = Sequential()
model1.add(Embedding(max_words, embedding_dim, input_length=max_length))
model1.add(Conv1D(128, 5, activation='relu'))
model1.add(MaxPooling1D(pool_size=4))
model1.add(Flatten())  # collapse the pooled feature maps into one vector per review
model1.add(Dense(128, activation='relu'))
model1.add(Dropout(0.5))
model1.add(Dense(6, activation='softmax'))


In [None]:
# Same optimizer, loss and metric as the LSTM model so the two are comparable.
model1.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Same training setup as the LSTM run. Note that `early_stopping` and `history`
# are rebound here, so the LSTM run's history object is no longer reachable
# under that name after this cell.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)
history = model1.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping],
)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


In [None]:
# Score the CNN model on the same held-out split. `loss` and `accuracy` are
# rebound, replacing the values from the LSTM evaluation.
loss, accuracy = model1.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.6520000100135803
