In [1]:
import pandas as pd
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

In [2]:
# Load the dataset
dataset_path = '../Data/lab4_train.csv'
data = pd.read_csv(dataset_path)

# Display the first few rows of the dataset
print(data.head())

     id                                               text  \
0  3121               But the staff was so horrible to us.   
1  2777  To be completely fair, the only redeeming fact...   
2  2777  To be completely fair, the only redeeming fact...   
3  1634  The food is uniformly exceptional, with a very...   
4  2534  Where Gabriela personaly greets you and recomm...   

            aspectCategory  polarity  
0                  service  negative  
1                     food  positive  
2  anecdotes/miscellaneous  negative  
3                     food  positive  
4                  service  positive  


In [3]:
data = data[data['polarity'] != 'conflict']

In [4]:
max_len = 100  # Define the maximum length of sequences

# LSTM preprocessing
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['text'])
X_lstm = tokenizer.texts_to_sequences(data['text'])
X_lstm = pad_sequences(X_lstm, maxlen=max_len)

# LDA preprocessing
vectorizer = CountVectorizer()
X_lda = vectorizer.fit_transform(data['text'])

In [5]:
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100
max_len = 100

# Encode the polarity column
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(data['polarity'])

# Convert encoded labels to one-hot encoding
y = to_categorical(y_encoded)

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(LSTM(128))
model.add(Dense(3, activation='softmax'))  # 3 classes: positive, negative, neutral
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_lstm, y, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10




[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 57ms/step - accuracy: 0.6023 - loss: 0.9562 - val_accuracy: 0.6254 - val_loss: 0.8280
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 53ms/step - accuracy: 0.6928 - loss: 0.6742 - val_accuracy: 0.6572 - val_loss: 0.7261
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 51ms/step - accuracy: 0.8176 - loss: 0.4643 - val_accuracy: 0.6906 - val_loss: 0.7326
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 48ms/step - accuracy: 0.8914 - loss: 0.2961 - val_accuracy: 0.6906 - val_loss: 0.8146
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 49ms/step - accuracy: 0.9231 - loss: 0.1957 - val_accuracy: 0.6388 - val_loss: 0.8865
Epoch 6/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 52ms/step - accuracy: 0.9312 - loss: 0.1900 - val_accuracy: 0.6756 - val_loss: 0.8887
Epoch 7/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x21bb7b86790>

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_lda, y, test_size=0.2, random_state=42)

# Convert one-hot encoded y_train to 1D array
y_train_labels = np.argmax(y_train, axis=1)

# Train logistic regression on LDA features
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train_labels)
y_pred_lr = lr.predict(X_test)

# Convert one-hot encoded y_test to multiclass format
y_test_multiclass = np.argmax(y_test, axis=1)

# Calculate accuracy score
print("Accuracy of Logistic Regression on LDA features:", accuracy_score(y_test_multiclass, y_pred_lr))


Accuracy of Logistic Regression on LDA features: 0.7123745819397993


In [7]:
# Combine predictions from both models
X_stacking = np.hstack((model.predict(X_lstm), lr.predict_proba(X_lda)))

# Train a model on top of the combined predictions
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_stacking, y)

# Evaluate the stacked model
y_pred_stacking = clf.predict(X_stacking)
print("Accuracy of Stacked Model:", accuracy_score(y, y_pred_stacking))


[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step
Accuracy of Stacked Model: 0.9595182335229173


In [8]:
from sklearn.metrics import classification_report

# Generate classification report for the stacked model
print("Classification Report of Stacked Model:")
print(classification_report(y, y_pred_stacking))

Classification Report of Stacked Model:
              precision    recall  f1-score   support

           0       0.94      0.93      0.93       715
           1       0.98      0.92      0.95       398
           2       0.97      0.98      0.97      1876

   micro avg       0.96      0.96      0.96      2989
   macro avg       0.96      0.94      0.95      2989
weighted avg       0.96      0.96      0.96      2989
 samples avg       0.96      0.96      0.96      2989



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
