In [8]:
import pandas as pd
import numpy as np
import spacy
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [9]:
# Load the dataset
data = pd.read_csv('../Data/lab4_train.csv')

# Drop the 'conflict' polarity class
data = data[data['polarity'] != 'conflict']

# Tokenize text data
tokenizer = Tokenizer(num_words=100000)  # Maximum words to keep based on frequency
tokenizer.fit_on_texts(data['text'])
sequences = tokenizer.texts_to_sequences(data['text'])

# Pad sequences to a maximum length
maxlen = 100  # Define the maximum sequence length
X = pad_sequences(sequences, maxlen=maxlen)

# Convert target labels to one-hot encoded format if needed
y = pd.get_dummies(data['polarity'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Define the LSTM model architecture
model = Sequential()
model.add(Embedding(input_dim=100000, output_dim=128, input_shape=(100,)))  
model.add(LSTM(128))
model.add(Dense(3, activation='softmax'))  

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

  super().__init__(**kwargs)


In [11]:
# Train the LSTM model
model.fit(X_train, y_train, batch_size=32, epochs=5, validation_data=(X_test, y_test))

Epoch 1/5
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 244ms/step - accuracy: 0.6158 - loss: 0.9489 - val_accuracy: 0.6171 - val_loss: 0.8097
Epoch 2/5
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 244ms/step - accuracy: 0.6837 - loss: 0.7008 - val_accuracy: 0.6823 - val_loss: 0.7113
Epoch 3/5
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 242ms/step - accuracy: 0.8464 - loss: 0.4071 - val_accuracy: 0.7040 - val_loss: 0.7281
Epoch 4/5
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 242ms/step - accuracy: 0.9157 - loss: 0.2597 - val_accuracy: 0.6973 - val_loss: 0.9181
Epoch 5/5
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 243ms/step - accuracy: 0.9239 - loss: 0.2134 - val_accuracy: 0.7157 - val_loss: 1.0170


<keras.src.callbacks.history.History at 0x20a78cd8550>

In [12]:
# Extract text data 
text_data = data['text'].tolist()

# Function to extract potential aspects from text data using spaCy
def extract_aspects_spacy(text_data):
    aspects = []
    for text in text_data:
        doc = nlp(text)
        text_aspects = [ent.text for ent in doc.ents if ent.label_ == 'NORP' or ent.label_ == 'PRODUCT']
        aspects.append(text_aspects)
    return aspects

# Extract potential aspects using spaCy entity recognition
aspects_spacy = extract_aspects_spacy(text_data)

In [13]:
# Define the aspects extracted using spaCy entity recognition
aspects_spacy = [
    ['service'],
    ['food'],
    ['anecdotes/miscellaneous'],
    ['price'],
    ['ambience']
]

# Function to identify descriptors associated with each aspect
def identify_descriptors(aspects, text_data):
    all_descriptors = []
    for aspect_list in aspects:
        aspect_descriptors = []
        for aspect in aspect_list:
            descriptors = set()
            for text in text_data:
                doc = nlp(text)
                for token in doc:
                    if token.text.lower() == aspect:
                        for child in token.children:
                            if child.dep_ in ['amod', 'advmod']:  # Consider adjectival modifiers and adverbial modifiers
                                descriptors.add(child.text)
            aspect_descriptors.append(list(descriptors))
        all_descriptors.append(aspect_descriptors)
    return all_descriptors

# Identify descriptors associated with each aspect in your dataset
aspect_descriptors = identify_descriptors(aspects_spacy, text_data)

In [14]:
# Function to combine text data, aspects, and descriptors into a format suitable for input to the LSTM model
def combine_data(text_data, aspects, aspect_descriptors):
    combined_data = []
    for i in range(min(len(text_data), len(aspects), len(aspect_descriptors))):
        aspect_str = ' '.join(map(str, aspects[i]))
        descriptor_str = ' '.join(map(str, aspect_descriptors[i]))
        combined_instance = text_data[i] + ' ' + aspect_str + ' ' + descriptor_str
        combined_data.append(combined_instance)
    return combined_data

# Combine text data, aspects, and descriptors
combined_data = combine_data(text_data, aspects_spacy, aspect_descriptors)

# Prepare input data for aspect sentiment prediction
aspect_text_sequences = tokenizer.texts_to_sequences(combined_data)
X_combined = pad_sequences(aspect_text_sequences, maxlen=maxlen)

# Obtain sentiment predictions for each aspect using the trained LSTM model
aspect_sentiments = model.predict(X_combined)

# Extract Aspect Sentiments
def analyze_aspect_sentiments(aspects, aspect_descriptors, aspect_sentiments):
    aspect_sentiments_analysis = {}
    for i, aspect_list in enumerate(aspects):
        for j, aspect in enumerate(aspect_list):
            descriptors = aspect_descriptors[i][j]
            sentiment_scores = aspect_sentiments

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 190ms/step


In [15]:
# Prepare input data for overall sentiment analysis
overall_sequences = tokenizer.texts_to_sequences(text_data)
X_overall = pad_sequences(overall_sequences, maxlen=maxlen)

# Obtain overall sentiment predictions using the trained LSTM model
overall_sentiments = model.predict(X_overall)

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)

print(f"Accuracy: {accuracy}")

[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.7186 - loss: 1.0393

Model Accuracy on Test Set:
Loss: 1.0170024633407593
Accuracy: 0.7157190442085266


In [16]:
# Convert one-hot encoded y_test back to categorical labels
y_test_labels = [y.columns[np.argmax(label)] for label in y_test.to_numpy()]

# Convert the model predictions to categorical labels
y_pred_labels = [y.columns[np.argmax(pred)] for pred in model.predict(X_test)]

# Create the classification report
report = classification_report(y_test_labels, y_pred_labels)

# Print the classification report
print("Classification Report:")
print(report)

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
Classification Report:
              precision    recall  f1-score   support

    negative       0.63      0.53      0.58       154
     neutral       0.44      0.29      0.35        76
    positive       0.78      0.88      0.82       368

    accuracy                           0.72       598
   macro avg       0.62      0.57      0.58       598
weighted avg       0.70      0.72      0.70       598

