In [1]:
import numpy as np
import pandas as pd
import os
import pickle
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler

In [10]:
import pandas as pd

# Load your training data
train_data = pd.read_excel("C:/Users/vsing/OneDrive/Desktop/Capstone Project/Capstone_project_Sentiment_Analysis_Aspect_Level/Datasets/Capstone Project Dataset- Restaurent Train 1 category.xlsx")

# Fill missing values in the "Sentence" column with empty strings
train_data["Sentence"] = train_data["Sentence"].fillna("")

# Create a new DataFrame with only the "Sentence," "Category_A," and "Polarity_A" columns
aspect_data = train_data[["Sentence", "Category_A", "Polarity_A"]]

# Rename the columns for clarity
aspect_data.columns = ["Sentence", "Category", "Polarity"]

# Print the first few rows of the new dataset
print(aspect_data.head())

# Load your test and trial data
test_data = pd.read_excel("C:/Users/vsing/OneDrive/Desktop/Capstone Project/Capstone_project_Sentiment_Analysis_Aspect_Level/Datasets/Capstone Project Dataset- Restaurent Test one category.xlsx")
trail_data = pd.read_excel("C:/Users/vsing/OneDrive/Desktop/Capstone Project/Capstone_project_Sentiment_Analysis_Aspect_Level/Datasets/Capstone Project Dataset- Restaurent Trial 1 category.xlsx")

# Fill missing values in the "Sentence" column with empty strings
test_data["Sentence"] = test_data["Sentence"].fillna("")
trail_data["Sentence"] = trail_data["Sentence"].fillna("")

# Create new DataFrames with only the "Sentence," "Category_A," and "Polarity_A" columns
test_aspect_data = test_data[["Sentence", "Category_A", "Polarity_A"]]
trail_aspect_data = trail_data[["Sentence", "Category_A", "Polarity_A"]]

# Rename the columns for clarity
test_aspect_data.columns = ["Sentence", "Category", "Polarity"]
trail_aspect_data.columns = ["Sentence", "Category", "Polarity"]

# Print the first few rows of the new datasets
print("Test Aspect Data:")
print(test_aspect_data.head())

print("Trial Aspect Data:")
print(trail_aspect_data.head())


                                            Sentence       Category  Polarity
0               But the staff was so horrible to us.        service  negative
1  To be completely fair, the only redeeming fact...           food  positive
2  The food is uniformly exceptional, with a very...           food  positive
3  Where Gabriela personaly greets you and recomm...        service  positive
4  For those that go once and don't enjoy it, all...  miscellaneous  positive
Test Aspect Data:
                                            Sentence Category  Polarity
0                    The bread is top notch as well.     food  positive
1  I have to say they have one of the fastest del...  service  positive
2        Food is always fresh and hot- ready to eat!     food  positive
3      Did I mention that the coffee is OUTSTANDING?     food  positive
4  Certainly not the best sushi in New York, howe...     food  positive
Trial Aspect Data:
                                            Sentence       Cate

In [11]:
# Combine the training, test, and trial datasets
all_data = pd.concat([aspect_data, test_aspect_data, trail_aspect_data], ignore_index=True)

# Encode the aspect categories using LabelEncoder
label_encoder_category = LabelEncoder()
all_data['Category'] = label_encoder_category.fit_transform(all_data['Category'])

# Encode sentiment (Polarity) using LabelEncoder
label_encoder_sentiment = LabelEncoder()
all_data['Polarity'] = label_encoder_sentiment.fit_transform(all_data['Polarity'])

# Create dictionaries to map encoded labels back to their original values
category_mapping = {label: category for label, category in enumerate(label_encoder_category.classes_)}
sentiment_mapping = {label: sentiment for label, sentiment in enumerate(label_encoder_sentiment.classes_)}

# Print the mapping information
print("Aspect Category Mapping:")
print(category_mapping)
print("\nSentiment (Polarity) Mapping:")
print(sentiment_mapping)

# Tokenize the text
max_words = 10000  # You can adjust this based on your dataset
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(all_data['Sentence'])

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(all_data['Sentence'])

# Pad sequences to make them of the same length
max_sequence_length = 63  # You can adjust this based on your dataset
sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post', truncating='post')

# Split the data back into train, test, and trial datasets
train_sequences = sequences[:len(aspect_data)]
test_sequences = sequences[len(aspect_data):len(aspect_data) + len(test_aspect_data)]
trial_sequences = sequences[len(aspect_data) + len(test_aspect_data):]

# Prepare the target labels
train_labels_category = all_data['Category'][:len(aspect_data)]
test_labels_category = all_data['Category'][len(aspect_data):len(aspect_data) + len(test_aspect_data)]
trial_labels_category = all_data['Category'][len(aspect_data) + len(test_aspect_data):]


# Prepare the target sentiment labels
train_labels_sentiment = all_data['Polarity'][:len(aspect_data)]
test_labels_sentiment = all_data['Polarity'][len(aspect_data):len(aspect_data) + len(test_aspect_data)]
trial_labels_sentiment = all_data['Polarity'][len(aspect_data) + len(test_aspect_data):]


Aspect Category Mapping:
{0: 'ambience', 1: 'food', 2: 'miscellaneous', 3: 'price', 4: 'service'}

Sentiment (Polarity) Mapping:
{0: 'negative', 1: 'neutral', 2: 'positive'}


In [13]:
# Initialize a variable to store the maximum sequence length
max_sequence_length = 0

# Iterate through the sentences in your dataset
for sentence in all_data['Sentence']:
    # Tokenize the sentence
    tokens = tokenizer.texts_to_sequences([sentence])[0]

    # Update max_sequence_length if the current sentence is longer
    if len(tokens) > max_sequence_length:
        max_sequence_length = len(tokens)

print("Maximum Sequence Length:", max_sequence_length)


Maximum Sequence Length: 63


BILSTM layers for Category and Run the Epochs


In [14]:
# Define the embedding dimension
embedding_dim = 100  # You can adjust this based on your dataset
# Create the Bi-LSTM model
model = Sequential()

# Add an Embedding layer
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(Dropout(0.2))  # Add dropout after embedding

# Add a Bidirectional LSTM layer with dropout
model.add(Bidirectional(LSTM(64, return_sequences=True, kernel_regularizer=l2(0.01))))
model.add(BatchNormalization())
model.add(Dropout(0.2))  # Add dropout after the first LSTM layer
model.add(Bidirectional(LSTM(64, kernel_regularizer=l2(0.01))))
model.add(BatchNormalization())
model.add(Dropout(0.2))  # Add dropout after the second LSTM layer

# Add a Dense layer for aspect classification
num_aspect_categories = len(label_encoder_category.classes_)
model.add(Dense(num_aspect_categories, activation='softmax', kernel_regularizer=l2(0.01)))

# Compile the model with a custom learning rate
custom_optimizer = Adam(learning_rate=0.001)  
model.compile(loss='sparse_categorical_crossentropy', optimizer=custom_optimizer, metrics=['accuracy'])

# Implement early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model with learning rate scheduling and early stopping
def lr_schedule(epoch):
    if epoch < 5:
        return 0.001
    else:
        return 0.0001

learning_rate_scheduler = LearningRateScheduler(lr_schedule)
model.fit(train_sequences, train_labels_category, validation_data=(test_sequences, test_labels_category),
          epochs=100, batch_size=32, callbacks=[early_stopping, learning_rate_scheduler])


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100


<keras.src.callbacks.History at 0x182b8318210>

Save the Category Model and Toknizer

In [15]:
import pickle
import os

# Define the directory where you want to save the model and tokenizer
save_directory = 'save_model_category_2'  # Change this to your desired folder path

# Ensure the directory exists, create it if not
os.makedirs(save_directory, exist_ok=True)

# Save the Keras model in the specified directory
model.save(os.path.join(save_directory, 'my_category_model.h5'))

# Save the tokenizer in the specified directory
with open(os.path.join(save_directory, 'tokenizer.pickle'), 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

  saving_api.save_model(


Apply on testing dataset

In [16]:
import pandas as pd
from tensorflow.keras.models import load_model
import pickle
from sklearn import metrics
from sklearn.metrics import classification_report
import os


# Define the directory where your model and tokenizer are saved
load_directory = 'save_model_category_2'  # Change this to the directory path

# Load the Keras model
loaded_model = load_model(os.path.join(load_directory, 'my_category_model.h5'))

# Load the tokenizer
with open(os.path.join(load_directory, 'tokenizer.pickle'), 'rb') as handle:
    loaded_tokenizer = pickle.load(handle)


In [17]:
# Load your test data (assuming you have a DataFrame with a "Sentence" column)
test_data = pd.read_excel("C:/Users/vsing/OneDrive/Desktop/Capstone Project/Capstone_project_Sentiment_Analysis_Aspect_Level/Datasets/Capstone Project Dataset- Restaurent Test one category.xlsx")  # Replace with the actual path

# Preprocess the test data for prediction (tokenization and padding)
max_sequence_length = 63  # Same as in your training code
sequences = tokenizer.texts_to_sequences(test_data['Sentence'])
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post', truncating='post')

# Make predictions
predictions = model.predict(padded_sequences)
predicted_labels = predictions.argmax(axis=1)

# Convert the predicted labels back to their original category values using the mapping
predicted_categories = [category_mapping[label] for label in predicted_labels]

# Add the predicted categories to the DataFrame
test_data['Predicted_Category'] = predicted_categories

# Save the results to an Excel file
output_file_path = 'Category_predicted_output.xlsx'  # Replace with the desired output file path
test_data.to_excel(output_file_path, index=False)




In [18]:
# Actual categories
actual_categories = test_data['Category_A']

# Create a classification report to calculate precision, recall, and F1 score
classification_report_result = classification_report(actual_categories, predicted_categories, target_names=category_mapping.values())

# Print the classification report
print("Classification Report for Test Dataset:")
print(classification_report_result)

# Category Analysis Metrics (instead of Sentiment)
accuracy_category = metrics.accuracy_score(actual_categories,predicted_categories)
precision_category = metrics.precision_score(actual_categories, predicted_categories, average='weighted')
recall_category = metrics.recall_score(actual_categories, predicted_categories, average='weighted')
f1_category = metrics.f1_score(actual_categories, predicted_categories, average='weighted')

print("\nCategory Analysis Metrics:")
print("Accuracy:", accuracy_category)
print("Precision:", precision_category)
print("Recall:", recall_category)
print("F1 Score:", f1_category)

Classification Report for Test Dataset:
               precision    recall  f1-score   support

     ambience       0.75      0.46      0.57        84
         food       0.84      0.77      0.81       367
miscellaneous       0.67      0.77      0.72       195
        price       0.56      0.27      0.37        33
      service       0.55      0.78      0.65       120

     accuracy                           0.72       799
    macro avg       0.68      0.61      0.62       799
 weighted avg       0.74      0.72      0.72       799


Category Analysis Metrics:
Accuracy: 0.7221526908635795
Precision: 0.7367312054360522
Recall: 0.7221526908635795
F1 Score: 0.7193281874314086


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Calculate the confusion matrix
confusion_matrix_result = confusion_matrix(actual_categories, predicted_categories)

# Visualize the confusion matrix as a heatmap
plt.figure(figsize=(6, 5))
sns.heatmap(confusion_matrix_result, annot=True, fmt='d', cmap='Blues', xticklabels=category_mapping.values(), yticklabels=category_mapping.values())
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for Category Prediction')
plt.show()

# Print the confusion matrix as a DataFrame
confusion_df = pd.DataFrame(confusion_matrix_result, index=category_mapping.values(), columns=category_mapping.values())
print("Confusion Matrix for Aspect Category Prediction:")
print(confusion_df)


Predict the Category on Random Sentence

In [20]:
# Define a new sentence you want to predict
new_sentence = "this pizza is great"

# Preprocess the new sentence
new_sentence = [new_sentence]  # Convert to a list for consistency with previous data
new_sequences = tokenizer.texts_to_sequences(new_sentence)
new_sequences = pad_sequences(new_sequences, maxlen=max_sequence_length, padding='post', truncating='post')

# Make predictions using the loaded model
predicted_category_label = model.predict(new_sequences)
predicted_category_label = np.argmax(predicted_category_label)

# Convert the predicted label back to the original category
predicted_category = category_mapping[predicted_category_label]

# Print the predicted category
print(f"Sentence: {new_sentence}")
print(f'Predicted Aspect Category: {predicted_category}')


Sentence: ['this pizza is great']
Predicted Aspect Category: food
