In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical 
import tensorflow as tf

# Step 1: Data Preprocessing
# Load the dataset
df = pd.read_csv("train_without_duplicates.csv")
# df.append(pd.read_csv("validate_without_duplicates.csv"))

# df = pd.read_csv("train_without_duplicates.csv")

# Tokenize the reviews
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Review'])
sequences = tokenizer.texts_to_sequences(df['Review'])

# Define a ModelCheckpoint callback to save the best model based on validation accuracy
checkpoint = ModelCheckpoint("model_without_duplicates.h5", monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

# Convert labels to categorical format
labels = {'PRED': 1, 'PROD1': 2, 'ASP': 3, 'PROD2': 4}
count = 0
labels_numbered = []
for _, row in df.iterrows():
    m_len = len(sequences[count])+2
    output_sequence = [0] * m_len
    for label, indices in eval(row['Label']).items():

        # Set the values according to the class
        for index in indices:
            output_sequence[index] = labels[label]
            
    labels_numbered.append(output_sequence)
    count+=1

# Pad sequences and handle null entries
max_length = max(len(seq) for seq in sequences)
sequences_padded = pad_sequences(sequences, maxlen=max_length, padding='post')
labels_padded = pad_sequences(labels_numbered, maxlen=max_length, padding='post')

# Step 2: Define the LSTM Model
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(LSTM(units=100, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(Dense(units=5, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# # Step 3: Training the Model
X = sequences_padded
y = labels_padded

model.fit(X, y, epochs=10, batch_size=32, validation_split=0.2, callbacks=[checkpoint])


# Step 4: Evaluation (if needed)
# Evaluate the model on a separate validation set or use cross-validation techniques
# Load and preprocess the validation dataset
validation_df = pd.read_csv("test_without_duplicates.csv")  # Replace "validation_data.csv" with your validation dataset file name
validation_sequences = tokenizer.texts_to_sequences(validation_df['Review'])
max_length = max(len(seq) for seq in validation_sequences)
validation_sequences_padded = pad_sequences(validation_sequences, maxlen=max_length, padding='post')

validation_labels = {'PRED': 1, 'PROD1': 2, 'ASP': 3, 'PROD2': 4}
count = 0
validation_labels_numbered = []
for _, row in validation_df.iterrows():
    # m_len = len(validation_sequences[count])+5
    m_len = len(validation_sequences_padded[count])
    output_sequence = [0] * m_len
    for label, indices in eval(row['Label']).items():
        # Set the values according to the class
        for index in indices:
            output_sequence[index] = labels[label]
            
    validation_labels_numbered.append(output_sequence)
    count+=1


validation_labels_padded = pad_sequences(validation_labels_numbered, maxlen=max_length, padding='post')

X_val = validation_sequences_padded
y_val = validation_labels_padded

# Evaluate the model
loss, accuracy = model.evaluate(X_val, y_val)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)
##################################################################################################################
# Step 5: Prediction
# Load the saved model
from keras.models import load_model

# Load the saved model
loaded_model = load_model("model_without_duplicates.h5")


# Example sentence to predict SRL labels
example_sentences = ["it is much less expensive than a router also",
                    "good price  same performance as sony","i purchased this to replace a less expensive headset i got from dell",
                    "this is an excellent camera from sony",
                     "the picture cd  while lower total resolution  was far sharper than anything the 8400f could produce at any size",
"i have used the canon cartridges for many years and thought i would try to save money buy purchasing this less expensive brand which has been very disappointing",
"this camera  though a bit less known than those other big-brand versions is and was one of the best qualtiy cameras i've used",
"canon has better sound with video recording  less noise  better detail in shadows",
"this detector is basically the same as the escort passport 8500  but less expensive",
"this is a great camera and has many unique features usually found on more expensive or larger digital cameras",
"yet this camera is less expensive than the newest model",
"the battery life is good and i always believe in genuine replacements from the manufacturer even if they are slightly more expensive",
"oh  it has two settings  the 1500watt and the 900 watt and the fan is the same speed on both",
"usps is so much faster than ups",
"the card works great and the quality is great and the price is even greater",
"my computer is working so much faster now",
"they still read in it's directory as  shn files this device is extremely versatile and worth far more than the low price amazon's asking",
"sure  you could spend more and get an ipod  but why would you want to when this device is so much less expensive",
"this model's oe radio is not as deep as the newer aftermarket radios",
"no other camera has the same range of features for the same price",
"the radio reception is great  and much better than cheap digital tuner walkmans   weighs next to nothing and comes with a convenient plastic proective case",
"the camera creates a larger file through interpolation  but the 6mp image is actually of worse resolution than 3 mp image",
"i understand what it takes to make a good stereo  and i know what value there is in the more expensive kits"]

for example_sentence in example_sentences:
    # Tokenize the example sentence
    example_sequence = tokenizer.texts_to_sequences([example_sentence])

    # Pad the sequence to ensure it has the same length as the sequences used during training
    example_sequence_padded = pad_sequences(example_sequence, maxlen=max_length, padding='post')

    # Make prediction
    predicted_labels = np.argmax(loaded_model.predict(example_sequence_padded),axis=-1)
    # predicted_labels = loaded_model.predict(example_sequence_padded)
    print(predicted_labels)
    # print(predicted_labels.shape)

2024-03-23 17:19:38.590617: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-23 17:19:40.648919: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2024-03-23 17:19:40.833452: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2245900000 Hz


Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.89525, saving model to model_without_duplicates.h5
Epoch 2/10

Epoch 00002: val_accuracy improved from 0.89525 to 0.90861, saving model to model_without_duplicates.h5
Epoch 3/10

Epoch 00003: val_accuracy improved from 0.90861 to 0.91008, saving model to model_without_duplicates.h5
Epoch 4/10

Epoch 00004: val_accuracy improved from 0.91008 to 0.91028, saving model to model_without_duplicates.h5
Epoch 5/10

Epoch 00005: val_accuracy improved from 0.91028 to 0.91589, saving model to model_without_duplicates.h5
Epoch 6/10

Epoch 00006: val_accuracy did not improve from 0.91589
Epoch 7/10

Epoch 00007: val_accuracy did not improve from 0.91589
Epoch 8/10

Epoch 00008: val_accuracy did not improve from 0.91589
Epoch 9/10

Epoch 00009: val_accuracy did not improve from 0.91589
Epoch 10/10

Epoch 00010: val_accuracy did not improve from 0.91589
Test Loss: 0.5480355620384216
Test Accuracy: 0.854314386844635
[[2 0 1 1 3 0 0 4 0 0 0 