In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
import json
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dense, TimeDistributed, Dropout
from keras.preprocessing.text import Tokenizer
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
import gensim.downloader as api

In [3]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [4]:
word_vectors = api.load('glove-wiki-gigaword-200')



In [5]:
import nltk
import pandas as pd
import string
from nltk.corpus import stopwords

def process_data(data):
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)

    tokens_list = []
    aspects_list = []
    bio_tags_list = []
    token_length_list = []
    pos_tags_list = []

    for entry in data:
        tokens = [token for token in entry["token"] if token not in punctuation]
        aspects = entry.get("aspects", [])
        pos_tags = nltk.pos_tag(tokens)

        unique_aspects = []
        bio_tags = ['O'] * len(tokens)

        for aspect in aspects:
            term = [char for char in aspect.get("term", []) if char not in punctuation]
            unique_aspects.append(''.join(term))

            for i in range(len(tokens)):
                if tokens[i:i + len(term)] == term:
                    if i > 0 and bio_tags[i - 1] == 'B':
                        bio_tags[i] = 'I'
                    else:
                        bio_tags[i] = 'B'
                    if len(term) > 1:
                        bio_tags[i + 1:i + len(term)] = ['I'] * (len(term) - 1)

        filtered_tokens = tokens
        filtered_bio_tags = bio_tags
        filtered_pos_tags = [tag[1] for tag in pos_tags]

        tokens_list.append(filtered_tokens)
        aspects_list.append(unique_aspects)
        bio_tags_list.append(filtered_bio_tags)
        token_length_list.append([len(token) for token in filtered_tokens])
        pos_tags_list.append(filtered_pos_tags)

    df_result = pd.DataFrame({
        'token': tokens_list,
        'aspect': aspects_list,
        'bioTag': bio_tags_list,
        'pos': pos_tags_list,  # Add POS tags as a new column
    })

    return df_result


In [6]:
# Load training data from JSON file
train_json_file_path = '/content/drive/MyDrive/Common files/Common files/Dataset/Laptops/train.json'
with open(train_json_file_path, 'r') as file:
    train_data = json.load(file)
test_json_file_path = '/content/drive/MyDrive/Common files/Common files/Dataset/Laptops/test.json'
with open(test_json_file_path, 'r') as file:
    test_data = json.load(file)
valid_json_file_path = '/content/drive/MyDrive/Common files/Common files/Dataset/Laptops/valid.json'
with open(valid_json_file_path, 'r') as file:
    valid_data = json.load(file)

In [7]:
df_train = process_data(train_data)
df_test = process_data(test_data)
df_valid = process_data(valid_data)

In [19]:
import numpy as np
import pandas as pd
import string
from nltk.corpus import stopwords
import nltk
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Embedding, Bidirectional, LSTM, Dense, TimeDistributed, Dropout, Input, Concatenate, Dot, Activation, multiply
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
from gensim.models import KeyedVectors

# Adjust tokenizer with a fixed maximum vocabulary size
MAX_VOCAB_SIZE = 5000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token='<OOV>')
tokenizer.fit_on_texts(df_train['token'])
tokenizer.fit_on_texts(df_train['pos'])

# Set a fixed maximum sequence length
max_seq_length = 200
tag_to_index = {'O': 0, 'B': 1, 'I': 2,'PAD':3}
# Pad sequences with the fixed maximum sequence length
X_train = pad_sequences(tokenizer.texts_to_sequences(df_train['token']), maxlen=max_seq_length, padding='post', truncating='post')
X_test = pad_sequences(tokenizer.texts_to_sequences(df_test['token']), maxlen=max_seq_length, padding='post', truncating='post')
X_val = pad_sequences(tokenizer.texts_to_sequences(df_valid['token']), maxlen=max_seq_length, padding='post', truncating='post')

# Pad POS sequences with the fixed maximum sequence length
pos_vocab_size = len(tokenizer.word_index) + 1  # Vocabulary size for POS tags
X_train_pos = pad_sequences(tokenizer.texts_to_sequences(df_train['pos']), maxlen=max_seq_length, padding='post', truncating='post')
X_test_pos = pad_sequences(tokenizer.texts_to_sequences(df_test['pos']), maxlen=max_seq_length, padding='post', truncating='post')
X_val_pos = pad_sequences(tokenizer.texts_to_sequences(df_valid['pos']), maxlen=max_seq_length, padding='post', truncating='post')

# Pad BIO tag sequences with the fixed maximum sequence length
y_train = pad_sequences([[tag_to_index[tag] for tag in seq] for seq in df_train['bioTag']], padding='post', value=3, maxlen=max_seq_length)
y_test = pad_sequences([[tag_to_index[tag] for tag in seq] for seq in df_test['bioTag']], padding='post', value=3, maxlen=max_seq_length)
y_val = pad_sequences([[tag_to_index[tag] for tag in seq] for seq in df_valid['bioTag']], padding='post', value=3, maxlen=max_seq_length)

# Convert numerical representations to one-hot encoding
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)
y_val = to_categorical(y_val)

embedding_dim = 200  # Adjust based on the GloVe file you downloaded
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]

# Import necessary libraries
from keras.preprocessing.image import ImageDataGenerator
from keras.regularizers import l2
from sklearn.utils.class_weight import compute_class_weight


# Compute class weights
# Compute class weights
# Compute class weights
# Calculate class weights manually





# Define L2 regularization strength
l2_reg = 0.01

# Update model architecture with regularization and class weights
def create_model_with_position_embeddings(embedding_matrix, max_seq_length, max_pos_length):
    # Input layers
    token_input_layer = Input(shape=(max_seq_length,))
    pos_input_layer = Input(shape=(max_seq_length,))

    # Word embedding layer
    word_embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                                     output_dim=embedding_matrix.shape[1],
                                     input_length=max_seq_length,
                                     weights=[embedding_matrix],
                                     trainable=True, mask_zero=True)
    token_embedding = word_embedding_layer(token_input_layer)

    # Positional embedding layer
    pos_embedding_layer = Embedding(input_dim=pos_vocab_size,
                                    output_dim=50,
                                    input_length=max_seq_length)
    pos_embedding = pos_embedding_layer(pos_input_layer)

    # Concatenate token embeddings and positional embeddings
    combined_embedding = Concatenate(axis=-1)([token_embedding, pos_embedding])

    # Dropout layer
    dropout_emb = Dropout(0.5)(combined_embedding)

    # Regularization layer
    reg_emb = TimeDistributed(Dense(512, activation='relu', kernel_regularizer=l2(l2_reg)))(dropout_emb)

    # Attention Mechanism
    attention = TimeDistributed(Dense(512, activation='relu'))(reg_emb)
    attention = TimeDistributed(Dense(1))(attention)
    attention = TimeDistributed(Dense(128, activation='relu'))(attention)
    attention = TimeDistributed(Dense(1))(attention)
    attention = Activation('softmax')(attention)

    # Apply attention weights
    sent_representation = multiply([reg_emb, attention])

    # Bidirectional LSTM layer
    lstm = Bidirectional(LSTM(units=100, return_sequences=True))(sent_representation)

    # Self-Attention Mechanism
    self_attention = Dot(axes=[2, 2])([lstm, lstm])
    self_attention = Activation('softmax')(self_attention)

    # Combine attention and self-attention
    combined_attention = Concatenate(axis=-1)([lstm, self_attention])

    # Feedforward layers with regularization
    output = TimeDistributed(Dense(128, activation='tanh', kernel_regularizer=l2(l2_reg)))(combined_attention)
    output = Dropout(0.1)(output)
    output = TimeDistributed(Dense(4, activation='softmax'))(output)  # Assuming 3 classes: 'O', 'B', 'I'

    # Create model
    model = Model(inputs=[token_input_layer, pos_input_layer], outputs=output)

    return model


In [20]:
# Define static class weights
static_class_weights = {0: 1, 1: 3, 2: 3}  # Adjust weights based on the classification report

# Create and compile the model with position embeddings
model_with_position_embeddings = create_model_with_position_embeddings(embedding_matrix, max_seq_length, len(tag_to_index))
model_with_position_embeddings.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# Re-run the training process with the padded target labels and static class weights
model_with_position_embeddings.fit([X_train, X_train_pos], y_train, validation_data=([X_val, X_val_pos], y_val), epochs=50, batch_size=32,callbacks=[early_stopping])


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50


<keras.src.callbacks.History at 0x7dc7f5145060>

In [22]:
from sklearn.metrics import classification_report, accuracy_score

# Make predictions on the test set
y_pred = model_with_position_embeddings.predict([X_test, X_test_pos])

# Convert predictions and true labels from one-hot encoding to labels
y_pred_labels = np.argmax(y_pred, axis=-1)
y_test_labels = np.argmax(y_test, axis=-1)

# Flatten the predictions and true labels to prepare for classification_report
y_pred_labels_flat = y_pred_labels.flatten()
y_test_labels_flat = y_test_labels.flatten()

# Generate classification report
class_names = ['O', 'B', 'I']
# Filter out 'PAD' tag from predictions and true labels
non_pad_indices = y_test_labels_flat != tag_to_index['PAD']
y_pred_labels_filtered = y_pred_labels_flat[non_pad_indices]
y_test_labels_filtered = y_test_labels_flat[non_pad_indices]

# Generate classification report
report = classification_report(y_test_labels_filtered, y_pred_labels_filtered, target_names=class_names)


# Print classification report
print("Classification Report (excluding PAD tag):")
print(report)

# Calculate accuracy excluding 'PAD' tag
accuracy = accuracy_score(y_test_labels_filtered, y_pred_labels_filtered)
print("Accuracy (excluding PAD tag):", accuracy)


Classification Report (excluding PAD tag):
              precision    recall  f1-score   support

           O       0.95      0.98      0.97      4776
           B       0.86      0.75      0.80       623
           I       0.80      0.70      0.75       425

    accuracy                           0.94      5824
   macro avg       0.87      0.81      0.84      5824
weighted avg       0.93      0.94      0.93      5824

Accuracy (excluding PAD tag): 0.9361263736263736
