In [None]:
import pandas as pd
import json

# Path to the MAMS training split (JSON list of entries with "token" and "aspects")
json_file_path = '/content/drive/MyDrive/Common files/Dataset/MAMS/train.json'

# Read data from JSON file; JSON is UTF-8 by specification, so do not rely on
# the platform default encoding.
with open(json_file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Initialize empty lists for each column
tokens_list = []
aspects_list = []
bio_tags_list = []
token_length_list = []

# Process each entry in the data
for entry in data:
    tokens = entry["token"]
    aspects = entry.get("aspects", [])

    # Unique aspect tokens in first-seen order. dict.fromkeys de-duplicates
    # like set() but keeps a deterministic order, so the CSV is reproducible.
    unique_aspects = list(dict.fromkeys(
        term for aspect in aspects for term in aspect.get("term", [])
    ))

    # Initialize BIO tags list with 'O' (outside) for each token
    bio_tags = ['O'] * len(tokens)

    # Tag longest terms first so a short term nested inside a longer one
    # cannot break up the longer span.
    # NOTE(review): assumes each aspect's "term" is a list of tokens, as the
    # slice comparison below requires - confirm against the dataset schema.
    terms = sorted((aspect.get("term", []) for aspect in aspects), key=len, reverse=True)
    for term in terms:
        span = len(term)
        if span == 0:
            # An empty term equals the empty slice at every index; skip it.
            continue
        for start in range(len(tokens) - span + 1):
            end = start + span
            if tokens[start:end] != term:
                continue
            if any(tag != 'O' for tag in bio_tags[start:end]):
                # Overlaps a span that is already tagged; leave that span intact.
                continue
            # Every match opens its own span, so two adjacent aspects stay
            # separate ('B' 'B') instead of being merged into 'B' 'I'.
            bio_tags[start:end] = ['B'] + ['I'] * (span - 1)

    # Append data to lists
    tokens_list.append(tokens)
    aspects_list.append(unique_aspects)  # Append unique aspects
    bio_tags_list.append(bio_tags)
    token_length_list.append([len(token) for token in tokens])

# Create DataFrame
df_result = pd.DataFrame({
    'token': tokens_list,
    'aspect': aspects_list,
    'bioTag': bio_tags_list,
    'token_length': token_length_list
})

# Save DataFrame to CSV file
csv_output_path = 'output_file.csv'
df_result.to_csv(csv_output_path, index=False)

# Display the DataFrame
print(df_result)

                                                  token  \
0     [the, decor, is, not, special, at, all, but, t...   
1     [when, tables, opened, up, ,, the, manager, sa...   
2     [though, the, menu, includes, some, unorthodox...   
3     [service, is, good, although, a, bit, in, your...   
4     [ps-, i, just, went, for, brunch, on, saturday...   
...                                                 ...   
4292  [for, dinner, ,, i, love, the, churrasco, and,...   
4293  [was, there, for, dinner, last, night, ,, and,...   
4294  [the, menu, sounded, good, but, the, grilled, ...   
4295  [service, is, coddling, and, correct, and, the...   
4296  [usc, has, a, cold, smoker, and, smoked, the, ...   

                                                 aspect  \
0                                 [prices, decor, food]   
1                                     [manager, tables]   
2         [classics, butter, peanut, roll, sushi, menu]   
3                                       [service, food]

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [None]:
import pandas as pd
import json
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout
from keras.preprocessing.text import Tokenizer

# Load the data from JSON file; JSON is UTF-8 by specification, so do not rely
# on the platform default encoding.
json_file_path = '/content/drive/MyDrive/Common files/Dataset/MAMS/train.json'
with open(json_file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Initialize empty lists for each column
tokens_list = []
aspects_list = []
bio_tags_list = []
token_length_list = []

# Process each entry in the data
for entry in data:
    tokens = entry["token"]
    aspects = entry.get("aspects", [])

    # Unique aspect tokens in first-seen order. dict.fromkeys de-duplicates
    # like set() but keeps a deterministic order across runs.
    unique_aspects = list(dict.fromkeys(
        term for aspect in aspects for term in aspect.get("term", [])
    ))

    # Initialize BIO tags list with 'O' (outside) for each token
    bio_tags = ['O'] * len(tokens)

    # Tag longest terms first so a short term nested inside a longer one
    # cannot break up the longer span.
    # NOTE(review): assumes each aspect's "term" is a list of tokens, as the
    # slice comparison below requires - confirm against the dataset schema.
    terms = sorted((aspect.get("term", []) for aspect in aspects), key=len, reverse=True)
    for term in terms:
        span = len(term)
        if span == 0:
            # An empty term equals the empty slice at every index; skip it.
            continue
        for start in range(len(tokens) - span + 1):
            end = start + span
            if tokens[start:end] != term:
                continue
            if any(tag != 'O' for tag in bio_tags[start:end]):
                # Overlaps a span that is already tagged; leave that span intact.
                continue
            # Every match opens its own span, so two adjacent aspects stay
            # separate ('B' 'B') instead of being merged into 'B' 'I'.
            bio_tags[start:end] = ['B'] + ['I'] * (span - 1)

    # Append data to lists
    tokens_list.append(tokens)
    aspects_list.append(unique_aspects)  # Append unique aspects
    bio_tags_list.append(bio_tags)
    token_length_list.append([len(token) for token in tokens])

# Create DataFrame
df_result = pd.DataFrame({
    'token': tokens_list,
    'aspect': aspects_list,
    'bioTag': bio_tags_list,
    'token_length': token_length_list
})

# Create a tokenizer and fit on the tokenized sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_result['token'])

# Convert tokens to numerical representations
X = tokenizer.texts_to_sequences(df_result['token'])

# Pad sequences to ensure uniform length
max_seq_length = max(len(seq) for seq in X)
X = pad_sequences(X, maxlen=max_seq_length, padding='post')

# Map BIO tags to class indices. Padding gets its own 'PAD' class: padding the
# labels with -1 and one-hot encoding them makes the -1 index wrap around to
# the last column, which silently labels every padded timestep as 'I'.
tag_to_index = {'O': 0, 'B': 1, 'I': 2, 'PAD': 3}

# Convert BIO tags to numerical representations using the dictionary
y = [[tag_to_index[tag] for tag in seq] for seq in df_result['bioTag']]

# Pad labels to the same length as X so inputs and targets stay aligned
y = pad_sequences(y, maxlen=max_seq_length, padding='post', value=tag_to_index['PAD'])

# One-hot encode; num_classes is fixed so the width never depends on which
# labels happen to occur in the data.
y = to_categorical(y, num_classes=len(tag_to_index))

# Split the data into training, validation, and testing sets
# (80/20 train/test, then 10% of the training part held out for validation)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# Sequence tagger: embedding -> dropout -> LSTM -> per-timestep softmax.
# Index 0 is reserved by the tokenizer, hence the +1 on the vocabulary size.
vocab_size = len(tokenizer.word_index) + 1
num_tag_classes = y.shape[2]

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=100, input_length=X_train.shape[1]),
    Dropout(0.1),
    LSTM(units=100, return_sequences=True),
    TimeDistributed(Dense(num_tag_classes, activation='softmax')),
])

# Targets are one-hot vectors, so use categorical cross-entropy
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit on the training split, monitoring the validation split each epoch
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=10,
)

# Report loss and accuracy on the held-out test split
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.08019061386585236
Test Accuracy: 0.9738039970397949


In [None]:
import string

# Sample test sentences
test_sentences = [
    "The food was delicious but the service was slow.",
    "I loved the ambiance of the restaurant.",
    "The prices were too high for the quality of food.",
    "The staff was friendly and helpful.",
    "The phone battery is good."
]

# Encode word by word so that position i of each sequence always corresponds
# to sentence.split()[i]. tokenizer.texts_to_sequences on raw strings drops
# out-of-vocabulary words and splits on punctuation, which shifts every later
# tag onto the wrong word.
word_index = tokenizer.word_index
encoded_test_sentences = []
for sentence in test_sentences:
    word_ids = []
    for raw_word in sentence.split():
        word = raw_word.lower()  # the default Tokenizer lowercases its vocabulary
        if word not in word_index:
            # Retry without attached punctuation, e.g. "food." -> "food"
            word = word.strip(string.punctuation)
        # Unknown words map to 0 so they still occupy their position
        word_ids.append(word_index.get(word, 0))
    encoded_test_sentences.append(word_ids)

# Pad sequences to the length the model was trained on
tokenized_test_sentences = pad_sequences(encoded_test_sentences, maxlen=max_seq_length, padding='post')

# Predict BIO tag probabilities for test sentences
predictions = model.predict(tokenized_test_sentences)

# Most likely class per timestep, mapped back to its tag string; any index
# without a tag is treated as 'O' (outside).
index_to_tag = {index: tag for tag, index in tag_to_index.items()}
predicted_tags = [
    [index_to_tag.get(int(index), 'O') for index in seq]
    for seq in np.argmax(predictions, axis=-1)
]

def extract_aspects(sentence, bio_tags):
    """Collect aspect phrases from a tokenized sentence and its BIO tags.

    Args:
        sentence: Sequence of word strings.
        bio_tags: Sequence of tag strings, one per word. Tags beyond the end
            of ``sentence`` (e.g. predictions for padding) are ignored.

    Returns:
        List of aspect strings in order of appearance; the words of a
        multi-word aspect are joined with single spaces.

    Decoding rules:
        * "B" starts a new aspect, closing any aspect that is open.
        * "I" extends the open aspect; with no aspect open it starts one,
          so a stray "I" never produces a phrase with a leading space.
        * Any other tag closes the open aspect, so words separated by an
          outside tag are never glued into one phrase.
    """
    aspects = []
    current_words = []
    # zip stops at the shorter input, which drops tags past the sentence end
    for word, tag in zip(sentence, bio_tags):
        if tag == "B" or (tag == "I" and not current_words):
            if current_words:
                aspects.append(" ".join(current_words))
            current_words = [word]
        elif tag == "I":
            current_words.append(word)
        else:
            if current_words:
                aspects.append(" ".join(current_words))
            current_words = []
    if current_words:
        aspects.append(" ".join(current_words))
    return aspects

# Print each test sentence followed by the aspects decoded from its tags,
# with a blank line between sentences.
for sentence, tags in zip(test_sentences, predicted_tags):
    words = sentence.split()
    aspects = extract_aspects(words, tags)
    print("Sentence:", sentence)
    print("Predicted Aspects:", aspects, end="\n\n")

Sentence: The food was delicious but the service was slow.
Predicted Aspects: ['food', 'service']

Sentence: I loved the ambiance of the restaurant.
Predicted Aspects: ['ambiance']

Sentence: The prices were too high for the quality of food.
Predicted Aspects: ['prices', 'quality', 'food.']

Sentence: The staff was friendly and helpful.
Predicted Aspects: ['staff']

Sentence: The phone battery is good.
Predicted Aspects: ['good.']



In [None]:
from sklearn.metrics import confusion_matrix

# Predict tag probabilities for the held-out test split
predictions = model.predict(X_test)

# Most likely class index per timestep (one row per sentence)
predicted_tags = np.argmax(predictions, axis=-1)

# The Keras Tokenizer never assigns index 0 to a word, so zeros in X_test are
# padding. Padded timesteps are excluded: they outnumber real tokens and would
# otherwise swamp the matrix with positions that carry no tagging decision.
real_token_mask = (X_test != 0)

# Boolean-mask indexing also flattens to 1-D, which confusion_matrix expects
y_true = np.argmax(y_test, axis=2)[real_token_mask]
y_pred = predicted_tags[real_token_mask]

# Fix the label order so rows/columns always follow tag_to_index,
# even if some class is absent from this split.
conf_matrix = confusion_matrix(y_true, y_pred, labels=sorted(tag_to_index.values()))

# Display the confusion matrix (rows: true class, columns: predicted class)
print("Confusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[19036   500   261]
 [  444  1683    77]
 [  215    80 37904]]


In [None]:
import pandas as pd
import json
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout
from keras.preprocessing.text import Tokenizer

# Load the data from JSON file; JSON is UTF-8 by specification, so do not rely
# on the platform default encoding.
json_file_path = '/content/drive/MyDrive/Common files/Dataset/Laptops/train.json'
with open(json_file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Initialize empty lists for each column
tokens_list = []
aspects_list = []
bio_tags_list = []
token_length_list = []

# Process each entry in the data
for entry in data:
    tokens = entry["token"]
    aspects = entry.get("aspects", [])

    # Unique aspect tokens in first-seen order. dict.fromkeys de-duplicates
    # like set() but keeps a deterministic order across runs.
    unique_aspects = list(dict.fromkeys(
        term for aspect in aspects for term in aspect.get("term", [])
    ))

    # Initialize BIO tags list with 'O' (outside) for each token
    bio_tags = ['O'] * len(tokens)

    # Tag longest terms first so a short term nested inside a longer one
    # cannot break up the longer span.
    # NOTE(review): assumes each aspect's "term" is a list of tokens, as the
    # slice comparison below requires - confirm against the dataset schema.
    terms = sorted((aspect.get("term", []) for aspect in aspects), key=len, reverse=True)
    for term in terms:
        span = len(term)
        if span == 0:
            # An empty term equals the empty slice at every index; skip it.
            continue
        for start in range(len(tokens) - span + 1):
            end = start + span
            if tokens[start:end] != term:
                continue
            if any(tag != 'O' for tag in bio_tags[start:end]):
                # Overlaps a span that is already tagged; leave that span intact.
                continue
            # Every match opens its own span, so two adjacent aspects stay
            # separate ('B' 'B') instead of being merged into 'B' 'I'.
            bio_tags[start:end] = ['B'] + ['I'] * (span - 1)

    # Append data to lists
    tokens_list.append(tokens)
    aspects_list.append(unique_aspects)  # Append unique aspects
    bio_tags_list.append(bio_tags)
    token_length_list.append([len(token) for token in tokens])

# Create DataFrame
df_result = pd.DataFrame({
    'token': tokens_list,
    'aspect': aspects_list,
    'bioTag': bio_tags_list,
    'token_length': token_length_list
})

# Create a tokenizer and fit on the tokenized sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_result['token'])

# Convert tokens to numerical representations
X = tokenizer.texts_to_sequences(df_result['token'])

# Pad sequences to ensure uniform length
max_seq_length = max(len(seq) for seq in X)
X = pad_sequences(X, maxlen=max_seq_length, padding='post')

# Map BIO tags to class indices. Padding gets its own 'PAD' class: padding the
# labels with -1 and one-hot encoding them makes the -1 index wrap around to
# the last column, which silently labels every padded timestep as 'I'.
tag_to_index = {'O': 0, 'B': 1, 'I': 2, 'PAD': 3}

# Convert BIO tags to numerical representations using the dictionary
y = [[tag_to_index[tag] for tag in seq] for seq in df_result['bioTag']]

# Pad labels to the same length as X so inputs and targets stay aligned
y = pad_sequences(y, maxlen=max_seq_length, padding='post', value=tag_to_index['PAD'])

# One-hot encode; num_classes is fixed so the width never depends on which
# labels happen to occur in the data.
y = to_categorical(y, num_classes=len(tag_to_index))

# Split the data into training, validation, and testing sets
# (80/20 train/test, then 10% of the training part held out for validation)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# Sequence tagger: embedding -> dropout -> LSTM -> per-timestep softmax.
# Index 0 is reserved by the tokenizer, hence the +1 on the vocabulary size.
vocab_size = len(tokenizer.word_index) + 1
num_tag_classes = y.shape[2]

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=100, input_length=X_train.shape[1]),
    Dropout(0.1),
    LSTM(units=100, return_sequences=True),
    TimeDistributed(Dense(num_tag_classes, activation='softmax')),
])

# Targets are one-hot vectors, so use categorical cross-entropy
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit on the training split, monitoring the validation split each epoch
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=10,
)

# Report loss and accuracy on the held-out test split
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.051133111119270325
Test Accuracy: 0.984722375869751
