## Importing Libraries and modules

In [None]:
import numpy as np
import pandas as pd 
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.optimizers import SGD, Adam
import bz2
import csv
from sklearn.metrics import roc_auc_score

## Reading function for data

In [None]:
def read(text):
    data = bz2.BZ2File(text)
    data = data.readlines()
    data = [x.decode('utf-8') for x in data]
    return data

In [8]:
training_data=read("D:/AI/Electropi/5.Deep Neural Networks [NN2]/4. Poem Text Generation/Capstones/Data/Amazon_review/train.ft.txt.bz2"
)
testing_data=read("D:/AI/Electropi/5.Deep Neural Networks [NN2]/4. Poem Text Generation/Capstones/Data/Amazon_review/test.ft.txt.bz2"
)

print(len(training_data))
print( len(testing_data))

3600000
400000


## Extracting Training and Testing data & and assigning the labels

In [9]:

def extraction(training_data, testing_data):
    # Split the data into labels and texts
    training_labels = [int(re.findall(r'__label__(\d)', line)[0]) for line in training_data]
    training_texts = [re.sub(r'__label__\d ', '', line) for line in training_data]

    testing_labels = [int(re.findall(r'__label__(\d)', line)[0]) for line in testing_data]
    testing_texts = [re.sub(r'__label__\d ', '', line) for line in testing_data]
    
    return training_labels, training_texts, testing_labels, testing_texts

def convert(training_labels, testing_labels):
    # Convert training labels to binary (0 and 1)
    training_labels = [0 if label == 1 else 1 for label in training_labels]
    
    # Convert test labels to binary (0 and 1)
    testing_labels = [0 if label == 1 else 1 for label in testing_labels]
    
    return training_labels, testing_labels


In [10]:
# Tokenization and padding

training_labels, training_texts, testing_labels, testing_texts = extraction(training_data, testing_data)
training_labels, testing_labels = convert(training_labels, testing_labels)

print("Training Labels:", len(training_labels))
print("Training Texts:", len(training_texts))
print("Test Labels:", len(testing_labels))
print("Test Texts:", len(testing_texts))

In [11]:
from keras.preprocessing.sequence import pad_sequences

# Tokenization and padding
max_words = 1000
max_sequence_length = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(training_texts)

X_train_sequences = tokenizer.texts_to_sequences(training_texts)
X_test_sequences = tokenizer.texts_to_sequences(testing_texts)

# Pad sequences to a fixed length
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length)

# Convert to NumPy arrays
X_train = np.array(X_train_padded)
X_test = np.array(X_test_padded)

# Print shapes
print( X_train.shape)
print( X_test.shape)

# Convert labels to NumPy arrays
y_train = np.array(training_labels)
y_test = np.array(testing_labels)

# Print shapes
print( y_train.shape)
print(y_test.shape)

(3600000, 100)
(400000, 100)
(3600000,)
(400000,)


In [14]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.optimizers import SGD, Adam

def create_model(optimizer, activation_function, lstm_layers, max_words, max_sequence_length):
    # Define the model architecture
    model = Sequential()
    model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_sequence_length))
    for _ in range(lstm_layers):
        model.add(LSTM(128, return_sequences=True))
    model.add(LSTM(128, return_sequences=False))
    model.add(Dense(1, activation=activation_function))
    
    # Compile the model with binary cross-entropy loss and the specified optimizer
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    return model

# Define different configurations for the models
configurations = [
    {"optimizer": SGD(learning_rate=0.01), "activation_function": 'sigmoid', "lstm_layers": 1},
    {"optimizer": Adam(learning_rate=0.001), "activation_function": 'relu', "lstm_layers": 2},
]

results = []

# Train and evaluate models for each configuration
for i, config in enumerate(configurations, 1):
    print(f"Training model {i}/{len(configurations)}...")
    
    # Create a model based on the current configuration
    model = create_model(config["optimizer"], config["activation_function"], config["lstm_layers"], max_words, max_sequence_length)
    
    # Train the model on training data (X_train, y_train) for 5 epochs
    # using a batch size of 2048 and verbose training output
    model.fit(X_train, y_train, epochs=5, batch_size=2048, verbose=1)
    
    # Evaluate the trained model on the test data (X_test, y_test)
    loss, accuracy = model.evaluate(X_test, y_test)
    
    # Store the configuration, loss, and accuracy in the results list
    results.append({"config": config, "loss": loss, "accuracy": accuracy})



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [15]:
for result in results:
    config = result["config"]
    loss = result["loss"]
    accuracy = result["accuracy"]
    print(f"Configuration: Optimizer={config['optimizer']}, Activation={config['activation_function']}, LSTM Layers={config['lstm_layers']}")
    print(f"Test Loss: {loss}, Test Accuracy: {accuracy}\n")

Configuration: Optimizer=<keras.optimizers.sgd.SGD object at 0x7fdc344021d0>, Activation=sigmoid, LSTM Layers=1
Test Loss: 0.5965809226036072, Test Accuracy: 0.6793274879455566

Configuration: Optimizer=<keras.optimizers.adam.Adam object at 0x7fdc8261f5b0>, Activation=relu, LSTM Layers=2
Test Loss: 0.3077607750892639, Test Accuracy: 0.9090175032615662



In [17]:
# Evaluate with a confusion matrix and classification report
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

Confusion Matrix:
[[176474  23526]
 [ 12867 187133]]
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.88      0.91    200000
           1       0.89      0.94      0.91    200000

    accuracy                           0.91    400000
   macro avg       0.91      0.91      0.91    400000
weighted avg       0.91      0.91      0.91    400000

