In [89]:
import numpy as np
import tensorflow as tf
import keras
from tensorflow.keras import models
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras import layers
from keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential

In [76]:
train_dir = 'C:\\Users\\xinga\\OneDrive\\文档\\GitHub\\AI-Project\\dataset\\aclImdb\\train'
val_dir = 'C:\\Users\\xinga\\OneDrive\\文档\\GitHub\\AI-Project\\dataset\\aclImdb\\test'

batch_size = 32

raw_train_ds = keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    class_names=['neg', 'pos'])

raw_val_ds = keras.utils.text_dataset_from_directory(
    val_dir,
    batch_size=batch_size,
    class_names=['neg', 'pos'])

Found 25000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [77]:
AUTOTUNE = tf.data.AUTOTUNE
raw_train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)
raw_val_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [78]:
# extra texts and labels from data set
texts, labels = [], []
for text_batch, label_batch in raw_train_ds:
    for text, label in zip(text_batch.numpy(), label_batch.numpy()):
        texts.append(text.decode('utf-8'))
        labels.append(label)

In [79]:
# Extract texts and labels from the validation dataset
val_texts, val_labels = [], []
for text_batch, label_batch in raw_val_ds:
    for text, label in zip(text_batch.numpy(), label_batch.numpy()):
        val_texts.append(text.decode('utf-8'))  # Fixed the variable name here
        val_labels.append(label)  # Fixed the typo and variable name here

In [80]:
max_feature = 20000
max_sequence_length = 200
embedding_dim = 100

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts + val_texts)  # Fit tokenizer on both training and validation texts

sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index

padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

val_sequences = tokenizer.texts_to_sequences(val_texts)
padded_val_sequences = pad_sequences(val_sequences, maxlen=max_sequence_length)

In [81]:
def load_glove_embeddings(file):
    embeddings = {}
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.strip().split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

glove_file = "C:\\Users\\xinga\\OneDrive\\文档\\GitHub\\AI-Project\\dataset\\glove\\glove.6B.100d.txt"
glove_embeddings = load_glove_embeddings(glove_file)

In [82]:
# # Text vectorization layer
# max_tokens = 100 # maximum number of tokens in the vocabulary
# text_vectorization = layers.TextVectorization(max_tokens=max_tokens)
# text_vectorization.adapt(texts)

In [83]:
# Create embedding matrix
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [84]:
# #parse the unzipped file
# path_to_glove_file = "C:\\Users\\xinga\\OneDrive\\文档\\GitHub\\AI-Project\\dataset\\glove\\glove.6B.100d.txt"

In [85]:
# embedding_dim = 100  # Match the dimension of the GloVe vectors you chose.

# # embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
# # for word, i in word_index.items():
# #     embedding_vector = glove_embeddings.get(word)
# #     if embedding_vector is not None:
# #         embedding_matrix[i] = embedding_vector
        
# embedding_matrix = np.zeros((max_tokens, embedding_dim)) #Prepare a matrix that we’ll fill with the GloVe vectors.
 
# for word, i in word_index.items():
#     if i < max_tokens:
#       embedding_vector = embeddings_index.get(word) #Fill entry i in the matrix with the word vector for index i. 
#     if embedding_vector is not None:
#       embedding_matrix[i] = embedding_vector

In [86]:
# Create the Embedding layer
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=max_sequence_length,
    trainable=False
)

In [87]:
# embeddings_index = {}
# with open(path_to_glove_file, encoding="utf8") as f:
#     for line in f:
#         word, coefs = line.split(maxsplit=1) #it split the line into two parts, first part is word, second part is
#         coefs = np.fromstring(coefs, "f", sep=" ") # convert the vector from a string to a numpy array of floating point
#         embeddings_index[word] = coefs # adds the word asnd its corresponding vector to the "embedding"
# print(f"Found {len(embeddings_index)} word vectors.")

In [90]:
# Build and compile the model
model = Sequential([
    embedding_layer,
    LSTM(units=128),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train and evaluate the model
num_epochs = 10

model.fit(padded_sequences, np.array(labels), epochs=num_epochs, batch_size=batch_size, validation_data=(padded_val_sequences, np.array(val_labels)))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1812515e0a0>

In [92]:
history = model.fit(padded_sequences, np.array(labels), epochs=num_epochs, batch_size=batch_size, validation_data=(padded_val_sequences, np.array(val_labels)))

Epoch 1/10
 97/782 [==>...........................] - ETA: 58s - loss: 0.0654 - accuracy: 0.9778

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Plot accuracy
plt.figure()
plt.plot(history.history['accuracy'], label='Training')
plt.plot(history.history['val_accuracy'], label='Validation')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Model Accuracy')
plt.legend()

In [None]:
plt.figure()
plt.plot(history.history['loss'], label='Training')
plt.plot(history.history['val_loss'], label='Validation')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Model Loss')
plt.legend()