In [19]:
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar xvzf aclImdb_v1.tar.gz

In [20]:
import os

positive_samples = []
positive_directory = '/content/aclImdb/train/pos/'
for filename in os.listdir(positive_directory):
    with open(os.path.join(positive_directory, filename), 'r') as file:
        positive_samples.append(file.read())

In [21]:
positive_directory = '/content/aclImdb/test/pos/'
for filename in os.listdir(positive_directory):
    with open(os.path.join(positive_directory, filename), 'r') as file:
        positive_samples.append(file.read())

In [22]:
negative_samples = []
negative_directory = '/content/aclImdb/train/neg/'
for filename in os.listdir(negative_directory):
    with open(os.path.join(negative_directory, filename), 'r') as file:
        negative_samples.append(file.read())

In [23]:
negative_directory = '/content/aclImdb/test/neg/'
for filename in os.listdir(negative_directory):
    with open(os.path.join(negative_directory, filename), 'r') as file:
        negative_samples.append(file.read())

In [24]:
import numpy as np
import pandas as pd

pos_texts= np.array(positive_samples)
neg_texts= np.array(negative_samples)
pos_labels=  np.array([1]*len(positive_samples))
neg_labels=  np.array([0]*len(negative_samples))

pos_dataset = pd.DataFrame({'review': pos_texts, 'label': pos_labels}, columns=['review', 'label'])
neg_dataset = pd.DataFrame({'review': neg_texts, 'label': neg_labels}, columns=['review', 'label'])

In [25]:
pos_train = pos_dataset.sample(frac = 0.8)
neg_train = neg_dataset.sample(frac = 0.8)
pos_part_20 = pos_dataset.drop(pos_train.index)
neg_part_20 = neg_dataset.drop(neg_train.index)

In [26]:
pos_test = pos_part_20.sample(frac = 0.5)
neg_test = neg_part_20.sample(frac = 0.5)
pos_val = pos_part_20.drop(pos_test.index)
neg_val = neg_part_20.drop(neg_test.index)

In [27]:
train_set= pd.concat([pos_train, neg_train], axis=0)
test_set=pd.concat([pos_test, neg_test], axis=0)
val_set=pd.concat([pos_val, neg_val], axis=0)
dataset =pd.concat([train_set, test_set,val_set], axis=0)

In [28]:
train_set = train_set.reset_index()
test_set = test_set.reset_index()
val_set = val_set.reset_index()
dataset = dataset.reset_index()

In [29]:
print('Raw data: ')
print('max length =',np.max([len(x) for x in dataset['review']]))
print('mean length =',np.mean([len(x) for x in dataset['review']]))

Raw data: 
max length = 13704
mean length = 1309.43102


In [30]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('wordnet')
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


def normalize_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    words = word_tokenize(text)
    words = [word for word in words if word.isalpha() and word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]
    return " ".join(words)

train_set['review'] = train_set['review'].apply(normalize_text)
val_set['review'] = val_set['review'].apply(normalize_text)
test_set['review'] = test_set['review'].apply(normalize_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [31]:
dataset_v2 =pd.concat([train_set, test_set,val_set], axis=0)

print('After normalization: ')
print('max length =',np.max([len(x) for x in dataset_v2['review']]))
print('mean length =',np.mean([len(x) for x in dataset_v2['review']]))

After normalization: 
max length = 9164
mean length = 812.165


In [32]:
# Import the necessary libraries
import tensorflow as tf
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Create the vocabulary from the training dataset
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=30000)
tokenizer.fit_on_texts(train_set['review'])
vocab_size= len(tokenizer.word_index)+1

# Encode the text data as sequences of integers
x_train = tokenizer.texts_to_sequences(train_set['review'])
x_val = tokenizer.texts_to_sequences(val_set['review'])
x_test = tokenizer.texts_to_sequences(test_set['review'])

dataset_v3 = x_train + x_val + x_test

print('After vectorization: ')
print('max length =',np.max([len(x) for x in dataset_v3]))
print('mean length =',np.mean([len(x) for x in dataset_v3]))

# Pad the sequences to the same length
x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, maxlen=500)
x_val = tf.keras.preprocessing.sequence.pad_sequences(x_val, maxlen=500)
x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test, maxlen=500)

After vectorization: 
max length = 1383
mean length = 119.72716


In [33]:
print('After padding: ')
print('max length =',np.max([len(x) for x in x_train]))
print('mean length =',np.mean([len(x) for x in x_train]))

After padding: 
max length = 500
mean length = 500.0


In [34]:
# Create an instance of the LabelEncoder
le = LabelEncoder()

# Fit and transform the label data
y_train = le.fit_transform(train_set['label'])
y_val = le.transform(val_set['label'])
y_test = le.transform(test_set['label'])

# Convert the labels to categorical data
y_train = tf.keras.utils.to_categorical(y_train)
y_val = tf.keras.utils.to_categorical(y_val)
y_test = tf.keras.utils.to_categorical(y_test)

In [35]:
# Define the LSTM model
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 100, input_length=500))
model.add(tf.keras.layers.LSTM(256))
model.add(tf.keras.layers.Dense(2, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(x_train, y_train, epochs=5, batch_size=32, validation_data=(x_val, y_val))

# Evaluate the model on the test dataset
score = model.evaluate(x_test, y_test, batch_size=32)
print(f'Test loss: {score[0]}')
print(f'Test accuracy: {score[1]}')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test loss: 0.551957368850708
Test accuracy: 0.8813999891281128


In [36]:
# Define the CNN model
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 100, input_length=500))
model.add(tf.keras.layers.Conv1D(512, 5, activation='relu'))
model.add(tf.keras.layers.Dropout(.3))
model.add(tf.keras.layers.MaxPooling1D(5))
model.add(tf.keras.layers.Conv1D(512, 5, activation='relu'))
model.add(tf.keras.layers.Dropout(.2))
model.add(tf.keras.layers.MaxPooling1D(5))
model.add(tf.keras.layers.Conv1D(512, 5, activation='relu'))
model.add(tf.keras.layers.GlobalMaxPooling1D())
model.add(tf.keras.layers.Dense(512, activation='relu'))
model.add(tf.keras.layers.Dense(2, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(x_train, y_train, epochs=5, batch_size=32, validation_data=(x_val, y_val))

# Evaluate the model on the test dataset
score = model.evaluate(x_test, y_test, batch_size=32)
print(f'Test loss: {score[0]}')
print(f'Test accuracy: {score[1]}')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test loss: 0.5007619857788086
Test accuracy: 0.8651999831199646
