In [3]:
!pip install transformers


KeyboardInterrupt



chagig the ymer of epochs from 3 to 5

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout
from tensorflow.keras.initializers import Constant

# Function to load GloVe embeddings
def load_glove_embeddings(embedding_dim):
    embeddings_index = {}
    embedding_file = f"/kaggle/input/glove6b/glove.6B.{embedding_dim}d.txt"  # Assuming files are named as glove.6B.{dim}d.txt
    with open(embedding_file, encoding="utf-8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Load IMDb train and test datasets
train_data = pd.read_csv('/kaggle/input/plmsentiment/train.csv/train.csv')
test_data = pd.read_csv('/kaggle/input/plmsentiment/test.csv/test.csv')

# Preprocess train data
X_train = train_data['review'].values
y_train = train_data['sentiment'].values

# Preprocess test data
X_test = test_data['review'].values
y_test = test_data['sentiment'].values

# Convert labels to numerical format
y_train = np.where(y_train == 'negative', 0, 1)
y_test = np.where(y_test == 'negative', 0, 1)

# Parameters for the model and training
max_features = 20000  # Number of words to consider as features
maxlen = 200  # Max length of individual reviews
batch_size = 32
epochs = 5

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)

X_train_tokenized = tokenizer.texts_to_sequences(X_train)
X_test_tokenized = tokenizer.texts_to_sequences(X_test)
X_train = pad_sequences(X_train_tokenized, maxlen=maxlen)
X_test = pad_sequences(X_test_tokenized, maxlen=maxlen)

# Load GloVe embeddings of different dimensions
embedding_dims = [50, 100, 200, 300]
glove_embeddings = {}

for dim in embedding_dims:
    glove_embeddings[dim] = load_glove_embeddings(dim)

# List to store individual model predictions
model_predictions = []

# Function to create model with GloVe embeddings
def create_model(embedding_dim):
    embedding_matrix = np.zeros((max_features, embedding_dim))
    for word, i in tokenizer.word_index.items():
        if i >= max_features:
            continue
        embedding_vector = glove_embeddings[embedding_dim].get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    inputs = Input(shape=(maxlen,))
    embedding_layer = Embedding(max_features, embedding_dim,
                                embeddings_initializer=Constant(embedding_matrix),
                                input_length=maxlen,
                                trainable=False)(inputs)

    lstm_branch = LSTM(128, dropout=0.2, recurrent_dropout=0.2)(embedding_layer)
    cnn_branch = Conv1D(64, 5, activation='relu')(embedding_layer)
    cnn_branch = GlobalMaxPooling1D()(cnn_branch)
    merged = Concatenate()([lstm_branch, cnn_branch])
    merged = Dropout(0.5)(merged)
    output = Dense(1, activation='sigmoid')(merged)

    model = Model(inputs=inputs, outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Train models and collect predictions
for dim in embedding_dims:
    print(f"Training model with {dim}d GloVe embeddings")
    model = create_model(dim)
    model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2, verbose=1)
    predictions = model.predict(X_test)
    model_predictions.append(predictions)

# Ensemble predictions by averaging
ensemble_predictions = np.mean(model_predictions, axis=0)

# Calculate ensemble accuracy
ensemble_accuracy = np.mean((ensemble_predictions > 0.5) == y_test)
print(f"Ensemble accuracy: {ensemble_accuracy}")


Training model with 50d GloVe embeddings
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Training model with 100d GloVe embeddings
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Training model with 200d GloVe embeddings
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Training model with 300d GloVe embeddings
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Ensemble accuracy: 0.499917125


In [2]:
from gensim.models import KeyedVectors
import gensim 
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
model_W2V = KeyedVectors.load_word2vec_format('/kaggle/input/word2vec/GoogleNews-vectors-negative300.bin',binary=True, limit=100000)

In [3]:
model_W2V.vector_size

300

In [4]:
import numpy as np
import pandas as pd
import time
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Conv1D, GlobalMaxPooling1D, Concatenate
from tensorflow.keras.initializers import Constant

# Record start time
start_time = time.time()

# Parameters for the model and training
max_features = 20000  # Number of words to consider as features
maxlen = 200  # Max length of individual reviews
embedding_dim = 300  # Dimension of word embeddings
lstm_units = 128  # Number of LSTM units
filters = 64  # Number of filters for CNN
kernel_size = 5  # Kernel size for CNN
batch_size = 32
epochs = 5

# Load IMDb train and test datasets
train_data = pd.read_csv('/kaggle/input/plmsentiment/train.csv/train.csv')
test_data = pd.read_csv('/kaggle/input/plmsentiment/test.csv/test.csv')

# Preprocess train data
X_train = train_data['review'].values
y_train = train_data['sentiment'].values

# Preprocess test data
X_test = test_data['review'].values
y_test = test_data['sentiment'].values

# Convert labels to numerical format
y_train[y_train == 'negative'] = 0
y_train[y_train == 'positive'] = 1
y_train = y_train.astype(int)

y_test[y_test == 'negative'] = 0
y_test[y_test == 'positive'] = 1
y_test = y_test.astype(int)

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)

X_train_tokenized = tokenizer.texts_to_sequences(X_train)
X_test_tokenized = tokenizer.texts_to_sequences(X_test)
X_train = pad_sequences(X_train_tokenized, maxlen=maxlen)
X_test = pad_sequences(X_test_tokenized, maxlen=maxlen)

# Load Word2Vec embeddings
word2vec_model = KeyedVectors.load_word2vec_format('/kaggle/input/word2vec/GoogleNews-vectors-negative300.bin', binary=True)

# Create embedding matrix
num_words = min(max_features, len(tokenizer.word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i >= max_features:
        continue
    if word in word2vec_model:
        embedding_matrix[i] = word2vec_model[word]

# Define the model using functional API
inputs = Input(shape=(maxlen,))
embedding_layer = Embedding(num_words, embedding_dim,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=maxlen,
                            trainable=False)(inputs)

# LSTM branch
lstm_branch = LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2)(embedding_layer)

# CNN branch
cnn_branch = Conv1D(filters, kernel_size, activation='relu')(embedding_layer)
cnn_branch = GlobalMaxPooling1D()(cnn_branch)

# Concatenate both branches
merged = Concatenate()([lstm_branch, cnn_branch])
output = Dense(1, activation='sigmoid')(merged)

# Create model
model = Model(inputs=inputs, outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)

# Evaluate the model on test data
loss, accuracy = model.evaluate(X_test, y_test)
print('Test accuracy:', accuracy)

# Record end time
end_time = time.time()
print("Time required to fine-tune:", end_time - start_time)


2024-02-23 17:53:11.925471: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-23 17:53:11.925659: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-23 17:53:12.086586: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test accuracy: 0.8895999789237976
Time required to fine-tune: 1033.9831807613373


adding dropout to merged output 

In [5]:
import numpy as np
import pandas as pd
import time
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout
from tensorflow.keras.initializers import Constant

# Record start time
start_time = time.time()

# Parameters for the model and training
max_features = 20000  # Number of words to consider as features
maxlen = 200  # Max length of individual reviews
embedding_dim = 300  # Dimension of word embeddings
lstm_units = 128  # Number of LSTM units
filters = 64  # Number of filters for CNN
kernel_size = 5  # Kernel size for CNN
batch_size = 32
epochs = 5

# Load IMDb train and test datasets
train_data = pd.read_csv('/kaggle/input/plmsentiment/train.csv/train.csv')
test_data = pd.read_csv('/kaggle/input/plmsentiment/test.csv/test.csv')

# Preprocess train data
X_train = train_data['review'].values
y_train = train_data['sentiment'].values

# Preprocess test data
X_test = test_data['review'].values
y_test = test_data['sentiment'].values

# Convert labels to numerical format
y_train[y_train == 'negative'] = 0
y_train[y_train == 'positive'] = 1
y_train = y_train.astype(int)

y_test[y_test == 'negative'] = 0
y_test[y_test == 'positive'] = 1
y_test = y_test.astype(int)

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)

X_train_tokenized = tokenizer.texts_to_sequences(X_train)
X_test_tokenized = tokenizer.texts_to_sequences(X_test)
X_train = pad_sequences(X_train_tokenized, maxlen=maxlen)
X_test = pad_sequences(X_test_tokenized, maxlen=maxlen)

# Load Word2Vec embeddings
word2vec_model = KeyedVectors.load_word2vec_format('/kaggle/input/word2vec/GoogleNews-vectors-negative300.bin', binary=True)

# Create embedding matrix
num_words = min(max_features, len(tokenizer.word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i >= max_features:
        continue
    if word in word2vec_model:
        embedding_matrix[i] = word2vec_model[word]

# Define the model using functional API
inputs = Input(shape=(maxlen,))
embedding_layer = Embedding(num_words, embedding_dim,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=maxlen,
                            trainable=False)(inputs)

# LSTM branch
lstm_branch = LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2)(embedding_layer)

# CNN branch
cnn_branch = Conv1D(filters, kernel_size, activation='relu')(embedding_layer)
cnn_branch = GlobalMaxPooling1D()(cnn_branch)

# Concatenate both branches
merged = Concatenate()([lstm_branch, cnn_branch])
merged = Dropout(0.5)(merged)  # Dropout layer for regularization
output = Dense(1, activation='sigmoid')(merged)

# Create model
model = Model(inputs=inputs, outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)

# Evaluate the model on test data
loss, accuracy = model.evaluate(X_test, y_test)
print('Test accuracy:', accuracy)

# Record end time
end_time = time.time()
print("Time required to fine-tune:", end_time - start_time)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test accuracy: 0.8881000280380249
Time required to fine-tune: 1017.9823062419891


icreasimg the epochs to 10

In [6]:
import numpy as np
import pandas as pd
import time
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Conv1D, GlobalMaxPooling1D, Concatenate
from tensorflow.keras.initializers import Constant

# Record start time
start_time = time.time()

# Parameters for the model and training
max_features = 20000  # Number of words to consider as features
maxlen = 200  # Max length of individual reviews
embedding_dim = 300  # Dimension of word embeddings
lstm_units = 128  # Number of LSTM units
filters = 64  # Number of filters for CNN
kernel_size = 5  # Kernel size for CNN
batch_size = 32
epochs = 10

# Load IMDb train and test datasets
train_data = pd.read_csv('/kaggle/input/plmsentiment/train.csv/train.csv')
test_data = pd.read_csv('/kaggle/input/plmsentiment/test.csv/test.csv')

# Preprocess train data
X_train = train_data['review'].values
y_train = train_data['sentiment'].values

# Preprocess test data
X_test = test_data['review'].values
y_test = test_data['sentiment'].values

# Convert labels to numerical format
y_train[y_train == 'negative'] = 0
y_train[y_train == 'positive'] = 1
y_train = y_train.astype(int)

y_test[y_test == 'negative'] = 0
y_test[y_test == 'positive'] = 1
y_test = y_test.astype(int)

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)

X_train_tokenized = tokenizer.texts_to_sequences(X_train)
X_test_tokenized = tokenizer.texts_to_sequences(X_test)
X_train = pad_sequences(X_train_tokenized, maxlen=maxlen)
X_test = pad_sequences(X_test_tokenized, maxlen=maxlen)

# Load Word2Vec embeddings
word2vec_model = KeyedVectors.load_word2vec_format('/kaggle/input/word2vec/GoogleNews-vectors-negative300.bin', binary=True)

# Create embedding matrix
num_words = min(max_features, len(tokenizer.word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i >= max_features:
        continue
    if word in word2vec_model:
        embedding_matrix[i] = word2vec_model[word]

# Define the model using functional API
inputs = Input(shape=(maxlen,))
embedding_layer = Embedding(num_words, embedding_dim,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=maxlen,
                            trainable=False)(inputs)

# LSTM branch
lstm_branch = LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2)(embedding_layer)

# CNN branch
cnn_branch = Conv1D(filters, kernel_size, activation='relu')(embedding_layer)
cnn_branch = GlobalMaxPooling1D()(cnn_branch)

# Concatenate both branches
merged = Concatenate()([lstm_branch, cnn_branch])
output = Dense(1, activation='sigmoid')(merged)

# Create model
model = Model(inputs=inputs, outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)

# Evaluate the model on test data
loss, accuracy = model.evaluate(X_test, y_test)
print('Test accuracy:', accuracy)

# Record end time
end_time = time.time()
print("Time required to fine-tune:", end_time - start_time)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.8899499773979187
Time required to fine-tune: 2024.7742836475372


stemmig usimg porter stemmer

In [7]:
import numpy as np
import pandas as pd
import time
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Conv1D, GlobalMaxPooling1D, Concatenate
from tensorflow.keras.initializers import Constant
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('punkt')

# Record start time
start_time = time.time()

# Parameters for the model and training
max_features = 20000  # Number of words to consider as features
maxlen = 200  # Max length of individual reviews
embedding_dim = 300  # Dimension of word embeddings
lstm_units = 128  # Number of LSTM units
filters = 64  # Number of filters for CNN
kernel_size = 5  # Kernel size for CNN
batch_size = 32
epochs = 5

# Load IMDb train and test datasets
train_data = pd.read_csv('/kaggle/input/plmsentiment/train.csv/train.csv')
test_data = pd.read_csv('/kaggle/input/plmsentiment/test.csv/test.csv')

# Preprocess train data
X_train = train_data['review'].values
y_train = train_data['sentiment'].values

# Preprocess test data
X_test = test_data['review'].values
y_test = test_data['sentiment'].values

# Convert labels to numerical format
y_train[y_train == 'negative'] = 0
y_train[y_train == 'positive'] = 1
y_train = y_train.astype(int)

y_test[y_test == 'negative'] = 0
y_test[y_test == 'positive'] = 1
y_test = y_test.astype(int)

# Stemming function
stemmer = PorterStemmer()
def stem_text(text):
    return ' '.join([stemmer.stem(word) for word in word_tokenize(text.lower())])

# Apply stemming to train and test data
X_train = [stem_text(text) for text in X_train]
X_test = [stem_text(text) for text in X_test]

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)

X_train_tokenized = tokenizer.texts_to_sequences(X_train)
X_test_tokenized = tokenizer.texts_to_sequences(X_test)
X_train = pad_sequences(X_train_tokenized, maxlen=maxlen)
X_test = pad_sequences(X_test_tokenized, maxlen=maxlen)

# Load Word2Vec embeddings
word2vec_model = KeyedVectors.load_word2vec_format('/kaggle/input/word2vec/GoogleNews-vectors-negative300.bin', binary=True)

# Create embedding matrix
num_words = min(max_features, len(tokenizer.word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i >= max_features:
        continue
    if word in word2vec_model:
        embedding_matrix[i] = word2vec_model[word]

# Define the model using functional API
inputs = Input(shape=(maxlen,))
embedding_layer = Embedding(num_words, embedding_dim,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=maxlen,
                            trainable=False)(inputs)

# LSTM branch
lstm_branch = LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2)(embedding_layer)

# CNN branch
cnn_branch = Conv1D(filters, kernel_size, activation='relu')(embedding_layer)
cnn_branch = GlobalMaxPooling1D()(cnn_branch)

# Concatenate both branches
merged = Concatenate()([lstm_branch, cnn_branch])
output = Dense(1, activation='sigmoid')(merged)

# Create model
model = Model(inputs=inputs, outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)

# Evaluate the model on test data
loss, accuracy = model.evaluate(X_test, y_test)
print('Test accuracy:', accuracy)

# Record end time
end_time = time.time()
print("Time required to fine-tune:", end_time - start_time)


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test accuracy: 0.8576499819755554
Time required to fine-tune: 1375.0074853897095


usig Lancaster stemmer 

In [1]:
import numpy as np
import pandas as pd
import time
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Conv1D, GlobalMaxPooling1D, Concatenate
from tensorflow.keras.initializers import Constant
from nltk.stem import LancasterStemmer
import nltk
nltk.download('punkt')

# Record start time
start_time = time.time()

# Parameters for the model and training
max_features = 20000  # Number of words to consider as features
maxlen = 200  # Max length of individual reviews
embedding_dim = 300  # Dimension of word embeddings
lstm_units = 128  # Number of LSTM units
filters = 64  # Number of filters for CNN
kernel_size = 5  # Kernel size for CNN
batch_size = 32
epochs = 5

# Load IMDb train and test datasets
train_data = pd.read_csv('/kaggle/input/plmsentiment/train.csv/train.csv')
test_data = pd.read_csv('/kaggle/input/plmsentiment/test.csv/test.csv')

# Preprocess train data
X_train = train_data['review'].values
y_train = train_data['sentiment'].values

# Preprocess test data
X_test = test_data['review'].values
y_test = test_data['sentiment'].values

# Convert labels to numerical format
y_train[y_train == 'negative'] = 0
y_train[y_train == 'positive'] = 1
y_train = y_train.astype(int)

y_test[y_test == 'negative'] = 0
y_test[y_test == 'positive'] = 1
y_test = y_test.astype(int)

# Initialize Lancaster Stemmer
stemmer = LancasterStemmer()

# Apply stemming to train and test data
X_train_stemmed = [' '.join([stemmer.stem(word) for word in text.split()]) for text in X_train]
X_test_stemmed = [' '.join([stemmer.stem(word) for word in text.split()]) for text in X_test]

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train_stemmed)

X_train_tokenized = tokenizer.texts_to_sequences(X_train_stemmed)
X_test_tokenized = tokenizer.texts_to_sequences(X_test_stemmed)
X_train = pad_sequences(X_train_tokenized, maxlen=maxlen)
X_test = pad_sequences(X_test_tokenized, maxlen=maxlen)

# Load Word2Vec embeddings
word2vec_model = KeyedVectors.load_word2vec_format('/kaggle/input/word2vec/GoogleNews-vectors-negative300.bin', binary=True)

# Create embedding matrix
num_words = min(max_features, len(tokenizer.word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i >= max_features:
        continue
    if word in word2vec_model:
        embedding_matrix[i] = word2vec_model[word]

# Define the model using functional API
inputs = Input(shape=(maxlen,))
embedding_layer = Embedding(num_words, embedding_dim,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=maxlen,
                            trainable=False)(inputs)

# LSTM branch
lstm_branch = LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2)(embedding_layer)

# CNN branch
cnn_branch = Conv1D(filters, kernel_size, activation='relu')(embedding_layer)
cnn_branch = GlobalMaxPooling1D()(cnn_branch)

# Concatenate both branches
merged = Concatenate()([lstm_branch, cnn_branch])
output = Dense(1, activation='sigmoid')(merged)

# Create model
model = Model(inputs=inputs, outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)

# Evaluate the model on test data
loss, accuracy = model.evaluate(X_test, y_test)
print('Test accuracy:', accuracy)

# Record end time
end_time = time.time()
print("Time required to fine-tune:", end_time - start_time)


2024-02-24 07:47:35.117325: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-24 07:47:35.117469: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-24 07:47:35.291569: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test accuracy: 0.8695999979972839
Time required to fine-tune: 1729.7143523693085


using SnowballStemmer

In [2]:
import numpy as np
import pandas as pd
import time
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Conv1D, GlobalMaxPooling1D, Concatenate
from tensorflow.keras.initializers import Constant
from nltk.stem import SnowballStemmer
import nltk
nltk.download('punkt')

# Record start time
start_time = time.time()

# Parameters for the model and training
max_features = 20000  # Number of words to consider as features
maxlen = 200  # Max length of individual reviews
embedding_dim = 300  # Dimension of word embeddings
lstm_units = 128  # Number of LSTM units
filters = 64  # Number of filters for CNN
kernel_size = 5  # Kernel size for CNN
batch_size = 32
epochs = 5

# Load IMDb train and test datasets
train_data = pd.read_csv('/kaggle/input/plmsentiment/train.csv/train.csv')
test_data = pd.read_csv('/kaggle/input/plmsentiment/test.csv/test.csv')

# Preprocess train data
X_train = train_data['review'].values
y_train = train_data['sentiment'].values

# Preprocess test data
X_test = test_data['review'].values
y_test = test_data['sentiment'].values

# Convert labels to numerical format
y_train[y_train == 'negative'] = 0
y_train[y_train == 'positive'] = 1
y_train = y_train.astype(int)

y_test[y_test == 'negative'] = 0
y_test[y_test == 'positive'] = 1
y_test = y_test.astype(int)

# Initialize Snowball Stemmer
stemmer = SnowballStemmer("english")

# Apply stemming to train and test data
X_train_stemmed = [' '.join([stemmer.stem(word) for word in text.split()]) for text in X_train]
X_test_stemmed = [' '.join([stemmer.stem(word) for word in text.split()]) for text in X_test]

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train_stemmed)

X_train_tokenized = tokenizer.texts_to_sequences(X_train_stemmed)
X_test_tokenized = tokenizer.texts_to_sequences(X_test_stemmed)
X_train = pad_sequences(X_train_tokenized, maxlen=maxlen)
X_test = pad_sequences(X_test_tokenized, maxlen=maxlen)

# Load Word2Vec embeddings
word2vec_model = KeyedVectors.load_word2vec_format('/kaggle/input/word2vec/GoogleNews-vectors-negative300.bin', binary=True)

# Create embedding matrix
num_words = min(max_features, len(tokenizer.word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i >= max_features:
        continue
    if word in word2vec_model:
        embedding_matrix[i] = word2vec_model[word]

# Define the model using functional API
inputs = Input(shape=(maxlen,))
embedding_layer = Embedding(num_words, embedding_dim,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=maxlen,
                            trainable=False)(inputs)

# LSTM branch
lstm_branch = LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2)(embedding_layer)

# CNN branch
cnn_branch = Conv1D(filters, kernel_size, activation='relu')(embedding_layer)
cnn_branch = GlobalMaxPooling1D()(cnn_branch)

# Concatenate both branches
merged = Concatenate()([lstm_branch, cnn_branch])
output = Dense(1, activation='sigmoid')(merged)

# Create model
model = Model(inputs=inputs, outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)

# Evaluate the model on test data
loss, accuracy = model.evaluate(X_test, y_test)
print('Test accuracy:', accuracy)

# Record end time
end_time = time.time()
print("Time required to fine-tune:", end_time - start_time)


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Epoch 1/5
Epoch 2/5
Epoch 5/5
Test accuracy: 0.8781499862670898
Time required to fine-tune: 1714.3936071395874


Ensemble of hlove and word2vec

This code combines lexicon-based sentiment analysis using VADER, embedding-based sentiment analysis using Word2Vec, and PLM-based sentiment analysis using BERT. The final sentiment prediction is made through a voting mechanism, where each approach gets one vote, and the final sentiment label is determined by the majority vote.

In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from gensim.models import KeyedVectors
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, LSTM, Dense, concatenate
from keras.models import Model

# Load IMDb dataset
imdb_train = pd.read_csv('/kaggle/input/plmsentiment/train.csv/train.csv')
imdb_test = pd.read_csv('/kaggle/input/plmsentiment/test.csv/test.csv')

# Convert sentiment labels to integer type
label_mapping = {'positive': 1, 'negative': 0}
imdb_train['sentiment'] = imdb_train['sentiment'].map(label_mapping)
imdb_test['sentiment'] = imdb_test['sentiment'].map(label_mapping)


# Load GloVe embeddings
glove_model = KeyedVectors.load_word2vec_format('/kaggle/input/glove6b/glove.6B.100d.txt', binary=False, no_header=True)


# Lexicon-based approach using VADER
vader = SentimentIntensityAnalyzer()

def vader_sentiment_analysis(text):
    compound_score = vader.polarity_scores(text)['compound']
    return 1 if compound_score >= 0 else 0

# Embedding-based approach using GloVe
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(imdb_train['review'])
X_train = tokenizer.texts_to_sequences(imdb_train['review'])
X_test = tokenizer.texts_to_sequences(imdb_test['review'])
X_train = pad_sequences(X_train, maxlen=100)
X_test = pad_sequences(X_test, maxlen=100)

def glove_sentiment_analysis(text):
    tokens = text.split()
    embedding = np.zeros((100,))
    for token in tokens:
        if token in glove_model:
            embedding += glove_model[token]
    embedding /= len(tokens)
    prediction = model.predict(np.array([embedding]))
    return int(round(prediction[0][0]))

# Combine both approaches in an ensemble model
input_layer = Input(shape=(100,))
embedding_layer = Embedding(input_dim=10000, output_dim=100)(input_layer)
lstm_layer = LSTM(128)(embedding_layer)
output_layer_glove = Dense(1, activation='sigmoid')(lstm_layer)
output_layer_vader = Dense(1, activation='sigmoid')(lstm_layer)

# Concatenate the output layers
concatenated_output = concatenate([output_layer_glove, output_layer_vader])
ensemble_output = Dense(1, activation='sigmoid')(concatenated_output)

# Compile the model
model = Model(inputs=input_layer, outputs=ensemble_output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, imdb_train['sentiment'], batch_size=128, epochs=5, validation_split=0.2)

# Evaluate the model
_, accuracy = model.evaluate(X_test, imdb_test['sentiment'])
print("Accuracy: {:.2f}%".format(accuracy * 100))
#5 mi

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 83.95%


In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from gensim.models import KeyedVectors
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, LSTM, Dense, concatenate, Dropout
from keras.models import Model
from keras.optimizers import RMSprop

# Load IMDb dataset
imdb_train = pd.read_csv('/kaggle/input/plmsentiment/train.csv/train.csv')
imdb_test = pd.read_csv('/kaggle/input/plmsentiment/test.csv/test.csv')

# Convert sentiment labels to integer type
label_mapping = {'positive': 1, 'negative': 0}
imdb_train['sentiment'] = imdb_train['sentiment'].map(label_mapping)
imdb_test['sentiment'] = imdb_test['sentiment'].map(label_mapping)

# Load GloVe embeddings
glove_model = KeyedVectors.load_word2vec_format('/kaggle/input/glove6b/glove.6B.100d.txt', binary=False, no_header=True)

# Lexicon-based approach using VADER
vader = SentimentIntensityAnalyzer()

def vader_sentiment_analysis(text):
    compound_score = vader.polarity_scores(text)['compound']
    return 1 if compound_score >= 0 else 0

# Embedding-based approach using GloVe
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(imdb_train['review'])
X_train = tokenizer.texts_to_sequences(imdb_train['review'])
X_test = tokenizer.texts_to_sequences(imdb_test['review'])
X_train = pad_sequences(X_train, maxlen=100)
X_test = pad_sequences(X_test, maxlen=100)

# Combine both approaches in an ensemble model
input_layer = Input(shape=(100,))
embedding_layer = Embedding(input_dim=10000, output_dim=100)(input_layer)
lstm_layer = LSTM(256, dropout=0.2)(embedding_layer)
output_layer_glove = Dense(1, activation='relu')(lstm_layer)
output_layer_vader = Dense(1, activation='relu')(lstm_layer)

# Concatenate the output layers
concatenated_output = concatenate([output_layer_glove, output_layer_vader])
ensemble_output = Dense(1, activation='relu')(concatenated_output)

# Compile the model
optimizer = RMSprop(learning_rate=0.001)
model = Model(inputs=input_layer, outputs=ensemble_output)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, imdb_train['sentiment'], batch_size=128, epochs=5, validation_split=0.2)

# Evaluate the model
_, accuracy = model.evaluate(X_test, imdb_test['sentiment'])
print("Accuracy: {:.2f}%".format(accuracy * 100))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 50.33%


vader + glove100d

In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from gensim.models import KeyedVectors
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, LSTM, Dense, concatenate, Dropout, Bidirectional
from keras.models import Model
from keras.optimizers import RMSprop

# Load IMDb dataset
imdb_train = pd.read_csv('/kaggle/input/plmsentiment/train.csv/train.csv')
imdb_test = pd.read_csv('/kaggle/input/plmsentiment/test.csv/test.csv')

# Convert sentiment labels to integer type
label_mapping = {'positive': 1, 'negative': 0}
imdb_train['sentiment'] = imdb_train['sentiment'].map(label_mapping)
imdb_test['sentiment'] = imdb_test['sentiment'].map(label_mapping)

# Load GloVe embeddings
glove_model = KeyedVectors.load_word2vec_format('/kaggle/input/glove6b/glove.6B.100d.txt', binary=False, no_header=True)

# Lexicon-based approach using VADER
vader = SentimentIntensityAnalyzer()

def vader_sentiment_analysis(text):
    compound_score = vader.polarity_scores(text)['compound']
    return 1 if compound_score >= 0 else 0

# Embedding-based approach using GloVe
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(imdb_train['review'])
X_train = tokenizer.texts_to_sequences(imdb_train['review'])
X_test = tokenizer.texts_to_sequences(imdb_test['review'])
X_train = pad_sequences(X_train, maxlen=100)
X_test = pad_sequences(X_test, maxlen=100)

# Combine both approaches in an ensemble model
input_layer = Input(shape=(100,))
embedding_layer = Embedding(input_dim=10000, output_dim=100)(input_layer)
lstm_layer = Bidirectional(LSTM(128, dropout=0.2))(embedding_layer)
output_layer_glove = Dense(64, activation='relu')(lstm_layer)
output_layer_vader = Dense(64, activation='relu')(lstm_layer)

# Concatenate the output layers
concatenated_output = concatenate([output_layer_glove, output_layer_vader])
ensemble_output = Dense(1, activation='sigmoid')(concatenated_output)

# Compile the model
optimizer = RMSprop(learning_rate=0.001)
model = Model(inputs=input_layer, outputs=ensemble_output)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, imdb_train['sentiment'], batch_size=128, epochs=5, validation_split=0.2)

# Evaluate the model
_, accuracy = model.evaluate(X_test, imdb_test['sentiment'])
print("Accuracy: {:.2f}%".format(accuracy * 100))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 85.58%


vader + glove200d

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from gensim.models import KeyedVectors
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, LSTM, Dense, concatenate, Dropout, Bidirectional
from keras.models import Model
from keras.optimizers import RMSprop

# Load IMDb dataset
imdb_train = pd.read_csv('/kaggle/input/plmsentiment/train.csv/train.csv')
imdb_test = pd.read_csv('/kaggle/input/plmsentiment/test.csv/test.csv')

# Convert sentiment labels to integer type
label_mapping = {'positive': 1, 'negative': 0}
imdb_train['sentiment'] = imdb_train['sentiment'].map(label_mapping)
imdb_test['sentiment'] = imdb_test['sentiment'].map(label_mapping)

# Load GloVe embeddings
glove_model = KeyedVectors.load_word2vec_format('/kaggle/input/glove6b/glove.6B.200d.txt', binary=False, no_header=True)

# Lexicon-based approach using VADER
vader = SentimentIntensityAnalyzer()

def vader_sentiment_analysis(text):
    compound_score = vader.polarity_scores(text)['compound']
    return 1 if compound_score >= 0 else 0

# Embedding-based approach using GloVe
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(imdb_train['review'])
X_train = tokenizer.texts_to_sequences(imdb_train['review'])
X_test = tokenizer.texts_to_sequences(imdb_test['review'])
X_train = pad_sequences(X_train, maxlen=100)
X_test = pad_sequences(X_test, maxlen=100)

# Combine both approaches in an ensemble model
input_layer = Input(shape=(100,))
embedding_layer = Embedding(input_dim=10000, output_dim=100)(input_layer)
lstm_layer = Bidirectional(LSTM(128, dropout=0.2))(embedding_layer)
output_layer_glove = Dense(64, activation='relu')(lstm_layer)
output_layer_vader = Dense(64, activation='relu')(lstm_layer)

# Concatenate the output layers
concatenated_output = concatenate([output_layer_glove, output_layer_vader])
ensemble_output = Dense(1, activation='sigmoid')(concatenated_output)

# Compile the model
optimizer = RMSprop(learning_rate=0.001)
model = Model(inputs=input_layer, outputs=ensemble_output)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, imdb_train['sentiment'], batch_size=128, epochs=5, validation_split=0.2)

# Evaluate the model
_, accuracy = model.evaluate(X_test, imdb_test['sentiment'])
print("Accuracy: {:.2f}%".format(accuracy * 100))


2024-02-25 16:49:22.928073: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-25 16:49:22.928219: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-25 16:49:23.077807: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 85.41%


vader + glove300d

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from gensim.models import KeyedVectors
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, LSTM, Dense, concatenate, Dropout, Bidirectional
from keras.models import Model
from keras.optimizers import RMSprop

# Load IMDb dataset
imdb_train = pd.read_csv('/kaggle/input/plmsentiment/train.csv/train.csv')
imdb_test = pd.read_csv('/kaggle/input/plmsentiment/test.csv/test.csv')

# Convert sentiment labels to integer type
label_mapping = {'positive': 1, 'negative': 0}
imdb_train['sentiment'] = imdb_train['sentiment'].map(label_mapping)
imdb_test['sentiment'] = imdb_test['sentiment'].map(label_mapping)

# Load GloVe embeddings
glove_model = KeyedVectors.load_word2vec_format('/kaggle/input/glove6b/glove.6B.300d.txt', binary=False, no_header=True)

# Lexicon-based approach using VADER
vader = SentimentIntensityAnalyzer()

def vader_sentiment_analysis(text):
    compound_score = vader.polarity_scores(text)['compound']
    return 1 if compound_score >= 0 else 0

# Embedding-based approach using GloVe
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(imdb_train['review'])
X_train = tokenizer.texts_to_sequences(imdb_train['review'])
X_test = tokenizer.texts_to_sequences(imdb_test['review'])
X_train = pad_sequences(X_train, maxlen=100)
X_test = pad_sequences(X_test, maxlen=100)

# Combine both approaches in an ensemble model
input_layer = Input(shape=(100,))
embedding_layer = Embedding(input_dim=10000, output_dim=100)(input_layer)
lstm_layer = Bidirectional(LSTM(128, dropout=0.2))(embedding_layer)
output_layer_glove = Dense(64, activation='relu')(lstm_layer)
output_layer_vader = Dense(64, activation='relu')(lstm_layer)

# Concatenate the output layers
concatenated_output = concatenate([output_layer_glove, output_layer_vader])
ensemble_output = Dense(1, activation='sigmoid')(concatenated_output)

# Compile the model
optimizer = RMSprop(learning_rate=0.001)
model = Model(inputs=input_layer, outputs=ensemble_output)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, imdb_train['sentiment'], batch_size=128, epochs=5, validation_split=0.2)

# Evaluate the model
_, accuracy = model.evaluate(X_test, imdb_test['sentiment'])
print("Accuracy: {:.2f}%".format(accuracy * 100))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 85.66%


glove100d with afinn

In [2]:
!pip install afinn

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from gensim.models import KeyedVectors
from afinn import Afinn  # Import Afinn
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, LSTM, Dense, concatenate, Bidirectional
from keras.models import Model
from keras.optimizers import RMSprop

# Load IMDb dataset
imdb_train = pd.read_csv('/kaggle/input/plmsentiment/train.csv/train.csv')
imdb_test = pd.read_csv('/kaggle/input/plmsentiment/test.csv/test.csv')

# Convert sentiment labels to integer type
label_mapping = {'positive': 1, 'negative': 0}
imdb_train['sentiment'] = imdb_train['sentiment'].map(label_mapping)
imdb_test['sentiment'] = imdb_test['sentiment'].map(label_mapping)

# Load GloVe embeddings
glove_model = KeyedVectors.load_word2vec_format('/kaggle/input/glove6b/glove.6B.100d.txt', binary=False, no_header=True)

# Lexicon-based approach using AFINN
afinn = Afinn()

def afinn_sentiment_analysis(text):
    sentiment_score = afinn.score(text)
    return 1 if sentiment_score >= 0 else 0

# Embedding-based approach using GloVe
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(imdb_train['review'])
X_train = tokenizer.texts_to_sequences(imdb_train['review'])
X_test = tokenizer.texts_to_sequences(imdb_test['review'])
X_train = pad_sequences(X_train, maxlen=100)
X_test = pad_sequences(X_test, maxlen=100)

# Combine both approaches in an ensemble model
input_layer = Input(shape=(100,))
embedding_layer = Embedding(input_dim=10000, output_dim=100)(input_layer)
lstm_layer = Bidirectional(LSTM(128, dropout=0.2))(embedding_layer)
output_layer_glove = Dense(64, activation='relu')(lstm_layer)
output_layer_afinn = Dense(64, activation='relu')(lstm_layer)  # Afinn-based output layer

# Concatenate the output layers
concatenated_output = concatenate([output_layer_glove, output_layer_afinn])  # Afinn output added here
ensemble_output = Dense(1, activation='sigmoid')(concatenated_output)

# Compile the model
optimizer = RMSprop(learning_rate=0.001)
model = Model(inputs=input_layer, outputs=ensemble_output)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, imdb_train['sentiment'], batch_size=128, epochs=5, validation_split=0.2)

# Evaluate the model
_, accuracy = model.evaluate(X_test, imdb_test['sentiment'])
print("Accuracy: {:.2f}%".format(accuracy * 100))


Collecting afinn
  Downloading afinn-0.1.tar.gz (52 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.6/52.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: afinn
  Building wheel for afinn (setup.py) ... [?25ldone
[?25h  Created wheel for afinn: filename=afinn-0.1-py3-none-any.whl size=53429 sha256=fa9e38e9688802de24c96c6e5cb7faf3e648c7b0a3ed3bcb7ee64d7876078845
  Stored in directory: /root/.cache/pip/wheels/b0/05/90/43f79196199a138fb486902fceca30a2d1b5228e6d2db8eb90
Successfully built afinn
Installing collected packages: afinn
Successfully installed afinn-0.1


2024-02-24 14:34:51.681062: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-24 14:34:51.681208: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-24 14:34:51.860704: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 84.13%


In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
import pandas as pd
from torch.optim import AdamW
import numpy as np
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Conv1D, GlobalMaxPooling1D, Concatenate
from tensorflow.keras.initializers import Constant
from gensim.models import KeyedVectors
import time

# Record start time
start_time = time.time()

# Load data for RoBERTa model
data_roberta = pd.read_csv('/kaggle/input/plmsentiment/train.csv/train.csv')
data_roberta['sentiment'] = data_roberta['sentiment'].map({'positive': 1, 'negative': 0})
reviews_roberta = data_roberta['review'].tolist()
labels_roberta = data_roberta['sentiment'].tolist()

# Split data for RoBERTa model into training and validation sets
train_texts_roberta, val_texts_roberta, train_labels_roberta, val_labels_roberta = train_test_split(reviews_roberta, labels_roberta, test_size=0.2)

# Initialize tokenizer for RoBERTa
tokenizer_roberta = RobertaTokenizerFast.from_pretrained('roberta-base')

# Tokenize data for RoBERTa
train_encodings_roberta = tokenizer_roberta(train_texts_roberta, truncation=True, padding=True, max_length=512)
val_encodings_roberta = tokenizer_roberta(val_texts_roberta, truncation=True, padding=True, max_length=512)

# Create torch dataset for RoBERTa
class ReviewDatasetRoBERTa(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create dataloaders for RoBERTa
train_dataset_roberta = ReviewDatasetRoBERTa(train_encodings_roberta, train_labels_roberta)
val_dataset_roberta = ReviewDatasetRoBERTa(val_encodings_roberta, val_labels_roberta)

train_loader_roberta = DataLoader(train_dataset_roberta, batch_size=16, shuffle=True)
val_loader_roberta = DataLoader(val_dataset_roberta, batch_size=16, shuffle=False)

# Initialize RoBERTa model
model_roberta = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
model_roberta = model_roberta.to('cuda')

# Initialize optimizer for RoBERTa
optimizer_roberta = AdamW(model_roberta.parameters(), lr=1e-5)

# Training loop for RoBERTa model
for epoch in range(3):  
    model_roberta.train()
    for batch in train_loader_roberta:
        optimizer_roberta.zero_grad()
        input_ids = batch['input_ids'].to('cuda')
        attention_mask = batch['attention_mask'].to('cuda')
        labels = batch['labels'].to('cuda')
        outputs = model_roberta(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer_roberta.step()

# Save the RoBERTa model
model_roberta.save_pretrained('sentiment_model_RoBERTa')

# Record end time for RoBERTa model training
end_time_roberta = time.time()

print("Time required to fine-tune RoBERTa: ", end_time_roberta - start_time)

# Record start time for Word2Vec model
start_time_word2vec = time.time()

# Parameters for the Word2Vec model
max_features = 20000  
maxlen = 200  
embedding_dim = 300  
lstm_units = 128  
filters = 64  
kernel_size = 5  
batch_size = 32
epochs = 10

# Load data for Word2Vec model
train_data_word2vec = pd.read_csv('/kaggle/input/plmsentiment/train.csv/train.csv')
test_data_word2vec = pd.read_csv('/kaggle/input/plmsentiment/test.csv/test.csv')

X_train_word2vec = train_data_word2vec['review'].values
y_train_word2vec = train_data_word2vec['sentiment'].values

X_test_word2vec = test_data_word2vec['review'].values
y_test_word2vec = test_data_word2vec['sentiment'].values

y_train_word2vec[y_train_word2vec == 'negative'] = 0
y_train_word2vec[y_train_word2vec == 'positive'] = 1
y_train_word2vec = y_train_word2vec.astype(int)

y_test_word2vec[y_test_word2vec == 'negative'] = 0
y_test_word2vec[y_test_word2vec == 'positive'] = 1
y_test_word2vec = y_test_word2vec.astype(int)

tokenizer_word2vec = Tokenizer(num_words=max_features)
tokenizer_word2vec.fit_on_texts(X_train_word2vec)

X_train_tokenized_word2vec = tokenizer_word2vec.texts_to_sequences(X_train_word2vec)
X_test_tokenized_word2vec = tokenizer_word2vec.texts_to_sequences(X_test_word2vec)
X_train_word2vec = pad_sequences(X_train_tokenized_word2vec, maxlen=maxlen)
X_test_word2vec = pad_sequences(X_test_tokenized_word2vec, maxlen=maxlen)

word2vec_model = KeyedVectors.load_word2vec_format('/kaggle/input/word2vec/GoogleNews-vectors-negative300.bin', binary=True)

num_words = min(max_features, len(tokenizer_word2vec.word_index) + 1)
embedding_matrix_word2vec = np.zeros((num_words, embedding_dim))
for word, i in tokenizer_word2vec.word_index.items():
    if i >= max_features:
        continue
    if word in word2vec_model:
        embedding_matrix_word2vec[i] = word2vec_model[word]

inputs_word2vec = Input(shape=(maxlen,))
embedding_layer_word2vec = Embedding(num_words, embedding_dim,
                            embeddings_initializer=Constant(embedding_matrix_word2vec),
                            input_length=maxlen,
                            trainable=False)(inputs_word2vec)

lstm_branch_word2vec = LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2)(embedding_layer_word2vec)
cnn_branch_word2vec = Conv1D(filters, kernel_size, activation='relu')(embedding_layer_word2vec)
cnn_branch_word2vec = GlobalMaxPooling1D()(cnn_branch_word2vec)

merged_word2vec = Concatenate()([lstm_branch_word2vec, cnn_branch_word2vec])
output_word2vec = Dense(1, activation='sigmoid')(merged_word2vec)

model_word2vec = Model(inputs=inputs_word2vec, outputs=output_word2vec)

model_word2vec.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model_word2vec.fit(X_train_word2vec, y_train_word2vec, batch_size=batch_size, epochs=epochs, validation_split=0.2)

loss_word2vec, accuracy_word2vec = model_word2vec.evaluate(X_test_word2vec, y_test_word2vec)
print('Test accuracy for Word2Vec model:', accuracy_word2vec)

# Record end time for Word2Vec model
end_time_word2vec = time.time()
print("Time required to fine-tune Word2Vec:", end_time_word2vec - start_time_word2vec)

# Load validation data for RoBERTa model
val_data_roberta = pd.read_csv('/kaggle/input/plmsentiment/test.csv/test.csv')
val_texts_roberta = val_data_roberta['review'].tolist()
val_labels_roberta = val_data_roberta['sentiment'].map({'positive': 1, 'negative': 0}).tolist()

# Tokenize data for RoBERTa model
val_encodings_roberta = tokenizer_roberta(val_texts_roberta, truncation=True, padding=True, max_length=512)

# Create torch dataset for validation data for RoBERTa
val_dataset_roberta = ReviewDatasetRoBERTa(val_encodings_roberta, val_labels_roberta)
val_loader_roberta = DataLoader(val_dataset_roberta, batch_size=16, shuffle=False)

# Predict on validation data using RoBERTa model
model_roberta.eval()
predictions_roberta = []
true_labels_roberta = []
for batch in val_loader_roberta:
    input_ids = batch['input_ids'].to('cuda')
    attention_mask = batch['attention_mask'].to('cuda')
    labels = batch['labels'].to('cuda')

    with torch.no_grad():
        outputs = model_roberta(input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    predicted_labels = torch.argmax(logits, dim=1).cpu().numpy()
    predictions_roberta.extend(predicted_labels)
    true_labels_roberta.extend(labels.cpu().numpy())

# Convert RoBERTa predictions to numpy array
predictions_roberta = np.array(predictions_roberta)

# Load test data for Word2Vec model
test_data_word2vec = pd.read_csv('/kaggle/input/plmsentiment/test.csv/test.csv')
test_texts_word2vec = test_data_word2vec['review'].tolist()
test_labels_word2vec = test_data_word2vec['sentiment'].map({'positive': 1, 'negative': 0}).tolist()

# Tokenize test data for Word2Vec model
test_encodings_word2vec = tokenizer_word2vec(test_texts_word2vec, truncation=True, padding=True, max_length=maxlen)

# Predict on test data using Word2Vec model
test_predictions_word2vec = model_word2vec.predict(test_encodings_word2vec)
test_predictions_word2vec = np.round(test_predictions_word2vec).flatten()

# Create ensemble predictions
ensemble_predictions = np.column_stack((predictions_roberta, test_predictions_word2vec))

# Train meta-classifier (Logistic Regression) on ensemble predictions
meta_classifier = LogisticRegression()
meta_classifier.fit(ensemble_predictions, test_labels_word2vec)

# Predict on test data using ensemble model
final_predictions = meta_classifier.predict(ensemble_predictions)

# Calculate accuracy
ensemble_accuracy = accuracy_score(test_labels_word2vec, final_predictions)
print(f'Ensemble Model Accuracy: {ensemble_accuracy}')

# Record end time
end_time = time.time()
print("Total time required:", end_time - start_time)


2024-02-25 05:08:22.198392: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-25 05:08:22.198506: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-25 05:08:22.492807: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Time required to fine-tune RoBERTa:  6750.19261097908
Epoch 1/10


I0000 00:00:1708844586.672850     123 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy for Word2Vec model: 0.8920000195503235
Time required to fine-tune Word2Vec: 2040.541413784027


TypeError: 'Tokenizer' object is not callable