In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Assuming data is in a DataFrame called df and the columns are named as per the sample provided
# Load data
df = pd.read_csv('/content/tweet dataset.csv')  # Correct the file path

# Use the correct column name for reviews and labels
X = df['text']  # Text column
y = df['target']  # Label column, assuming '0' for negative and '1' for positive sentiment

# Remove stopwords
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

# Apply stopword removal
X = X.apply(remove_stopwords)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenization and sequence padding
vocab_size = 1000
oov_tok = "<OOV>"
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)

train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

sequence_length = 120
train_padded = pad_sequences(train_sequences, maxlen=sequence_length, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=sequence_length, padding='post', truncating='post')

# Model definition
model = Sequential([
    Embedding(vocab_size, 16, input_length=sequence_length),
    LSTM(32),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit model with callbacks to avoid overfitting
callbacks = [
    EarlyStopping(monitor='val_loss', patience=3),
    ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)
]
history = model.fit(train_padded, y_train, epochs=50, validation_data=(test_padded, y_test), callbacks=callbacks)

# Output model summary and history
print(model.summary())
metrics_df = pd.DataFrame(history.history)
print(metrics_df)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Epoch 1/50
Epoch 2/50

  saving_api.save_model(


Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 120, 16)           16000     
                                                                 
 lstm_1 (LSTM)               (None, 32)                6272      
                                                                 
 dense_2 (Den

In [10]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(test_padded, y_test)

# Print the results
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")


Test Loss: -6.518629550933838
Test Accuracy: 0.0
