In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from keras import models
import joblib

# Load the dataset
data = pd.read_csv('/content/sample_data/sentiment.csv')

# Select the relevant columns for sentiment analysis
data = data[['text', 'sentiment']]

# Remove any rows with missing values in 'text' or 'sentiment' columns
data = data.dropna(subset=['text', 'sentiment'])

# Split the data into training and testing sets
X = data['text']
y = data['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert labels to numerical values (0 and 1)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train_sequence = tokenizer.texts_to_sequences(X_train)
X_test_sequence = tokenizer.texts_to_sequences(X_test)

# Pad sequences to a fixed length for uniformity
max_sequence_length = 100  # You can set this based on your dataset and sequence length distribution
X_train_padded = pad_sequences(X_train_sequence, maxlen=max_sequence_length)
X_test_padded = pad_sequences(X_test_sequence, maxlen=max_sequence_length)

# Build the RNN model
embedding_dim = 100  # You can adjust this dimension based on the size of your vocabulary and available word embeddings

model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

# Train the RNN model
batch_size = 32
epochs = 5

model.fit(X_train_padded, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_test_padded, y_test))

# Save the trained model to disk
model_filename = 'sentiment_analysis_rnn_model.h5'
model.save(model_filename)

# Load the saved model
loaded_model = models.load_model(model_filename)

# Example text for prediction
new_text = "A lot of good things are happening. We are respected again throughout the world, and that's a great thing. @realDonaldTrump"

# Tokenize and pad the new text
new_text_sequence = tokenizer.texts_to_sequences([new_text])
new_text_padded = pad_sequences(new_text_sequence, maxlen=max_sequence_length)

# Predict sentiment using the loaded model
predicted_sentiment_probs = loaded_model.predict(new_text_padded)

# Threshold the probabilities to get the final sentiment label (e.g., 'positive' or 'negative')
threshold = 0.5  # You can adjust this threshold as per your requirement

predicted_sentiment_label = ['positive' if prob > threshold else 'negative' for prob in predicted_sentiment_probs]

print("Predicted sentiment:", predicted_sentiment_label[0])


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 100)          1707700   
                                                                 
 lstm_1 (LSTM)               (None, 128)               117248    
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1,825,077
Trainable params: 1,825,077
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Predicted sentiment: positive


In [8]:
import pandas as pd
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

data = pd.read_csv('/content/sample_data/sentiment.csv')
# Keeping only the necessary columns
data = data[['text', 'sentiment']]

data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))

for idx, row in data.iterrows():
    row[0] = row[0].replace('rt', ' ')

max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)

X = pad_sequences(X)

embed_dim = 128
lstm_out = 196

def create_model(embed_dim, lstm_out):
    model = Sequential()
    model.add(Embedding(max_fatures, embed_dim, input_length=X.shape[1]))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(3, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

labelencoder = LabelEncoder()
integer_encoded = labelencoder.fit_transform(data['sentiment'])
y = to_categorical(integer_encoded)
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Create KerasClassifier wrapper for GridSearchCV
model = KerasClassifier(build_fn=create_model, epochs=1, batch_size=16, verbose=2)

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'embed_dim': [64, 128, 256],
    'lstm_out': [128, 196, 256]
}

# Create GridSearchCV object
grid_search = GridSearchCV(model, param_grid, cv=3)

# Perform the grid search on the training data
grid_search.fit(X_train, Y_train)

# Get the best hyperparameters and best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best hyperparameters:", best_params)

# Evaluate the best model on the test data
score = best_model.model.evaluate(X_test, Y_test, verbose=2)
print(score)


  model = KerasClassifier(build_fn=create_model, epochs=1, batch_size=16, verbose=2)


388/388 - 34s - loss: 0.8472 - accuracy: 0.6379 - 34s/epoch - 88ms/step
194/194 - 2s - loss: 0.7757 - accuracy: 0.6685 - 2s/epoch - 11ms/step
388/388 - 32s - loss: 0.8435 - accuracy: 0.6423 - 32s/epoch - 82ms/step
194/194 - 2s - loss: 0.7761 - accuracy: 0.6611 - 2s/epoch - 10ms/step
388/388 - 30s - loss: 0.8411 - accuracy: 0.6362 - 30s/epoch - 78ms/step
194/194 - 2s - loss: 0.7937 - accuracy: 0.6619 - 2s/epoch - 9ms/step
388/388 - 50s - loss: 0.8677 - accuracy: 0.6289 - 50s/epoch - 128ms/step
194/194 - 3s - loss: 0.8094 - accuracy: 0.6423 - 3s/epoch - 14ms/step
388/388 - 56s - loss: 0.8492 - accuracy: 0.6313 - 56s/epoch - 144ms/step
194/194 - 3s - loss: 0.7852 - accuracy: 0.6562 - 3s/epoch - 17ms/step
388/388 - 53s - loss: 0.8489 - accuracy: 0.6343 - 53s/epoch - 138ms/step
194/194 - 4s - loss: 0.7867 - accuracy: 0.6600 - 4s/epoch - 21ms/step
388/388 - 67s - loss: 0.8465 - accuracy: 0.6331 - 67s/epoch - 173ms/step
194/194 - 4s - loss: 0.8038 - accuracy: 0.6530 - 4s/epoch - 20ms/step
388