### Importe Necessary Libraries

In [62]:
import numpy as np
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, GRU, Dense, Dropout
import keras_tuner as kt
import pickle

### Load Dataframe Util

In [2]:
%run ../../src/utils/dataframe_util.ipynb

In [4]:
#importing the datasets
df = load_datasets("../../data/data.csv")

### Load Spacy

In [3]:
# Load spaCy's English tokenizer and stopwords list
nlp = spacy.load('en_core_web_sm')

In [5]:
# Preprocessing function using spaCy
def preprocess_text(text):
    doc = nlp(text.lower())
    tokens = [token.text for token in doc if token.is_alpha and not token.is_stop]
    return ' '.join(tokens)

In [6]:
# Apply preprocessing
df['Clean_Text'] = df['Sentence'].apply(preprocess_text)

In [7]:
df.head()

Unnamed: 0,Sentence,Sentiment,Clean_Text
0,The GeoSolutions technology will leverage Bene...,positive,geosolutions technology leverage benefon gps s...
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative,esi lows bk real possibility
2,"For the last quarter of 2010 , Componenta 's n...",positive,quarter componenta net sales doubled m m perio...
3,According to the Finnish-Russian Chamber of Co...,neutral,according finnish russian chamber commerce maj...
4,The Swedish buyout firm has sold its remaining...,neutral,swedish buyout firm sold remaining percent sta...


In [8]:
# Encode labels
label_encoder = LabelEncoder()
df['Sentiment_Encoded'] = label_encoder.fit_transform(df['Sentiment'])

### Data Set Splitting

In [9]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['Clean_Text'], df['Sentiment_Encoded'], test_size=0.2, random_state=42)

In [10]:
# Tokenization and padding
max_words = 100000  # Vocabulary size
max_len = 100  # Maximum length of input sequences

In [11]:
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

In [12]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [13]:
# Pad sequences to ensure uniform input size
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

### Hyperparameter Tuning

In [36]:
# Hyperparameter tuning function
def build_model(hp):
    model = Sequential()
    model.add(Embedding(input_dim=max_words, output_dim=hp.Int('embedding_dim', min_value=32, max_value=128, step=32), input_length=max_len))

    # Choose between RNN, LSTM, or GRU
    layer_type = hp.Choice('layer_type', ['RNN', 'LSTM', 'GRU'])
    if layer_type == 'RNN':
        model.add(SimpleRNN(hp.Int('rnn_units', min_value=64, max_value=256, step=64), return_sequences=False))
    elif layer_type == 'LSTM':
        model.add(LSTM(hp.Int('lstm_units', min_value=64, max_value=256, step=64), return_sequences=False))
    else:
        model.add(GRU(hp.Int('gru_units', min_value=64, max_value=256, step=64), return_sequences=False))

    model.add(Dropout(hp.Float('dropout_rate', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))  # 3 classes: positive, negative, neutral

    # Compile the model
    model.compile(optimizer=hp.Choice('optimizer', ['adam', 'rmsprop']),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    
    return model

#### Using Keras Tuning

In [37]:
# Instantiate KerasTuner RandomSearch
tuner = kt.RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10,  # Number of trials to run
    executions_per_trial=2,  # Run the model multiple times to reduce noise
    directory='my_dir',  # Directory to store results
    project_name='sentiment_analysis'
)



In [38]:
# Display summary of the search space
tuner.search_space_summary()

Search space summary
Default search space size: 5
embedding_dim (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 128, 'step': 32, 'sampling': 'linear'}
layer_type (Choice)
{'default': 'RNN', 'conditions': [], 'values': ['RNN', 'LSTM', 'GRU'], 'ordered': False}
rnn_units (Int)
{'default': None, 'conditions': [], 'min_value': 64, 'max_value': 256, 'step': 64, 'sampling': 'linear'}
dropout_rate (Float)
{'default': 0.2, 'conditions': [], 'min_value': 0.2, 'max_value': 0.5, 'step': 0.1, 'sampling': 'linear'}
optimizer (Choice)
{'default': 'adam', 'conditions': [], 'values': ['adam', 'rmsprop'], 'ordered': False}


In [39]:
# Run hyperparameter search
tuner.search(X_train_pad, y_train, epochs=5, validation_split=0.2, batch_size=32)

Trial 10 Complete [00h 01m 01s]
val_accuracy: 0.6502673625946045

Best val_accuracy So Far: 0.6518716514110565
Total elapsed time: 00h 07m 34s


In [40]:
# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

In [41]:
print(f"""
The optimal number of units in the RNN/LSTM/GRU layer is {best_hps.get('rnn_units') or best_hps.get('lstm_units') or best_hps.get('gru_units')}.
The best embedding dimension is {best_hps.get('embedding_dim')}.
The best optimizer is {best_hps.get('optimizer')}.
""")


The optimal number of units in the RNN/LSTM/GRU layer is 192.
The best embedding dimension is 64.
The best optimizer is rmsprop.



### Set Hyperparameters

In [42]:
# Hyperparameters
embedding_dim = 64
batch_size = 32
epochs = 20

In [43]:
# Create RNN, LSTM, and GRU models
def build_rnn_model():
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=max_len))
    model.add(SimpleRNN(128, return_sequences=False))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))  # 3 classes: positive, negative, neutral
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [44]:
def build_lstm_model():
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=max_len))
    model.add(LSTM(128, return_sequences=False))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [45]:
def build_gru_model():
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=max_len))
    model.add(GRU(128, return_sequences=False))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

### Train RNN Model

In [46]:
# Build and train RNN model
rnn_model = build_rnn_model()
rnn_model.summary()

In [47]:
print("Training RNN Model...")
rnn_model.fit(X_train_pad, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2)

Training RNN Model...
Epoch 1/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 23ms/step - accuracy: 0.4613 - loss: 1.0477 - val_accuracy: 0.5016 - val_loss: 1.0032
Epoch 2/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - accuracy: 0.5373 - loss: 0.9711 - val_accuracy: 0.4941 - val_loss: 1.0051
Epoch 3/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.6041 - loss: 0.8498 - val_accuracy: 0.5305 - val_loss: 1.0066
Epoch 4/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - accuracy: 0.7509 - loss: 0.6561 - val_accuracy: 0.5733 - val_loss: 0.9434
Epoch 5/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - accuracy: 0.6570 - loss: 0.8203 - val_accuracy: 0.5925 - val_loss: 0.9755
Epoch 6/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - accuracy: 0.7944 - loss: 0.5395 - val_accuracy: 0.5583 - val_loss: 1.0762


<keras.src.callbacks.history.History at 0x19a006b68a0>

#### Evaluation

In [48]:
# Evaluate RNN model
rnn_pred = np.argmax(rnn_model.predict(X_test_pad), axis=1)
rnn_accuracy = accuracy_score(y_test, rnn_pred)
print(f"RNN Model Accuracy: {rnn_accuracy:.4f}")
print("RNN Classification Report:")
print(classification_report(y_test, rnn_pred, target_names=label_encoder.classes_))

[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step
RNN Model Accuracy: 0.6253
RNN Classification Report:
              precision    recall  f1-score   support

    negative       0.30      0.22      0.26       175
     neutral       0.68      0.77      0.72       622
    positive       0.65      0.58      0.61       372

    accuracy                           0.63      1169
   macro avg       0.54      0.52      0.53      1169
weighted avg       0.61      0.63      0.61      1169



### Train LSTM Model

In [49]:
# Build and train LSTM model
lstm_model = build_lstm_model()
lstm_model.summary()



In [50]:
print("Training LSTM Model...")
lstm_model.fit(X_train_pad, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2)

Training LSTM Model...
Epoch 1/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 63ms/step - accuracy: 0.5187 - loss: 1.0183 - val_accuracy: 0.4930 - val_loss: 0.9664
Epoch 2/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 65ms/step - accuracy: 0.5998 - loss: 0.8479 - val_accuracy: 0.6150 - val_loss: 0.8622
Epoch 3/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 65ms/step - accuracy: 0.7776 - loss: 0.5503 - val_accuracy: 0.6299 - val_loss: 0.9497
Epoch 4/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 52ms/step - accuracy: 0.8674 - loss: 0.3296 - val_accuracy: 0.6417 - val_loss: 1.0261
Epoch 5/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 54ms/step - accuracy: 0.8763 - loss: 0.3154 - val_accuracy: 0.6310 - val_loss: 1.0246
Epoch 6/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 64ms/step - accuracy: 0.8938 - loss: 0.2486 - val_accuracy: 0.6578 - val_loss: 1.122

<keras.src.callbacks.history.History at 0x19a06da4e90>

#### Evaluation

In [51]:
# Evaluate LSTM model
lstm_pred = np.argmax(lstm_model.predict(X_test_pad), axis=1)
lstm_accuracy = accuracy_score(y_test, lstm_pred)
print(f"LSTM Model Accuracy: {lstm_accuracy:.4f}")
print("LSTM Classification Report:")
print(classification_report(y_test, lstm_pred, target_names=label_encoder.classes_))

[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step
LSTM Model Accuracy: 0.6715
LSTM Classification Report:
              precision    recall  f1-score   support

    negative       0.34      0.27      0.30       175
     neutral       0.71      0.80      0.75       622
    positive       0.72      0.65      0.69       372

    accuracy                           0.67      1169
   macro avg       0.59      0.57      0.58      1169
weighted avg       0.66      0.67      0.66      1169



### Train GRU Model

In [52]:
# Build and train GRU model
gru_model = build_gru_model()
gru_model.summary()



In [53]:
print("Training GRU Model...")
gru_model.fit(X_train_pad, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2)

Training GRU Model...
Epoch 1/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 71ms/step - accuracy: 0.5294 - loss: 1.0204 - val_accuracy: 0.5401 - val_loss: 0.9207
Epoch 2/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 69ms/step - accuracy: 0.6484 - loss: 0.7926 - val_accuracy: 0.6193 - val_loss: 0.8592
Epoch 3/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 68ms/step - accuracy: 0.7929 - loss: 0.5117 - val_accuracy: 0.6385 - val_loss: 0.8845
Epoch 4/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 68ms/step - accuracy: 0.8625 - loss: 0.3426 - val_accuracy: 0.6449 - val_loss: 0.9840
Epoch 5/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 70ms/step - accuracy: 0.8911 - loss: 0.2377 - val_accuracy: 0.6556 - val_loss: 1.1314
Epoch 6/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 74ms/step - accuracy: 0.9080 - loss: 0.2040 - val_accuracy: 0.6353 - val_loss: 1.2156

<keras.src.callbacks.history.History at 0x19a07f4c6e0>

#### Evaluation

In [54]:
# Evaluate GRU model
gru_pred = np.argmax(gru_model.predict(X_test_pad), axis=1)
gru_accuracy = accuracy_score(y_test, gru_pred)
print(f"GRU Model Accuracy: {gru_accuracy:.4f}")
print("GRU Classification Report:")
print(classification_report(y_test, gru_pred, target_names=label_encoder.classes_))

[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step
GRU Model Accuracy: 0.6689
GRU Classification Report:
              precision    recall  f1-score   support

    negative       0.36      0.24      0.29       175
     neutral       0.70      0.81      0.75       622
    positive       0.70      0.64      0.67       372

    accuracy                           0.67      1169
   macro avg       0.59      0.56      0.57      1169
weighted avg       0.65      0.67      0.66      1169



### Comparison

In [55]:
# Comparison of Models
print(f"RNN Model Accuracy: {rnn_accuracy:.4f}")
print(f"LSTM Model Accuracy: {lstm_accuracy:.4f}")
print(f"GRU Model Accuracy: {gru_accuracy:.4f}")

RNN Model Accuracy: 0.6253
LSTM Model Accuracy: 0.6715
GRU Model Accuracy: 0.6689


### Save The Model

In [61]:
# Save the trained model using pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

lstm_model.save('trained_models/lstm_model.h5')  # Save the model in h5 format



### Load The Model

In [63]:
# Load the saved tokenizer and model
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

model = load_model('trained_models/lstm_model.h5')



### Do The Predicition

In [64]:
# Load spaCy for text preprocessing
nlp = spacy.load('en_core_web_sm')

In [65]:
def preprocess_text(text):
    doc = nlp(text.lower())
    tokens = [token.text for token in doc if token.is_alpha and not token.is_stop]
    return ' '.join(tokens)

In [66]:
# Function to predict sentiment
def predict_sentiment(user_input):
    # Preprocess the user input
    cleaned_text = preprocess_text(user_input)
    
    # Convert the input text into sequences and pad it
    input_seq = tokenizer.texts_to_sequences([cleaned_text])
    input_pad = pad_sequences(input_seq, maxlen=100)
    
    # Predict using the trained model
    prediction = model.predict(input_pad)
    
    # Convert the prediction to the class label
    predicted_class = np.argmax(prediction, axis=1)[0]
    
    # Mapping back to original sentiment
    sentiment_map = {0: 'negative', 1: 'neutral', 2: 'positive'}
    
    return sentiment_map[predicted_class]

In [70]:
# Example user input and prediction
user_input = "$ESI on lows, down $1.50 to $2.50 BK a real possibility"
predicted_sentiment = predict_sentiment(user_input)
print(f"Predicted sentiment: {predicted_sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Predicted sentiment: negative
