### Importe Necessary Libraries

In [8]:
import numpy as np
import pandas as pd
import spacy
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, GRU, Dense, Dropout
import keras_tuner as kt
import pickle

### Load Dataframe Util

In [9]:
%run ../../src/utils/dataframe_util.ipynb

In [10]:
#importing the datasets
df = load_datasets("../../data/data.csv")

### Load Spacy

In [11]:
# Load spaCy's English tokenizer and stopwords list
nlp = spacy.load('en_core_web_sm')

In [12]:
# Preprocessing function using spaCy
def preprocess_text(text):
    doc = nlp(text.lower())
    tokens = [token.text for token in doc if token.is_alpha and not token.is_stop]
    return ' '.join(tokens)

In [13]:
# Apply preprocessing
df['Clean_Text'] = df['Sentence'].apply(preprocess_text)

In [14]:
df.head()

Unnamed: 0,Sentence,Sentiment,Clean_Text
0,The GeoSolutions technology will leverage Bene...,positive,geosolutions technology leverage benefon gps s...
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative,esi lows bk real possibility
2,"For the last quarter of 2010 , Componenta 's n...",positive,quarter componenta net sales doubled m m perio...
3,According to the Finnish-Russian Chamber of Co...,neutral,according finnish russian chamber commerce maj...
4,The Swedish buyout firm has sold its remaining...,neutral,swedish buyout firm sold remaining percent sta...


In [15]:
# Encode labels
label_encoder = LabelEncoder()
df['Sentiment_Encoded'] = label_encoder.fit_transform(df['Sentiment'])

### Data Set Splitting

In [16]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['Clean_Text'], df['Sentiment_Encoded'], test_size=0.2, random_state=42)

In [17]:
# Tokenization and padding
max_words = 100000  # Vocabulary size
max_len = 100  # Maximum length of input sequences

In [18]:
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

In [19]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [20]:
# Pad sequences to ensure uniform input size
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

### Hyperparameter Tuning

In [21]:
# Hyperparameter tuning function
def build_model(hp):
    model = Sequential()
    model.add(Embedding(input_dim=max_words, output_dim=hp.Int('embedding_dim', min_value=32, max_value=128, step=32), input_length=max_len))

    # Choose between RNN, LSTM, or GRU
    layer_type = hp.Choice('layer_type', ['RNN', 'LSTM', 'GRU'])
    if layer_type == 'RNN':
        model.add(SimpleRNN(hp.Int('rnn_units', min_value=64, max_value=256, step=64), return_sequences=False))
    elif layer_type == 'LSTM':
        model.add(LSTM(hp.Int('lstm_units', min_value=64, max_value=256, step=64), return_sequences=False))
    else:
        model.add(GRU(hp.Int('gru_units', min_value=64, max_value=256, step=64), return_sequences=False))

    model.add(Dropout(hp.Float('dropout_rate', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))  # 3 classes: positive, negative, neutral

    # Compile the model
    model.compile(optimizer=hp.Choice('optimizer', ['adam', 'rmsprop']),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    
    return model

#### Using Keras Tuning

In [22]:
# Instantiate KerasTuner RandomSearch
tuner = kt.RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10,  # Number of trials to run
    executions_per_trial=2,  # Run the model multiple times to reduce noise
    directory='my_dir',  # Directory to store results
    project_name='sentiment_analysis'
)

Reloading Tuner from my_dir\sentiment_analysis\tuner0.json


In [23]:
# Display summary of the search space
tuner.search_space_summary()

Search space summary
Default search space size: 7
embedding_dim (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 128, 'step': 32, 'sampling': 'linear'}
layer_type (Choice)
{'default': 'RNN', 'conditions': [], 'values': ['RNN', 'LSTM', 'GRU'], 'ordered': False}
rnn_units (Int)
{'default': None, 'conditions': [], 'min_value': 64, 'max_value': 256, 'step': 64, 'sampling': 'linear'}
dropout_rate (Float)
{'default': 0.2, 'conditions': [], 'min_value': 0.2, 'max_value': 0.5, 'step': 0.1, 'sampling': 'linear'}
optimizer (Choice)
{'default': 'adam', 'conditions': [], 'values': ['adam', 'rmsprop'], 'ordered': False}
lstm_units (Int)
{'default': None, 'conditions': [], 'min_value': 64, 'max_value': 256, 'step': 64, 'sampling': 'linear'}
gru_units (Int)
{'default': None, 'conditions': [], 'min_value': 64, 'max_value': 256, 'step': 64, 'sampling': 'linear'}


In [24]:
# Run hyperparameter search
tuner.search(X_train_pad, y_train, epochs=5, validation_split=0.2, batch_size=32)

In [25]:
# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

In [26]:
print(f"""
The optimal number of units in the RNN/LSTM/GRU layer is {best_hps.get('rnn_units') or best_hps.get('lstm_units') or best_hps.get('gru_units')}.
The best embedding dimension is {best_hps.get('embedding_dim')}.
The best optimizer is {best_hps.get('optimizer')}.
""")


The optimal number of units in the RNN/LSTM/GRU layer is 192.
The best embedding dimension is 64.
The best optimizer is rmsprop.



### Using GridSearch

### Set Hyperparameters

In [27]:
# Hyperparameters
embedding_dim = 64
batch_size = 32
epochs = 20

In [28]:
# Create RNN, LSTM, and GRU models
def build_rnn_model():
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=max_len))
    model.add(SimpleRNN(192, return_sequences=False))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))  # 3 classes: positive, negative, neutral
    model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [29]:
def build_lstm_model():
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=max_len))
    model.add(LSTM(192, return_sequences=False))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))
    model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [30]:
def build_gru_model():
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=max_len))
    model.add(GRU(192, return_sequences=False))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))
    model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

### Train RNN Model

In [31]:
# Build and train RNN model
rnn_model = build_rnn_model()
rnn_model.summary()



In [32]:
print("Training RNN Model...")
rnn_model.fit(X_train_pad, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2)

Training RNN Model...
Epoch 1/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 103ms/step - accuracy: 0.4320 - loss: 1.1395 - val_accuracy: 0.5016 - val_loss: 1.0479
Epoch 2/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 76ms/step - accuracy: 0.5090 - loss: 1.0287 - val_accuracy: 0.5016 - val_loss: 1.0134
Epoch 3/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 76ms/step - accuracy: 0.5165 - loss: 1.0196 - val_accuracy: 0.5016 - val_loss: 0.9971
Epoch 4/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 77ms/step - accuracy: 0.5305 - loss: 1.0479 - val_accuracy: 0.5005 - val_loss: 1.0109
Epoch 5/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 74ms/step - accuracy: 0.5039 - loss: 1.0454 - val_accuracy: 0.5016 - val_loss: 0.9917
Epoch 6/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 76ms/step - accuracy: 0.5446 - loss: 0.9791 - val_accuracy: 0.5016 - val_loss: 0.98

<keras.src.callbacks.history.History at 0x27fa30aa360>

#### Evaluation

In [33]:
# Evaluate RNN model
rnn_pred = np.argmax(rnn_model.predict(X_test_pad), axis=1)
rnn_accuracy = accuracy_score(y_test, rnn_pred)
print(f"RNN Model Accuracy: {rnn_accuracy:.4f}")
print("RNN Classification Report:")
print(classification_report(y_test, rnn_pred, target_names=label_encoder.classes_))

[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 41ms/step
RNN Model Accuracy: 0.5355
RNN Classification Report:
              precision    recall  f1-score   support

    negative       0.16      0.26      0.20       175
     neutral       0.69      0.74      0.72       622
    positive       0.54      0.32      0.40       372

    accuracy                           0.54      1169
   macro avg       0.46      0.44      0.44      1169
weighted avg       0.56      0.54      0.54      1169



### Train LSTM Model

In [34]:
# Build and train LSTM model
lstm_model = build_lstm_model()
lstm_model.summary()



In [35]:
print("Training LSTM Model...")
lstm_model.fit(X_train_pad, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2)

Training LSTM Model...
Epoch 1/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 135ms/step - accuracy: 0.5133 - loss: 1.0147 - val_accuracy: 0.4973 - val_loss: 0.9856
Epoch 2/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 124ms/step - accuracy: 0.5507 - loss: 0.9382 - val_accuracy: 0.5850 - val_loss: 0.9080
Epoch 3/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 130ms/step - accuracy: 0.6761 - loss: 0.7734 - val_accuracy: 0.6353 - val_loss: 0.8640
Epoch 4/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 125ms/step - accuracy: 0.7409 - loss: 0.6534 - val_accuracy: 0.6503 - val_loss: 0.8438
Epoch 5/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 127ms/step - accuracy: 0.7925 - loss: 0.5082 - val_accuracy: 0.3412 - val_loss: 6.5118
Epoch 6/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 125ms/step - accuracy: 0.7568 - loss: 0.7855 - val_accuracy: 0.6567 - val_

<keras.src.callbacks.history.History at 0x27fb7c4d700>

#### Evaluation

In [36]:
# Evaluate LSTM model
lstm_pred = np.argmax(lstm_model.predict(X_test_pad), axis=1)
lstm_accuracy = accuracy_score(y_test, lstm_pred)
print(f"LSTM Model Accuracy: {lstm_accuracy:.4f}")
print("LSTM Classification Report:")
print(classification_report(y_test, lstm_pred, target_names=label_encoder.classes_))

[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 55ms/step
LSTM Model Accuracy: 0.6416
LSTM Classification Report:
              precision    recall  f1-score   support

    negative       0.31      0.34      0.33       175
     neutral       0.71      0.71      0.71       622
    positive       0.70      0.66      0.68       372

    accuracy                           0.64      1169
   macro avg       0.57      0.57      0.57      1169
weighted avg       0.65      0.64      0.64      1169



### Train GRU Model

In [37]:
# Build and train GRU model
gru_model = build_gru_model()
gru_model.summary()



In [38]:
print("Training GRU Model...")
gru_model.fit(X_train_pad, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2)

Training GRU Model...
Epoch 1/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 140ms/step - accuracy: 0.5213 - loss: 1.0115 - val_accuracy: 0.4930 - val_loss: 0.9615
Epoch 2/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 136ms/step - accuracy: 0.6037 - loss: 0.8714 - val_accuracy: 0.5968 - val_loss: 0.9097
Epoch 3/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 134ms/step - accuracy: 0.6940 - loss: 0.7249 - val_accuracy: 0.6374 - val_loss: 0.9539
Epoch 4/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 135ms/step - accuracy: 0.7551 - loss: 0.6105 - val_accuracy: 0.6203 - val_loss: 0.9371
Epoch 5/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 136ms/step - accuracy: 0.7943 - loss: 0.4889 - val_accuracy: 0.6417 - val_loss: 0.9624
Epoch 6/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 135ms/step - accuracy: 0.8501 - loss: 0.3587 - val_accuracy: 0.6118 - val_l

<keras.src.callbacks.history.History at 0x27fba1378f0>

#### Evaluation

In [39]:
# Evaluate GRU model
gru_pred = np.argmax(gru_model.predict(X_test_pad), axis=1)
gru_accuracy = accuracy_score(y_test, gru_pred)
print(f"GRU Model Accuracy: {gru_accuracy:.4f}")
print("GRU Classification Report:")
print(classification_report(y_test, gru_pred, target_names=label_encoder.classes_))

[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 53ms/step
GRU Model Accuracy: 0.6459
GRU Classification Report:
              precision    recall  f1-score   support

    negative       0.30      0.28      0.29       175
     neutral       0.72      0.71      0.71       622
    positive       0.68      0.71      0.69       372

    accuracy                           0.65      1169
   macro avg       0.57      0.57      0.57      1169
weighted avg       0.64      0.65      0.64      1169



### Comparison

In [40]:
# Comparison of Models
print(f"RNN Model Accuracy: {rnn_accuracy:.4f}")
print(f"LSTM Model Accuracy: {lstm_accuracy:.4f}")
print(f"GRU Model Accuracy: {gru_accuracy:.4f}")

RNN Model Accuracy: 0.5355
LSTM Model Accuracy: 0.6416
GRU Model Accuracy: 0.6459


### Save The Model

In [41]:
# Save the trained model using pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

lstm_model.save('trained_models/lstm_model.h5')  # Save the model in h5 format



### Load The Model

In [42]:
# Load the saved tokenizer and model
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

model = load_model('trained_models/lstm_model.h5')



### Do The Predicition

In [43]:
# Load spaCy for text preprocessing
nlp = spacy.load('en_core_web_sm')

In [44]:
def preprocess_text(text):
    doc = nlp(text.lower())
    tokens = [token.text for token in doc if token.is_alpha and not token.is_stop]
    return ' '.join(tokens)

In [45]:
# Function to predict sentiment
def predict_sentiment(user_input):
    # Preprocess the user input
    cleaned_text = preprocess_text(user_input)
    
    # Convert the input text into sequences and pad it
    input_seq = tokenizer.texts_to_sequences([cleaned_text])
    input_pad = pad_sequences(input_seq, maxlen=100)
    
    # Predict using the trained model
    prediction = model.predict(input_pad)
    
    # Convert the prediction to the class label
    predicted_class = np.argmax(prediction, axis=1)[0]
    
    # Mapping back to original sentiment
    sentiment_map = {0: 'negative', 1: 'neutral', 2: 'positive'}
    
    return sentiment_map[predicted_class]

In [46]:
# Example user input and prediction
user_input = "$ESI on lows, down $1.50 to $2.50 BK a real possibility"
predicted_sentiment = predict_sentiment(user_input)
print(f"Predicted sentiment: {predicted_sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 636ms/step
Predicted sentiment: negative
