### Importe Necessary Libraries

In [1]:
import numpy as np
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, GRU, Dense, Dropout
import keras_tuner as kt
import pickle

### Load Dataframe Util

In [2]:
%run ../../src/utils/dataframe_util.ipynb

In [3]:
#importing the datasets
df = load_datasets("../../data/data.csv")

### Load Spacy

In [4]:
# Load spaCy's English tokenizer and stopwords list
nlp = spacy.load('en_core_web_sm')

In [5]:
# Preprocessing function using spaCy
def preprocess_text(text):
    doc = nlp(text.lower())
    tokens = [token.text for token in doc if token.is_alpha and not token.is_stop]
    return ' '.join(tokens)

In [6]:
# Apply preprocessing
df['Clean_Text'] = df['Sentence'].apply(preprocess_text)

In [7]:
df.head()

Unnamed: 0,Sentence,Sentiment,Clean_Text
0,The GeoSolutions technology will leverage Bene...,positive,geosolutions technology leverage benefon gps s...
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative,esi lows bk real possibility
2,"For the last quarter of 2010 , Componenta 's n...",positive,quarter componenta net sales doubled m m perio...
3,According to the Finnish-Russian Chamber of Co...,neutral,according finnish russian chamber commerce maj...
4,The Swedish buyout firm has sold its remaining...,neutral,swedish buyout firm sold remaining percent sta...


In [8]:
# Encode labels
label_encoder = LabelEncoder()
df['Sentiment_Encoded'] = label_encoder.fit_transform(df['Sentiment'])

### Data Set Splitting

In [9]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['Clean_Text'], df['Sentiment_Encoded'], test_size=0.2, random_state=42)

In [10]:
# Tokenization and padding
max_words = 100000  # Vocabulary size
max_len = 100  # Maximum length of input sequences

In [11]:
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

In [12]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [13]:
# Pad sequences to ensure uniform input size
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

### Hyperparameter Tuning

In [14]:
# Hyperparameter tuning function
def build_model(hp):
    model = Sequential()
    model.add(Embedding(input_dim=max_words, output_dim=hp.Int('embedding_dim', min_value=32, max_value=128, step=32), input_length=max_len))

    # Choose between RNN, LSTM, or GRU
    layer_type = hp.Choice('layer_type', ['RNN', 'LSTM', 'GRU'])
    if layer_type == 'RNN':
        model.add(SimpleRNN(hp.Int('rnn_units', min_value=64, max_value=256, step=64), return_sequences=False))
    elif layer_type == 'LSTM':
        model.add(LSTM(hp.Int('lstm_units', min_value=64, max_value=256, step=64), return_sequences=False))
    else:
        model.add(GRU(hp.Int('gru_units', min_value=64, max_value=256, step=64), return_sequences=False))

    model.add(Dropout(hp.Float('dropout_rate', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))  # 3 classes: positive, negative, neutral

    # Compile the model
    model.compile(optimizer=hp.Choice('optimizer', ['adam', 'rmsprop']),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    
    return model

#### Using Keras Tuning

In [15]:
# Instantiate KerasTuner RandomSearch
tuner = kt.RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10,  # Number of trials to run
    executions_per_trial=2,  # Run the model multiple times to reduce noise
    directory='my_dir',  # Directory to store results
    project_name='sentiment_analysis'
)

Reloading Tuner from my_dir\sentiment_analysis\tuner0.json


In [16]:
# Display summary of the search space
tuner.search_space_summary()

Search space summary
Default search space size: 7
embedding_dim (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 128, 'step': 32, 'sampling': 'linear'}
layer_type (Choice)
{'default': 'RNN', 'conditions': [], 'values': ['RNN', 'LSTM', 'GRU'], 'ordered': False}
rnn_units (Int)
{'default': None, 'conditions': [], 'min_value': 64, 'max_value': 256, 'step': 64, 'sampling': 'linear'}
dropout_rate (Float)
{'default': 0.2, 'conditions': [], 'min_value': 0.2, 'max_value': 0.5, 'step': 0.1, 'sampling': 'linear'}
optimizer (Choice)
{'default': 'adam', 'conditions': [], 'values': ['adam', 'rmsprop'], 'ordered': False}
lstm_units (Int)
{'default': None, 'conditions': [], 'min_value': 64, 'max_value': 256, 'step': 64, 'sampling': 'linear'}
gru_units (Int)
{'default': None, 'conditions': [], 'min_value': 64, 'max_value': 256, 'step': 64, 'sampling': 'linear'}


In [17]:
# Run hyperparameter search
tuner.search(X_train_pad, y_train, epochs=5, validation_split=0.2, batch_size=32)

In [18]:
# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

In [19]:
print(f"""
The optimal number of units in the RNN/LSTM/GRU layer is {best_hps.get('rnn_units') or best_hps.get('lstm_units') or best_hps.get('gru_units')}.
The best embedding dimension is {best_hps.get('embedding_dim')}.
The best optimizer is {best_hps.get('optimizer')}.
""")


The optimal number of units in the RNN/LSTM/GRU layer is 192.
The best embedding dimension is 64.
The best optimizer is rmsprop.



### Set Hyperparameters

In [20]:
# Hyperparameters
embedding_dim = 64
batch_size = 32
epochs = 20

In [21]:
# Create RNN, LSTM, and GRU models
def build_rnn_model():
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=max_len))
    model.add(SimpleRNN(128, return_sequences=False))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))  # 3 classes: positive, negative, neutral
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [22]:
def build_lstm_model():
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=max_len))
    model.add(LSTM(128, return_sequences=False))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [23]:
def build_gru_model():
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=max_len))
    model.add(GRU(128, return_sequences=False))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

### Train RNN Model

In [24]:
# Build and train RNN model
rnn_model = build_rnn_model()
rnn_model.summary()



In [25]:
print("Training RNN Model...")
rnn_model.fit(X_train_pad, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2)

Training RNN Model...
Epoch 1/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 91ms/step - accuracy: 0.4793 - loss: 1.0487 - val_accuracy: 0.5016 - val_loss: 1.0052
Epoch 2/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 90ms/step - accuracy: 0.5749 - loss: 0.9249 - val_accuracy: 0.5209 - val_loss: 0.9694
Epoch 3/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 88ms/step - accuracy: 0.6988 - loss: 0.7410 - val_accuracy: 0.5112 - val_loss: 1.1200
Epoch 4/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 88ms/step - accuracy: 0.7989 - loss: 0.4978 - val_accuracy: 0.5219 - val_loss: 1.1628
Epoch 5/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 89ms/step - accuracy: 0.8809 - loss: 0.2833 - val_accuracy: 0.5155 - val_loss: 1.2726
Epoch 6/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 87ms/step - accuracy: 0.8970 - loss: 0.2350 - val_accuracy: 0.5337 - val_loss: 1

<keras.src.callbacks.history.History at 0x17ec26ccaa0>

#### Evaluation

In [26]:
# Evaluate RNN model
rnn_pred = np.argmax(rnn_model.predict(X_test_pad), axis=1)
rnn_accuracy = accuracy_score(y_test, rnn_pred)
print(f"RNN Model Accuracy: {rnn_accuracy:.4f}")
print("RNN Classification Report:")
print(classification_report(y_test, rnn_pred, target_names=label_encoder.classes_))

[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step
RNN Model Accuracy: 0.5612
RNN Classification Report:
              precision    recall  f1-score   support

    negative       0.21      0.21      0.21       175
     neutral       0.67      0.68      0.68       622
    positive       0.55      0.52      0.54       372

    accuracy                           0.56      1169
   macro avg       0.47      0.47      0.47      1169
weighted avg       0.56      0.56      0.56      1169



### Train LSTM Model

In [27]:
# Build and train LSTM model
lstm_model = build_lstm_model()
lstm_model.summary()



In [28]:
print("Training LSTM Model...")
lstm_model.fit(X_train_pad, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2)

Training LSTM Model...
Epoch 1/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 116ms/step - accuracy: 0.5283 - loss: 1.0203 - val_accuracy: 0.5027 - val_loss: 0.9653
Epoch 2/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 111ms/step - accuracy: 0.5927 - loss: 0.8510 - val_accuracy: 0.6332 - val_loss: 0.8391
Epoch 3/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 114ms/step - accuracy: 0.7921 - loss: 0.5270 - val_accuracy: 0.6588 - val_loss: 0.9075
Epoch 4/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 114ms/step - accuracy: 0.8647 - loss: 0.3218 - val_accuracy: 0.6385 - val_loss: 1.0091
Epoch 5/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 113ms/step - accuracy: 0.8925 - loss: 0.2503 - val_accuracy: 0.6374 - val_loss: 1.1127
Epoch 6/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 115ms/step - accuracy: 0.9058 - loss: 0.1991 - val_accuracy: 0.6364 - val_

<keras.src.callbacks.history.History at 0x17ec51ea8d0>

#### Evaluation

In [29]:
# Evaluate LSTM model
lstm_pred = np.argmax(lstm_model.predict(X_test_pad), axis=1)
lstm_accuracy = accuracy_score(y_test, lstm_pred)
print(f"LSTM Model Accuracy: {lstm_accuracy:.4f}")
print("LSTM Classification Report:")
print(classification_report(y_test, lstm_pred, target_names=label_encoder.classes_))

[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step
LSTM Model Accuracy: 0.6561
LSTM Classification Report:
              precision    recall  f1-score   support

    negative       0.33      0.25      0.28       175
     neutral       0.69      0.80      0.74       622
    positive       0.73      0.61      0.66       372

    accuracy                           0.66      1169
   macro avg       0.58      0.55      0.56      1169
weighted avg       0.64      0.66      0.65      1169



### Train GRU Model

In [30]:
# Build and train GRU model
gru_model = build_gru_model()
gru_model.summary()



In [31]:
print("Training GRU Model...")
gru_model.fit(X_train_pad, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2)

Training GRU Model...
Epoch 1/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 122ms/step - accuracy: 0.5363 - loss: 1.0065 - val_accuracy: 0.5209 - val_loss: 0.9320
Epoch 2/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 116ms/step - accuracy: 0.6383 - loss: 0.7915 - val_accuracy: 0.6342 - val_loss: 0.8803
Epoch 3/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 117ms/step - accuracy: 0.7976 - loss: 0.5019 - val_accuracy: 0.6267 - val_loss: 0.9271
Epoch 4/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 119ms/step - accuracy: 0.8742 - loss: 0.2964 - val_accuracy: 0.6235 - val_loss: 0.9768
Epoch 5/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 118ms/step - accuracy: 0.8946 - loss: 0.2498 - val_accuracy: 0.5968 - val_loss: 1.1922
Epoch 6/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 142ms/step - accuracy: 0.9054 - loss: 0.2090 - val_accuracy: 0.6556 - val_l

<keras.src.callbacks.history.History at 0x17edee64770>

#### Evaluation

In [32]:
# Evaluate GRU model
gru_pred = np.argmax(gru_model.predict(X_test_pad), axis=1)
gru_accuracy = accuracy_score(y_test, gru_pred)
print(f"GRU Model Accuracy: {gru_accuracy:.4f}")
print("GRU Classification Report:")
print(classification_report(y_test, gru_pred, target_names=label_encoder.classes_))

[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step
GRU Model Accuracy: 0.6510
GRU Classification Report:
              precision    recall  f1-score   support

    negative       0.31      0.22      0.26       175
     neutral       0.72      0.73      0.72       622
    positive       0.65      0.72      0.68       372

    accuracy                           0.65      1169
   macro avg       0.56      0.56      0.56      1169
weighted avg       0.64      0.65      0.64      1169



### Comparison

In [33]:
# Comparison of Models
print(f"RNN Model Accuracy: {rnn_accuracy:.4f}")
print(f"LSTM Model Accuracy: {lstm_accuracy:.4f}")
print(f"GRU Model Accuracy: {gru_accuracy:.4f}")

RNN Model Accuracy: 0.5612
LSTM Model Accuracy: 0.6561
GRU Model Accuracy: 0.6510


### Save The Model

In [34]:
# Save the trained model using pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

lstm_model.save('trained_models/lstm_model.h5')  # Save the model in h5 format



### Load The Model

In [35]:
# Load the saved tokenizer and model
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

model = load_model('trained_models/lstm_model.h5')



### Do The Predicition

In [36]:
# Load spaCy for text preprocessing
nlp = spacy.load('en_core_web_sm')

In [37]:
def preprocess_text(text):
    doc = nlp(text.lower())
    tokens = [token.text for token in doc if token.is_alpha and not token.is_stop]
    return ' '.join(tokens)

In [38]:
# Function to predict sentiment
def predict_sentiment(user_input):
    # Preprocess the user input
    cleaned_text = preprocess_text(user_input)
    
    # Convert the input text into sequences and pad it
    input_seq = tokenizer.texts_to_sequences([cleaned_text])
    input_pad = pad_sequences(input_seq, maxlen=100)
    
    # Predict using the trained model
    prediction = model.predict(input_pad)
    
    # Convert the prediction to the class label
    predicted_class = np.argmax(prediction, axis=1)[0]
    
    # Mapping back to original sentiment
    sentiment_map = {0: 'negative', 1: 'neutral', 2: 'positive'}
    
    return sentiment_map[predicted_class]

In [39]:
# Example user input and prediction
user_input = "$ESI on lows, down $1.50 to $2.50 BK a real possibility"
predicted_sentiment = predict_sentiment(user_input)
print(f"Predicted sentiment: {predicted_sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 656ms/step
Predicted sentiment: negative
