<h1>Import Necessary Libraries</h1>

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Input
from keras.api.optimizers import Adam
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import train_test_split 
import matplotlib.pyplot as plt

<h1>Preprocess (Cleanse Data)</h1>

In [None]:
# Load the dataset, reading only the label column and the text column.
df = pd.read_csv(
    filepath_or_buffer="Dataset/Sentiment_Stock_data.csv",
    usecols=["Sentiment", "Sentence"],
)


Deal With Null Data

In [None]:
# Discard every row that has a missing value in any loaded column.
df = df.dropna(axis=0, how="any")

# Sanity check: number of nulls left in each column.
print(df.isna().sum())

Deal with Incorrectly Formatted Data

In [None]:
# Pattern for one or more whitespace-separated alphanumeric runs. Used with
# `str.contains`, it matches as soon as a sentence holds a single ASCII letter
# or digit, so the rows selected below are the ones with NO alphanumeric
# content at all (e.g. only punctuation or symbols).
special_char = r'[a-zA-Z0-9]+(?:\s+[a-zA-Z0-9]+)*'

# na=False: a non-string cell makes `.str.contains` return NaN, which cannot be
# negated or used as a boolean mask; such cells count as "no alphanumerics".
has_alnum = df["Sentence"].str.contains(special_char, regex=True, na=False)
special_char_row = df.index[~has_alnum]
# print(len(special_char_row))

# Keep only rows with alphanumeric content. Plain boolean selection is used
# instead of an in-place drop so the frame is never mutated behind a reader.
df = df.loc[has_alnum]

# Literal substring that this notebook treats as a sign of an encoding error.
error_encode = "+ñ"

# Rows whose sentence contains that substring (plain text match, not a regex).
has_error_encode = df["Sentence"].str.contains(error_encode, regex=False, na=False)
error_encode_row = df.index[has_error_encode]
# print(len(error_encode_row))

# Drop the rows flagged as badly encoded.
df = df.loc[~has_error_encode]

<h1>Text Tokenization & Padding</h1>

In [None]:
# Hyperparameters shared by the tokenizer, the embedding matrix and the model.
vocab = 30_000        # vocabulary cap (tokenizer `num_words`, embedding rows)
embed_dim = 100       # width of each embedding vector
input_length = 20     # tokens per padded sentence
optimizer = Adam(learning_rate=0.005)

In [None]:
# Fit the tokenizer on the cleaned sentences; `num_words=vocab` caps which word
# indices are emitted when texts are converted to sequences.
tokenizer = Tokenizer(num_words=vocab)
tokenizer.fit_on_texts(df["Sentence"])

# Convert sentences to sequences of token ids, e.g. [340, 2, 14467, 1, 72, 15, 48, ...]
tokenized = tokenizer.texts_to_sequences(df["Sentence"])

# Pad/truncate every sequence to `input_length` tokens. The sequences computed
# above are reused instead of tokenizing the whole corpus a second time.
X = pad_sequences(tokenized, maxlen=input_length)

# Labels as an int32 NumPy array for training
y = np.array(df["Sentiment"], dtype=np.int32)

In [None]:
# Parse the pretrained GloVe text file into a {word: vector} lookup.
# Each line is split on whitespace: first field is the word, the rest are floats.
glove_path = "glove.6B/glove.6B.100d.txt"
embedding_index = {}

with open(glove_path, "r", encoding="utf-8") as glove_file:
    for row in glove_file:
        fields = row.split()
        embedding_index[fields[0]] = np.asarray(fields[1:], dtype="float32")

# Build the embedding matrix: row i receives the GloVe vector of the word whose
# tokenizer index is i. Rows with no matching GloVe entry remain all-zero.
embedding_matrix = np.zeros((vocab, embed_dim))
word_index = tokenizer.word_index

for term, row_idx in word_index.items():
    # Indices at or beyond the vocabulary cap have no row in the matrix.
    if row_idx >= vocab:
        continue
    glove_vector = embedding_index.get(term)
    if glove_vector is not None:
        embedding_matrix[row_idx] = glove_vector

In [None]:
# NOTE(review): this cell contains only disabled debugging code. It references
# X_train / Y_train, which do not exist at this point in the notebook (the split
# cell further down creates X_train and y_train), so it cannot simply be
# uncommented here.
# #************** For view indexing *******
# word_index = tokenizer.word_index  # vocabulary
# index_word = {v: k for k, v in word_index.items()}  # token to word mapping

# print(df[df["Sentence"].str.contains("saudi arabia about", case=False)])
# print(X_train[0]) # Numpy array
# print(Y_train[0]) # Pandas Series
# # Function to decode tokenized sequences back to text
# def decode_sequence(sequence: list):
#     for pharse in sequence:
#         print(pharse)
#         print(" ".join(index_word.get(token, "<UNK>") for token in pharse), "\n")

# # Decode and compare tokenized sequences
# decode_sequence(tokenized)

In [None]:
# Report container type, dtype and shape of the feature matrix and the labels.
for arr in (X, y):
    print(type(arr), arr.dtype, arr.shape)


<h1> Split Data </h1>

In [None]:
# Hold out 20% of the samples as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Report type, dtype and shape of each split: train features/labels, then test.
for part in (X_train, y_train, X_test, y_test):
    print(type(part), part.dtype, part.shape)

<h1> Build Model </h1>

In [None]:
# Binary sentiment classifier: frozen pretrained embeddings -> LSTM -> sigmoid unit.
model = Sequential()

# Declare the input shape explicitly so the model is built (and `summary()` can
# show shapes and parameter counts) before training. This replaces the
# `input_length` argument of `Embedding`, which Keras 3 deprecates and ignores.
model.add(Input(shape=(input_length,), dtype="int32"))

# Embedding rows are initialised from `embedding_matrix` and kept frozen.
model.add(Embedding(vocab, embed_dim, weights=[embedding_matrix], trainable=False))
model.add(LSTM(256, dropout=0.3, recurrent_dropout=0.3))  # try different values
model.add(Dense(1, activation='sigmoid'))  # sigmoid since the output is binary

model.compile(
    loss='binary_crossentropy',
    # Use the optimizer configured in the hyperparameter cell; passing the
    # string 'adam' silently fell back to the default learning rate. A fresh
    # copy is built from its config so that re-running this cell never reuses
    # an optimizer instance already bound to a previous model's variables.
    optimizer=type(optimizer).from_config(optimizer.get_config()),
    metrics=['accuracy'],
)

# `summary()` prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

Training Model

In [None]:
# Fit for 50 epochs with batches of 64. The held-out split is passed as
# validation data, so a validation loss is recorded after every epoch.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=64,
    epochs=50,
    verbose=1,
)

# Training vs. validation loss per epoch.
loss_curves = history.history
fig, ax = plt.subplots()
ax.plot(loss_curves['loss'], label='Train Loss')
ax.plot(loss_curves['val_loss'], label='Validation Loss')
ax.legend()
plt.show()

Evaluate Model

In [None]:
# Final loss and accuracy on the held-out split.
loss, acc = model.evaluate(x=X_test, y=y_test)
for report_line in (f"Test loss: {loss}", f"Test accuracy: {acc}"):
    print(report_line)

<h1> Plot Weights </h1>

In [None]:
# Take the weight matrix of the first layer (the embedding) and flatten it
# into a single 1-D array of values.
embedding_weights = model.layers[0].get_weights()[0]
embedding_weights_flat = embedding_weights.flatten()

# Plot every weight against its position in the flattened array. The x values
# are left implicit (matplotlib uses 0..N-1 when only y is given), which avoids
# rebinding `embedding_index` -- that name holds the GloVe {word: vector}
# dictionary built earlier in the notebook and must not be clobbered here.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(embedding_weights_flat, marker='o', markersize=2, color='green')
ax.set_title('Embedding Weights')
ax.set_xlabel('Weight Index')
ax.set_ylabel('Embedding Weight Value')
plt.show()


In [None]:
# Locate the LSTM layer by type rather than by position. `model.layers[2]` is
# the final Dense layer, whose two weight arrays (kernel, bias) cannot be
# unpacked into the three names below and raised a ValueError.
lstm_layer = next(layer for layer in model.layers if isinstance(layer, LSTM))
kernel, recurrent_kernel, bias = lstm_layer.get_weights()

# Flatten both weight matrices so each can be drawn as a single series.
kernel_flat = kernel.flatten()
recurrent_kernel_flat = recurrent_kernel.flatten()

# Positions of the weights inside the flattened arrays (x axes).
kernel_index = np.arange(len(kernel_flat))
recurrent_kernel_index = np.arange(len(recurrent_kernel_flat))

# Two panels side by side: input-to-hidden on the left, hidden-to-hidden on the right.
fig, (kernel_ax, recurrent_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Kernel weights (input-to-hidden)
kernel_ax.plot(kernel_index, kernel_flat, marker='o', linestyle='-', markersize=2, color='blue')
kernel_ax.set_title('Input-to-Hidden Weights (Kernel)')
kernel_ax.set_xlabel('Weight Index')
kernel_ax.set_ylabel('Weight Value')

# Recurrent kernel weights (hidden-to-hidden)
recurrent_ax.plot(recurrent_kernel_index, recurrent_kernel_flat, marker='o', linestyle='-', markersize=2, color='red')
recurrent_ax.set_title('Hidden-to-Hidden Weights (Recurrent Kernel)')
recurrent_ax.set_xlabel('Weight Index')
recurrent_ax.set_ylabel('Weight Value')

fig.tight_layout()
plt.show()
