# Sentiment Analysis

This is a second prediction exercise with an RNN, this time using data in text format. We will use the "Sentiment Analysis for Financial News" dataset (https://www.kaggle.com/datasets/ankurzing/sentiment-analysis-for-financial-news), which contains financial news headlines, each labelled with a category indicating whether the text expresses a positive, negative or neutral sentiment. The sentiment categories are:
- Positive
- Negative
- Neutral


In [2]:
import re
import numpy as np
import pandas as pd
import tensorflow as tf
import gensim.downloader as api
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, precision_score,recall_score, log_loss
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder

In [None]:
#============================================================================
# 1. Load Data
#============================================================================
# Set TensorFlow's global random seed
tf.random.set_seed(42)

# Load files
# NOTE(review): the introduction describes the Kaggle financial-news sentiment
# dataset, but this path points at a BBC text/category CSV - confirm which
# dataset is intended.
path_files = "datasets/bbc-fulltext-and-category"
df_raw = pd.read_csv(path_files+'/bbc-text.csv')

# Shuffle input. The TensorFlow seed does not influence pandas, so the shuffle
# needs its own `random_state` to give the same row order on every run.
df_raw = df_raw.sample(frac=1, random_state=42)

In [None]:
# Load the pre-trained GloVe word embeddings through gensim's downloader
word_vectors = api.load("glove-wiki-gigaword-100")
# List of every word that has an embedding. gensim >= 4 exposes the words as
# `key_to_index` (the old `vocab` attribute raises AttributeError there);
# earlier releases only provide `vocab`.
try:
    vocabulary = list(word_vectors.key_to_index)
except AttributeError:
    vocabulary = list(word_vectors.vocab)
# Set lemmatizer
lemmatizer = WordNetLemmatizer()
# Check embeddings of one word
vector = word_vectors['computer']
print(vector)
# Label encoding: replace the category names with integer codes
lb = LabelEncoder()
df_raw['category'] = lb.fit_transform(df_raw['category'])
# Features (single-column dataframe with the raw text) and target
X = pd.DataFrame(df_raw['text'])
y = df_raw['category']

In [None]:
#============================================================================
# 0. General functions
#============================================================================
def word_vector(df_input, lemmatizer, word_vectors, vocabulary, col_sentences):
    """
    Preprocess the input texts and return, for each record, the list of
    embedding vectors of its words.

    Every text is lower-cased and tokenized, English stop words and purely
    numeric tokens are removed, the remaining tokens are lemmatized, and
    only lemmas present in ``vocabulary`` are kept and replaced by their
    embedding.

    Parameters
    ----------
    df_input : dataframe
        Input dataframe with all texts.
    lemmatizer : object
        Lemmatizer exposing a ``lemmatize(word)`` method (e.g. NLTK's
        WordNetLemmatizer).
    word_vectors : object
        Mapping from a word to its embedding vector (``word_vectors[word]``).
    vocabulary : list
        Words for which ``word_vectors`` has an embedding. Any iterable of
        words is accepted.
    col_sentences : str
        Column of the dataframe where the texts are.

    Returns
    -------
    X : list
        List of lists: X[i] holds the embedding vectors of the words kept
        from the i-th text, in order. For example, X[0][2] is the embedding
        vector of the third kept word of the first text. A text with no
        kept words yields an empty list.
    """
    # Build the lookup structures once. The stop-word list and the vocabulary
    # are consulted for every token, so they are turned into sets (constant
    # time membership) instead of being re-read / scanned on each test.
    stop_words = frozenset(stopwords.words('english'))
    known_words = set(vocabulary)
    token_pattern = re.compile(r'\w+', flags=re.UNICODE)

    X = []

    for text in df_input[col_sentences]:
        # Tokenize the lower-cased text into runs of word characters
        tokens = token_pattern.findall(text.lower())

        # Drop stop words and purely numeric tokens
        tokens = [
            token for token in tokens
            if token not in stop_words and not token.isdigit()
        ]

        # Lemmatize, then keep only the lemmas that have an embedding
        lemmas = (lemmatizer.lemmatize(token) for token in tokens)
        words_embeddings = [
            word_vectors[lemma] for lemma in lemmas if lemma in known_words
        ]

        # Save the embeddings of this text
        X.append(words_embeddings)

    return X


In [None]:
#===========================================================================
# 2. Preprocess
#===========================================================================
# Replace the raw-text dataframe X with its embedded form: one list of
# word-embedding vectors per text.
X = word_vector(
    df_input=X,
    lemmatizer=lemmatizer,
    word_vectors=word_vectors,
    vocabulary=vocabulary,
    col_sentences="text",
)

In [None]:
# One-hot encode output
y = to_categorical(y)

# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

# Obtain tensor: [N_SENTENCES x SEQ_LENGTH x EMBEDDING_FEATURES]
# SEQ_LENGTH is the 99th percentile of the number of kept words per text, so
# only the longest ~1% of texts are truncated (use 100 to never truncate).
sentence_lengths = [len(x) for x in X]
try:
    # NumPy >= 1.22 names the argument `method`
    length_percentile = np.percentile(sentence_lengths, 99, method='midpoint')
except TypeError:
    # Older NumPy only knows the (now deprecated) `interpolation` keyword
    length_percentile = np.percentile(
        sentence_lengths, 99, interpolation='midpoint')
# Built-in int: the `np.int` alias was removed from NumPy
SEQ_LENGTH = int(np.round(length_percentile))

# pad_sequences defaults to dtype='int32', which would truncate the float
# embedding values to integers; request float32 to keep them intact.
data_train = pad_sequences(X_train, maxlen=SEQ_LENGTH, dtype="float32",
                           padding="post", truncating="post")
data_test = pad_sequences(X_test, maxlen=SEQ_LENGTH, dtype="float32",
                          padding="post", truncating="post")

In [None]:
def create_RNN(x_train, K, n_lstm=8, loss='categorical_crossentropy',
               optimizer='adam'):
    """
    Build and compile the recurrent network: one LSTM layer followed by a
    softmax output layer. The model is returned compiled but not fitted.

    Parameters
    ----------
    x_train : array
        Input feature tensor. Only its last two dimensions are used, to
        declare the input shape of the LSTM layer.
    K : int
        Number of output classes (units of the softmax layer).
    n_lstm : int, optional
        Number of units of the LSTM layer. The default is 8.
    loss : string, optional
        Loss function. The default is 'categorical_crossentropy'.
    optimizer : string, optional
        Optimizer. The default is 'adam'.

    Returns
    -------
    model : object
        Compiled Keras Sequential model.
    """
    # Shape of a single sample: the last two axes of the training tensor
    sample_shape = x_train.shape[-2:]

    # Layer stack. Earlier experiments placed a Dropout(0.5) and a
    # Dense(100, activation='relu') between these two layers; both are
    # currently disabled.
    layers = [
        LSTM(n_lstm, input_shape=sample_shape),
        Dense(K, activation='softmax'),
    ]
    model = tf.keras.Sequential(layers)

    # Compile model
    model.compile(optimizer=optimizer, loss=loss)

    return model
# ===========================================================================
# 3. Train model
# ===========================================================================
# Params
K = y_train.shape[1]  # N classes (width of the one-hot encoded target)
batch_size = 500
epochs = 100
# Create RNN
model = create_RNN(x_train=data_train, K=K, n_lstm=200,
                   loss='categorical_crossentropy', optimizer='adam')
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
model.summary()
# Fit model
model.fit(data_train, y_train, epochs=epochs, batch_size=batch_size)
# Save model
model.save('model_nlp_reviews2.h5')

Finally, we evaluate the model on the test set.

In [None]:
# ===========================================================================
# 4. Evaluate
# ===========================================================================
# Obtain predictions and keep, for each sample, the index of the
# highest-scoring class
y_pred = np.argmax(model.predict(data_test), axis=1).tolist()

# Obtain original values (not one-hot encoded). Once converted, y_test is a
# plain list, so the check makes this cell safe to run more than once.
if not isinstance(y_test, list):
    y_test = np.argmax(y_test, axis=1).tolist()

# Evaluate results (macro average: unweighted mean over the classes)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: ", cm)
print("Precision: ", precision_score(y_test, y_pred, average='macro'))
print("Recall: ", recall_score(y_test, y_pred, average='macro'))
print("f1_score: ", f1_score(y_test, y_pred, average='macro'))
