In [2]:
#example based on https://keras.io/guides/working_with_rnns/
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import pandas as pd




In [4]:
# Load data
#dataset link https://www.kaggle.com/datasets/fhamborg/news-articles-sentiment
data_train = pd.read_json("train.jsonl", lines=True)
data_test = pd.read_json("test.jsonl", lines=True)
#data_train.head()
data_test.head()

x_train = data_train.sentence
y_train = data_train.polarity

x_test = data_test.sentence
y_test = data_test.polarity

In [5]:
print(len(data_train), "Training")
print(len(data_test), "Test")

# remove duplicates
#data_train = data_train.drop_duplicates(subset='sentence', keep='first')
#data_test = data_test.drop_duplicates(subset='sentence', keep='first')

8739 Training
803 Test


In [7]:
# Preparing sentence and label for training
X_train = data_train['sentence']
y_train = data_train['polarity']

max_length = 200

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train, maxlen=max_length)

y_train_categorical = to_categorical(y_train, num_classes=3)

In [9]:
v_size = len(tokenizer.word_index) + 1
print(v_size)

19585


In [11]:
# Testing data prep
X_val = data_test['sentence']
y_val = data_test['polarity']

X_val = tokenizer.texts_to_sequences(X_val)
X_val = pad_sequences(X_val, maxlen=max_length)

# Convert labels to categorical
y_val_categorical = to_categorical(y_val, num_classes=3)

In [12]:
#check dataset for all polarity options to see if there is an unbalanced level of training data

df = pd.read_json('train.jsonl', lines=True)
rows_with_precision_minus_one = df[df['polarity'] == -1]
# Get the count of rows with precision -1
count_precision_minus_one = len(rows_with_precision_minus_one)
print(f"Number of rows with polarity -1: {count_precision_minus_one}")

rows_with_precision_minus_one = df[df['polarity'] == 0]
# Get the count of rows with precision 0
count_precision_minus_one = len(rows_with_precision_minus_one)
print(f"Number of rows with polarity 0: {count_precision_minus_one}")

rows_with_precision_minus_one = df[df['polarity'] == 1]
# Get the count of rows with precision 1
count_precision_minus_one = len(rows_with_precision_minus_one)
print(f"Number of rows with polarity 1: {count_precision_minus_one}")

Number of rows with polarity -1: 3316
Number of rows with polarity 0: 3028
Number of rows with polarity 1: 2395


Slight bias will resamlpe the dataset os they have more similar spread of polarity


In [13]:
#resample data, oversampling
from sklearn.utils import resample
import pandas as pd

df = pd.read_json('train.jsonl', lines=True)

# Separate classes
df_class_minus_one = df[df['polarity'] == -1]
df_class_0 = df[df['polarity'] == 0]
df_class_1 = df[df['polarity'] == 1]

# Oversample minority classes
df_class_minus_one_oversampled = resample(df_class_minus_one, replace=True, n_samples=len(df_class_0), random_state=42)
df_class_1_oversampled = resample(df_class_1, replace=True, n_samples=len(df_class_0), random_state=42)

# Combine oversampled minority classes with majority class
df_oversampled = pd.concat([df_class_minus_one_oversampled, df_class_0, df_class_1_oversampled])

# Shuffle the classes
df_oversampled = df_oversampled.sample(frac=1, random_state=42)

# Display the counts after oversampling
print(df_oversampled['polarity'].value_counts())


polarity
 1    3028
 0    3028
-1    3028
Name: count, dtype: int64


In [14]:
# retrain a new model and tokenise and split the data
max_length = 200

x_predict = df_oversampled['sentence']
y_label = df_oversampled['polarity']

tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_predict)
x_predict = tokenizer.texts_to_sequences(x_predict)
x_predict = pad_sequences(x_predict, maxlen=max_length)

In [15]:
# splitting over sampled data into a training and testing sets
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets
x_predict, x_p, y_label, y_l = train_test_split(x_predict, y_label, test_size=0.2, random_state=42)

y_label_categorical = to_categorical(y_label, num_classes=3)
y_l_categorical = to_categorical(y_l, num_classes=3)

# Print the lengths of the training and validation sets
print(len(x_predict), "Training sequences")
print(len(x_p), "Validation sequences")

print(len(y_label), "Training sequences")
print(len(y_l), "Validation sequences")

7267 Training sequences
1817 Validation sequences
7267 Training sequences
1817 Validation sequences


In [16]:
# creating new model

embedding_dim = 50
vocab_size = 18000
max_length = 200

m1 = Sequential()
m1.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
m1.add(LSTM(units=150, dropout=0.3, recurrent_dropout=0.3))
m1.add(Dense(units=3, activation='softmax'))

m1.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
m1.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 50)           900000    
                                                                 
 lstm (LSTM)                 (None, 150)               120600    
                                                                 
 dense (Dense)               (None, 3)                 453       
                                                                 
Total params: 1021053 (3.90 MB)
Trainable params: 1021053 (3.90 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [17]:
# Train the model
m1.fit(x_predict, y_label_categorical, epochs=10, batch_size=64, validation_data=(x_p, y_l_categorical))

# Evaluate the model
loss, accuracy = m1.evaluate(x_p, y_l_categorical)
print(f'Loss: {loss}, Accuracy: {accuracy * 100:.2f}%')

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss: 0.951392650604248, Accuracy: 73.03%


In [21]:
from sklearn.metrics import precision_score, recall_score

# Predict on validation data
y_val_pred_prob_m1 = m1.predict(x_p)
y_val_pred_m1 = np.argmax(y_val_pred_prob_m1, axis=1)

y_val_true_m1 = np.argmax(y_l_categorical, axis=1)

# Calculate precision and recall for each class
precision_m1 = precision_score(y_val_true_m1, y_val_pred_m1, average='macro')
recall_m1 = recall_score(y_val_true_m1, y_val_pred_m1, average='macro')

# print the results
print(f'Precision: {precision_m1:.4f}')
print(f'Recall: {recall_m1:.4f}')



Precision: 0.7256
Recall: 0.7286
