In [25]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
# from google.colab import drive
from tensorflow import keras
from tensorflow.keras import layers
import pickle
from keras import backend as K

In [26]:
# Load the dataset
# drive.mount('/content/drive')
df =pd.read_csv("../Datasets/restructured_data.csv")

In [27]:
# Split the dataset into input and output
X = df['Data']
#X.append(data['reformulated_tweets'])
Y=df['Stance']
#Y.append(data['stance'])
Y = pd.get_dummies(Y).values
#targets = df['target'].unique()

In [28]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)

# Pad the sequences
maxlen = 500
X = pad_sequences(X, padding='post', maxlen=maxlen)

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)



In [29]:
# Input for variable-length sequences of integers
inputs = keras.Input(shape=(None,), dtype="int32")
# Embed each integer in a 128-dimensional vector
x = layers.Embedding(10000, 128)(inputs)
# Add 2 bidirectional LSTMs
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(64))(x)
# Add a classifier
outputs = layers.Dense(3, activation="softmax")(x)
model = keras.Model(inputs, outputs)
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     (None, None, 128)         1280000   
                                                                 
 bidirectional_2 (Bidirectio  (None, None, 128)        98816     
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense_1 (Dense)             (None, 3)                 387       
                                                                 
Total params: 1,478,019
Trainable params: 1,478,019
Non-tra

In [30]:

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [31]:
optimizer = Adam(learning_rate=0.003)
model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy", f1_m, precision_m, recall_m])

In [16]:
# # Define the model
# model = Sequential()
# model.add(Embedding(10000, 128, input_length=maxlen))
# model.add(Bidirectional(LSTM(64)))
# model.add(Dense(3, activation='softmax'))

# # Compile the model
# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [32]:
model.fit(X_train, Y_train, batch_size=64, epochs=30, validation_data=(X_test, Y_test))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x1becadbb160>

In [33]:
score = model.evaluate(X_test, Y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
score

Test loss: 3.012768030166626
Test accuracy: 0.5648312568664551


[3.012768030166626,
 0.5648312568664551,
 0.5578296780586243,
 0.5594227313995361,
 0.5562865734100342]

In [34]:
print("AGAINST:",Y[0])
print("FAVOR",Y[10])
print("NONE",Y[9])

AGAINST: [1 0 0]
FAVOR [0 1 0]
NONE [0 0 1]


In [35]:
# Predict the stance of new texts
new_texts = ["Women are smart", "Feminism is a myth"]
new_targets = ["Feminist Movement","Feminist Movement","Feminist Movement"]
new_texts = tokenizer.texts_to_sequences(new_texts)
new_texts = pad_sequences(new_texts, padding='post', maxlen=maxlen)
predictions = []
for i in range(len(new_texts)):
  pred = model.predict(np.array([new_texts[i]]))
  print(np.argmax(pred))


1
0


In [36]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test, batch_size=64, verbose=1)
y_pred_bool = np.argmax(y_pred, axis=1)
y_test = np.argmax(Y_test, axis=1)

print(classification_report(y_test, y_pred_bool))

              precision    recall  f1-score   support

           0       0.64      0.69      0.67       268
           1       0.45      0.42      0.44       135
           2       0.51      0.47      0.49       160

    accuracy                           0.56       563
   macro avg       0.53      0.53      0.53       563
weighted avg       0.56      0.56      0.56       563



In [37]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

print(confusion_matrix(y_true=y_test, y_pred=y_pred_bool))

[[185  42  41]
 [ 45  57  33]
 [ 57  27  76]]


In [38]:
print("Accuracy: ",accuracy_score(y_test, y_pred_bool))
print("Recall Score: ",recall_score(y_test, y_pred_bool, average='weighted'))
print("Precision Score: ",precision_score(y_test, y_pred_bool, average='weighted'))
print("F1 Score: ",f1_score(y_test, y_pred_bool, average='weighted'))

Accuracy:  0.5648312611012434
Recall Score:  0.5648312611012434
Precision Score:  0.559308540814411
F1 Score:  0.5614277030395591


In [39]:
pickle.dump(model, open("Bi-LSTM.pkl", 'wb'))

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\bidirectional
......vars
...layers\bidirectional\backward_layer
......vars
...layers\bidirectional\backward_layer\cell
......vars
.........0
.........1
.........2
...layers\bidirectional\forward_layer
......vars
...layers\bidirectional\forward_layer\cell
......vars
.........0
.........1
.........2
...layers\bidirectional\layer
......vars
...layers\bidirectional\layer\cell
......vars
...layers\bidirectional_1
......vars
...layers\bidirectional_1\backward_layer
......vars
...layers\bidirectional_1\backward_layer\cell
......vars
.........0
.........1
.........2
...layers\bidirectional_1\forward_layer
......vars
...layers\bidirectional_1\forward_layer\cell
......vars
.........0
.........1
.........2
...layers\bidirectional_1\layer
......vars
...layers\bidirectional_1\layer\cell
......vars
...layers\dense
......vars
.........0
.........1
...layers\embedding
......vars
.........0
...layers\input_layer
......vars
...m