In [49]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
# from google.colab import drive
from tensorflow import keras
from tensorflow.keras import layers
import pickle
from keras import backend as K

In [50]:
# Load the dataset
# drive.mount('/content/drive')
df =pd.read_csv("../Datasets/restructured_data.csv")

In [51]:
# Split the dataset into input and output
X = df['Data']
#X.append(data['reformulated_tweets'])
Y=df['Stance']
#Y.append(data['stance'])
Y = pd.get_dummies(Y).values
#targets = df['target'].unique()

In [52]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)

# Pad the sequences
maxlen = 100
X = pad_sequences(X, padding='post', maxlen=maxlen)

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)



In [53]:
# Input for variable-length sequences of integers
inputs = keras.Input(shape=(None,), dtype="int32")
# Embed each integer in a 128-dimensional vector
x = layers.Embedding(10000, 128)(inputs)
# Add 2 bidirectional LSTMs
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(64))(x)
# Add a classifier
outputs = layers.Dense(3, activation="softmax")(x)
model = keras.Model(inputs, outputs)
model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_3 (Embedding)     (None, None, 128)         1280000   
                                                                 
 bidirectional_6 (Bidirectio  (None, None, 128)        98816     
 nal)                                                            
                                                                 
 bidirectional_7 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense_3 (Dense)             (None, 3)                 387       
                                                                 
Total params: 1,478,019
Trainable params: 1,478,019
Non-tra

In [54]:

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [55]:
optimizer = Adam(learning_rate=0.003)
model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy", f1_m, precision_m, recall_m])

In [56]:
# # Define the model
# model = Sequential()
# model.add(Embedding(10000, 128, input_length=maxlen))
# model.add(Bidirectional(LSTM(64)))
# model.add(Dense(3, activation='softmax'))

# # Compile the model
# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [57]:
model.fit(X_train, Y_train, batch_size=64, epochs=25, validation_data=(X_test, Y_test))

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x2f71c766980>

In [58]:
score = model.evaluate(X_test, Y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
score

Test loss: 2.649840831756592
Test accuracy: 0.5328596830368042


[2.649840831756592,
 0.5328596830368042,
 0.521612286567688,
 0.5730702877044678,
 0.48008039593696594]

In [44]:
print("AGAINST:",Y[0])
print("FAVOR",Y[10])
print("NONE",Y[9])

AGAINST: [1 0 0]
FAVOR [1 0 0]
NONE [1 0 0]


In [59]:
# Predict the stance of new texts
new_texts = ["Women are smart", "Feminism is a myth"]
new_targets = ["Feminist Movement","Feminist Movement","Feminist Movement"]
new_texts = tokenizer.texts_to_sequences(new_texts)
new_texts = pad_sequences(new_texts, padding='post', maxlen=maxlen)
predictions = []
for i in range(len(new_texts)):
  pred = model.predict(np.array([new_texts[i]]))
  print(np.argmax(pred))


2
2


In [60]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test, batch_size=64, verbose=1)
y_pred_bool = np.argmax(y_pred, axis=1)
y_test = np.argmax(Y_test, axis=1)

print(classification_report(y_test, y_pred_bool))

              precision    recall  f1-score   support

           0       0.69      0.66      0.67       267
           1       0.37      0.31      0.34       160
           2       0.43      0.54      0.48       136

    accuracy                           0.53       563
   macro avg       0.50      0.51      0.50       563
weighted avg       0.54      0.53      0.53       563



In [61]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

print(confusion_matrix(y_true=y_test, y_pred=y_pred_bool))

[[176  60  31]
 [ 41  50  69]
 [ 38  24  74]]


In [62]:
print("Accuracy: ",accuracy_score(y_test, y_pred_bool))
print("Recall Score: ",recall_score(y_test, y_pred_bool, average='weighted'))
print("Precision Score: ",precision_score(y_test, y_pred_bool, average='weighted'))
print("F1 Score: ",f1_score(y_test, y_pred_bool, average='weighted'))

Accuracy:  0.5328596802841918
Recall Score:  0.5328596802841918
Precision Score:  0.5360975593930019
F1 Score:  0.5317882379698788


In [None]:
pickle.dump(model, open("Bi-LSTM.pkl", 'wb'))