
# Create a Bi-LSTM Model with Attention for Bias and Tone/Bias Analysis
## For Framing Analysis Project by Shreya and Jennifer

In [None]:
# Mount Google Drive so the project files under MyDrive are reachable from Colab.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

# Location of the annotated training set; change the MyDrive portion of the
# path if the project folder lives elsewhere in your Drive.
training_csv_path = '/content/drive/MyDrive/ISM 6564 - Text Analytics/Frame Analysis/Training_Data.csv'
df = pd.read_csv(training_csv_path)

# Preview the first rows to confirm the file loaded as expected.
print(df.head())


Mounted at /content/drive
                                               title  first_tone_annotation  \
0  A Set of Borders to Cross; For Children Seekin...                  17.35   
1                   LAY OFF THE GUEST WORKER WE WANT                  17.35   
2  Would ban on renting to illegals make sense here?                  19.35   
3     Immigrants Pull Weight In Economy, Study Finds                  17.35   
4                              The Citizenship Surge                  17.35   

   first_framing_annotation  
0                      10.2  
1                       1.2  
2                       5.2  
3                       1.2  
4                       2.2  


In [None]:
# Store the tone codes as strings (they are used as class labels further down),
# but keep genuinely missing annotations as NaN. A bare .astype(str) would turn
# NaN into the literal string 'nan', which hides missing values from
# isnull()/dropna().
tone_raw = df['first_tone_annotation']
df['first_tone_annotation'] = tone_raw.astype(str).where(tone_raw.notna())


In [None]:
# Number of missing entries in each column of the training set
df.isna().sum()

Unnamed: 0,0
title,79
first_tone_annotation,0
first_framing_annotation,552


In [None]:
# Keep only the rows that have a title; the title is the text the model reads.

has_title = df['title'].notna()
df = df[has_title]

In [None]:
# Tidy dataset for tone

# Keep only rows that carry a tone annotation. The column was cast to str
# earlier, so a missing value may show up either as a real NaN or as the
# literal string 'nan'; filter out both.
df_no_missing_tone = df.dropna(subset=['first_tone_annotation'])
df_no_missing_tone = df_no_missing_tone[df_no_missing_tone['first_tone_annotation'] != 'nan']

# Keep just the text and its tone code. The explicit .copy() makes this an
# independent DataFrame, so adding columns to it later neither raises
# SettingWithCopyWarning nor risks writing into a temporary view.
df_title_tone = df_no_missing_tone[['title', 'first_tone_annotation']].copy()

In [None]:
# Distinct tone codes that remain after cleaning
tone_codes = df_title_tone['first_tone_annotation']
tone_codes.unique()

array(['17.35', '19.35', '18.3', '17.4', '19.4'], dtype=object)

In [None]:
# Assign each distinct tone code an integer class id, in order of first appearance
possible_labels = df_title_tone['first_tone_annotation'].unique()

label_dict = {tone_code: class_id for class_id, tone_code in enumerate(possible_labels)}
label_dict

{'17.35': 0, '19.35': 1, '18.3': 2, '17.4': 3, '19.4': 4}

In [None]:
# Add a numeric 'label' column holding the integer class id of each tone code.
# - .map() is a plain dictionary lookup; .replace() on an object column relies
#   on implicit downcasting that recent pandas versions deprecate.
# - .astype('int64') fails loudly if any tone code has no entry in label_dict
#   (an unmapped value would become NaN) instead of passing it on to training.
# - .assign() returns a new DataFrame, so nothing is written into a possible
#   slice of another frame (no SettingWithCopyWarning).
df_title_tone = df_title_tone.assign(
    label=df_title_tone['first_tone_annotation'].map(label_dict).astype('int64')
)

  df_title_tone['label'] = df_title_tone['first_tone_annotation'].replace(label_dict)


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

# Model inputs: the headline text and the integer class id of each headline
text_data = df_title_tone['title'].tolist()
numeric_labels = np.array(df_title_tone['label'])

# Hold out 20% of the examples for testing (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    text_data,
    numeric_labels,
    test_size=0.2,
    random_state=42,
)


# Build the vocabulary from the training titles only, then encode both splits
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
max_length = 10  # every sequence is padded/truncated to this many tokens
vocab_size = len(tokenizer.word_index) + 1  # one extra row for index 0

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad (or truncate) at the end so all sequences share the same length
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post')


# Width of each word vector; must match the GloVe file loaded below
embedding_dim = 100


# Read the pre-trained GloVe vectors into a {word: vector} lookup.
# Each line of the file is a word followed by its coefficients.
glove_file_path = '/content/drive/MyDrive/ISM 6564 - Text Analytics/Frame Analysis/glove.6B.100d.txt'  # Update this path with your local GloVe file path
embedding_index = {}
with open(glove_file_path, 'r', encoding='utf-8') as glove_file:
    for glove_line in glove_file:
        fields = glove_line.split()
        embedding_index[fields[0]] = np.array(fields[1:], dtype='float32')

# Fill one row per vocabulary word; words without a GloVe vector stay all-zero
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for token, row in tokenizer.word_index.items():
    vector = embedding_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector


In [None]:
from tensorflow.keras.layers import Bidirectional, Dense, Input, LSTM, Embedding, Multiply, Permute, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K

# Define an attention layer
def attention_layer(inputs):
    """Collapse a sequence of hidden states into one attention-weighted vector.

    A single tanh unit scores every timestep, the scores are normalised with a
    softmax across timesteps, and the hidden states are summed using those
    weights.

    Args:
        inputs: 3-D Keras tensor of shape (batch, timesteps, features) with a
            statically known feature size, e.g. the output of a recurrent
            layer built with ``return_sequences=True``.

    Returns:
        Keras tensor of shape (batch, features).

    Raises:
        ValueError: if the feature dimension of ``inputs`` is not known.
    """
    # Read the feature width from the incoming tensor instead of hard-coding
    # it, so the layer keeps working when the LSTM size changes.
    feature_dim = inputs.shape[-1]
    if feature_dim is None:
        raise ValueError("attention_layer needs inputs with a known feature dimension")
    feature_dim = int(feature_dim)

    # Compute the attention scores: (batch, timesteps, 1) -> (batch, timesteps)
    attention = Dense(1, activation='tanh')(inputs)
    attention = tf.keras.layers.Flatten()(attention)
    attention = tf.keras.layers.Activation('softmax')(attention)
    # Broadcast each timestep's weight across all features:
    # (batch, timesteps) -> (batch, features, timesteps) -> (batch, timesteps, features)
    attention = tf.keras.layers.RepeatVector(feature_dim)(attention)
    attention = Permute([2, 1])(attention)

    # Apply attention weights and sum over the timestep axis
    output_attention = Multiply()([inputs, attention])
    return Lambda(lambda x: K.sum(x, axis=1))(output_attention)

# Define model with Bidirectional LSTM and attention

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable from run to run.
tf.keras.utils.set_random_seed(42)

input_layer = Input(shape=(max_length,))
# Frozen embedding initialised from the pre-trained matrix. `input_length` is
# not passed: the sequence length is already fixed by the Input layer, and
# Keras 3 deprecates the argument.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    trainable=False,
)(input_layer)
bi_lstm_layer = Bidirectional(LSTM(50, return_sequences=True))(embedding_layer)

# Apply the attention layer
attention_output = attention_layer(bi_lstm_layer)

# Final dense layer
# NOTE(review): a single linear unit trained with MSE treats the integer class
# ids as a continuous target, although the ids are only an enumeration of the
# tone codes. A softmax head with sparse categorical cross-entropy would match
# the task better; switching also requires the prediction-decoding step to
# take an argmax over the class scores.
output_layer = Dense(1)(attention_output)

# Build and compile the model
model_bi_lstm_attention = Model(inputs=input_layer, outputs=output_layer)
model_bi_lstm_attention.compile(optimizer='adam', loss='mean_squared_error')
model_bi_lstm_attention.summary()

# Train and evaluate
# NOTE(review): the test split doubles as validation data here, so val_loss
# and the final evaluation are measured on the same examples.
model_bi_lstm_attention.fit(
    X_train_pad,
    y_train,
    epochs=10,
    batch_size=32,
    verbose=2,
    validation_data=(X_test_pad, y_test),
)
bi_lstm_attention_eval = model_bi_lstm_attention.evaluate(X_test_pad, y_test)


Epoch 1/10
110/110 - 5s - 47ms/step - loss: 1.4839 - val_loss: 1.2702
Epoch 2/10
110/110 - 1s - 6ms/step - loss: 1.2553 - val_loss: 1.2598
Epoch 3/10
110/110 - 1s - 6ms/step - loss: 1.1885 - val_loss: 1.2205
Epoch 4/10
110/110 - 1s - 6ms/step - loss: 1.1390 - val_loss: 1.2228
Epoch 5/10
110/110 - 1s - 6ms/step - loss: 1.0966 - val_loss: 1.2300
Epoch 6/10
110/110 - 1s - 6ms/step - loss: 1.0540 - val_loss: 1.2189
Epoch 7/10
110/110 - 1s - 6ms/step - loss: 0.9950 - val_loss: 1.2448
Epoch 8/10
110/110 - 1s - 6ms/step - loss: 0.9277 - val_loss: 1.2666
Epoch 9/10
110/110 - 1s - 6ms/step - loss: 0.8621 - val_loss: 1.2771
Epoch 10/10
110/110 - 1s - 6ms/step - loss: 0.7925 - val_loss: 1.3281
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.2666


In [None]:
# Compute accuracy and micro-averaged accuracy for the model on the test set

from sklearn.metrics import accuracy_score, confusion_matrix

# Predict on the test set
y_pred = model_bi_lstm_attention.predict(X_test_pad)

# Convert raw model outputs to integer class ids.
# The targets are class ids 0..num_classes-1, so thresholding at 0.5 (a binary
# rule) could only ever produce labels 0 and 1.
num_classes = len(label_dict)
if y_pred.ndim == 2 and y_pred.shape[1] > 1:
    # One score per class: pick the highest-scoring class.
    y_pred_labels = np.argmax(y_pred, axis=1)
else:
    # One continuous value per example: snap it to the nearest valid class id.
    y_pred_labels = np.clip(np.rint(y_pred.ravel()), 0, num_classes - 1).astype(int)


# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_labels)
print(f"Accuracy: {accuracy}")

# Calculate micro-averaged accuracy: correct predictions (confusion-matrix
# diagonal) divided by all predictions. For single-label classification this
# equals the overall accuracy above; it is printed as a cross-check.
# `labels` is fixed so the matrix always covers every class, including ones
# that are absent from the test split or never predicted.
cm = confusion_matrix(y_test, y_pred_labels, labels=np.arange(num_classes))
micro_accuracy = np.trace(cm) / np.sum(cm)
print(f"Micro-Averaged Accuracy: {micro_accuracy}")

[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step
Accuracy: 0.36
Micro-Averaged Accuracy: 0.36
