In [None]:
!pip install lime

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from sklearn.metrics import accuracy_score, f1_score
import lime
from lime.lime_text import LimeTextExplainer
from operator import itemgetter
from tqdm import tqdm
import csv

In [None]:
# Load data
def load_data(data_file):
    """Read the tweet CSV and return parallel arrays of texts and binary party labels.

    Args:
        data_file: Path (or anything ``pd.read_csv`` accepts) of a CSV file
            that has ``Tweet`` and ``Party`` columns.

    Returns:
        Tuple ``(tweets, labels)`` of NumPy arrays. Missing cells are replaced
        by empty strings before extraction. A label is 0 when ``Party`` equals
        "Democrat" and 1 for every other value (including a blank party).
    """
    frame = pd.read_csv(data_file).fillna("")
    texts = frame['Tweet'].tolist()
    # Anything that is not exactly "Democrat" falls into class 1.
    party_ids = [int(party != "Democrat") for party in frame['Party'].tolist()]
    return np.array(texts), np.array(party_ids)

# CSV with the raw tweets; the path sits under /content/drive, so the Drive
# presumably has to be mounted before this cell runs.
data_path = '/content/drive/MyDrive/Datasets/Tweets/ExtractedTweets.csv'
tweets, labels = load_data(data_path)

In [None]:
# Tokenize and pad sequences
max_words = 50000
max_len = 128
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(tweets)
sequences = tokenizer.texts_to_sequences(tweets)
tweets = pad_sequences(sequences, maxlen=max_len)

In [None]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(tweets, labels, test_size=0.2, random_state=42)

In [None]:
# Define the RNN model
# embedding_dim is used as the Embedding layer's output_dim, i.e. the length of
# the vector learned for each vocabulary index.
embedding_dim = 128

In [None]:
# Binary party classifier, assembled layer by layer:
# embedding -> spatial dropout -> LSTM -> dense head -> single sigmoid unit.
rnn_model = Sequential()
rnn_model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len))
rnn_model.add(SpatialDropout1D(0.2))
rnn_model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
rnn_model.add(Dense(64, activation='relu'))
rnn_model.add(Dropout(0.5))
# One sigmoid output: the score that predict_proba reports as class 1.
rnn_model.add(Dense(1, activation='sigmoid'))

In [None]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked per epoch.
rnn_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train the RNN model
# validation_split=0.2 holds out a fifth of the training rows for validation;
# the returned History object is kept in `history`.
history = rnn_model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=3,
    validation_split=0.2,
)

In [None]:
# Predict on the test set
test_scores = rnn_model.predict(X_test)
# Scores above 0.5 count as class 1, everything else as class 0.
y_pred = (test_scores > 0.5).astype("int32")

# Calculate accuracy and F1 score
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')
print(f'F1 Score: {f1:.4f}')

In [None]:
# Define a prediction function for LIME
def predict_proba(texts):
    """Return two-class probabilities for raw text, in the layout LIME expects.

    Args:
        texts: Sequence of raw text strings. They are tokenized with the
            notebook's fitted ``tokenizer`` and padded to ``max_len``.

    Returns:
        Array of shape ``(len(texts), 2)``. Column 1 holds the model's sigmoid
        output ``p`` and column 0 holds ``1 - p``, so every row sums to 1.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, maxlen=max_len)
    # verbose=0: this function is called for every explained tweet, and the
    # default Keras progress output would flood the notebook.
    scores = np.asarray(rnn_model.predict(padded_sequences, verbose=0))
    class_one = scores[:, 0]
    # Build both columns in one vectorised step instead of a per-row Python loop.
    return np.column_stack((1 - class_one, class_one))

In [None]:
# Create a LIME explainer
explainer = LimeTextExplainer(class_names=['Democrat', 'Republican'])

In [None]:
# Function to explain predictions
def explain_prediction(tweet):
    """Run the notebook's shared LIME explainer on one tweet.

    Args:
        tweet: Text to explain.

    Returns:
        Whatever ``explainer.explain_instance`` returns when called with
        ``predict_proba`` as the classifier function and ``num_features=10``.
    """
    return explainer.explain_instance(tweet, predict_proba, num_features=10)

In [None]:
def sort_tuples_array_by_second_item(tuples):
    """Return a new list of the given tuples in ascending order of their item at index 1.

    The input is not modified; entries with equal keys keep their relative order.
    """
    return sorted(tuples, key=lambda entry: entry[1])

In [None]:
# Extract strong words using LIME
# words:       (word, label) -> {'lime_score': [score, ...], 'position': ...},
#              filled in by get_max_explained_words.
# wordsForCSV: rows of [word, label, lime_score], one per (word, label) pair,
#              added the first time that pair is recorded.
words, wordsForCSV = {}, []

In [None]:
def _record_strong_word(word, label, score, position):
    """Store one LIME score under (word, label); a first sighting also adds a CSV row."""
    key = (word, label)
    if key in words:
        words[key]['lime_score'].append(score)
    else:
        words[key] = {'lime_score': [score], 'position': position}
        wordsForCSV.append([word, label, score])


def get_max_explained_words(txt):
    """Record the two words that push hardest towards the class predicted for ``txt``.

    The text is classified with ``predict_proba`` (class-1 probability below
    0.5 means Democrat, label 0; otherwise Republican, label 1) and explained
    with ``explain_prediction``. Among the explained words, those with negative
    weights count towards Democrat and those with positive weights towards
    Republican. Up to two words with the most extreme weights on the predicted
    side are stored in the module-level ``words`` / ``wordsForCSV`` collections.

    Args:
        txt: Raw tweet text.

    Returns:
        Tuple ``(words, wordsForCSV)`` — the same module-level objects that
        were updated in place.
    """
    # Blank tweets (load_data fills missing text with "") contain no words to
    # attribute, so skip them instead of handing the explainer an empty document.
    if isinstance(txt, str) and not txt.strip():
        return words, wordsForCSV

    prediction = 'Democrat' if predict_proba([txt])[0][1] < 0.5 else 'Republican'
    exp = explain_prediction(txt)

    # Pair the (feature_id, weight) entries with the (word, weight) entries
    # position by position, giving (word, weight, feature_id) triples.
    exp_list = [
        (named[0], named[1], indexed[0])
        for indexed, named in zip(exp.local_exp[1], exp.as_list())
    ]

    if prediction == "Democrat":
        # Ascending sort: the most negative weights come first.
        ranked = sort_tuples_array_by_second_item([x for x in exp_list if x[1] < 0])
        label, strongest = 0, ranked[:2]
    else:
        # Ascending sort: the largest positive weights come last.
        ranked = sort_tuples_array_by_second_item([x for x in exp_list if x[1] > 0])
        label, strongest = 1, ranked[-2:]

    for word, score, position in strongest:
        _record_strong_word(word, label, score, position)

    return words, wordsForCSV


In [None]:
# Explain every tweet and accumulate its strongest words in `words` / `wordsForCSV`.
# NOTE(review): get_max_explained_words passes each item to the tokenizer and to
# LIME as text, so `tweets` must hold the raw tweet strings at this point, not
# padded integer sequences — confirm against the preprocessing cells.
for tweet in tqdm(tweets, total=len(tweets)):
    words, wordsForCSV = get_max_explained_words(tweet)

In [None]:
# Save extracted words to CSV
header = ["word", "label", "lime_score"]
output_csv = '/content/drive/MyDrive/Datasets/Tweets/extracted_strong_words_rnn.csv'
# newline='' leaves line endings to the csv writer.
with open(output_csv, 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    # Header row first, followed by every collected [word, label, lime_score] row.
    writer.writerows([header, *wordsForCSV])