In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, concatenate, Dropout, Input
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
from keras.regularizers import l1_l2
from keras.models import Sequential
import tensorflow.compat.v1 as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from textblob import TextBlob
from afinn import Afinn
from pattern.en import sentiment
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
nltk.download('punkt')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\maddy\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maddy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
df=pd.read_csv('truthseeker_emotions.csv')
df.head(5)

Unnamed: 0.1,Unnamed: 0,author,statement,target,BinaryNumTarget,manual_keywords,tweet,5_label_majority_answer,3_label_majority_answer,anger,disgust,fear,joy,neutral,sadness,surprise
0,0,D.L. Davis,End of eviction moratorium means millions of A...,True,1.0,"Americans, eviction moratorium",@POTUS Biden Blunders - 6 Month Update\n\nInfl...,Mostly Agree,Agree,0,0,0,0,0,1,0
1,1,D.L. Davis,End of eviction moratorium means millions of A...,True,1.0,"Americans, eviction moratorium",@S0SickRick @Stairmaster_ @6d6f636869 Not as m...,NO MAJORITY,Agree,0,0,0,0,0,1,0
2,2,D.L. Davis,End of eviction moratorium means millions of A...,True,1.0,"Americans, eviction moratorium",THE SUPREME COURT is siding with super rich pr...,Agree,Agree,0,0,0,0,0,1,0
3,3,D.L. Davis,End of eviction moratorium means millions of A...,True,1.0,"Americans, eviction moratorium",@POTUS Biden Blunders\n\nBroken campaign promi...,Mostly Agree,Agree,0,0,0,0,0,1,0
4,4,D.L. Davis,End of eviction moratorium means millions of A...,True,1.0,"Americans, eviction moratorium",@OhComfy I agree. The confluence of events rig...,Agree,Agree,0,0,0,0,0,1,0


In [3]:
df['textblob'] = df['tweet'].apply(lambda x: TextBlob(x).sentiment.polarity)
afinn = Afinn()
df['afinn'] = df['tweet'].apply(lambda x: afinn.score(x))
df['pattern'] = df['tweet'].apply(lambda x: sentiment(x)[0])
analyzer = SentimentIntensityAnalyzer()

df['vader'] = df['tweet'].apply(lambda x: analyzer.polarity_scores(x)['compound'])

In [4]:
selected_columns = ['tweet', 'BinaryNumTarget', 'anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise',
                    'afinn', 'pattern', 'vader', 'textblob']

In [5]:
df1 = df[selected_columns]

In [6]:
df1.head(3)

Unnamed: 0,tweet,BinaryNumTarget,anger,disgust,fear,joy,neutral,sadness,surprise,afinn,pattern,vader,textblob
0,@POTUS Biden Blunders - 6 Month Update\n\nInfl...,1.0,0,0,0,0,0,1,0,-7.0,0.5,-0.9169,0.5
1,@S0SickRick @Stairmaster_ @6d6f636869 Not as m...,1.0,0,0,0,0,0,1,0,-10.0,0.125,-0.9449,0.125
2,THE SUPREME COURT is siding with super rich pr...,1.0,0,0,0,0,0,1,0,1.0,0.206667,-0.3147,0.206667


In [7]:
df1['BinaryNumTarget'] = df1['BinaryNumTarget'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['BinaryNumTarget'] = df1['BinaryNumTarget'].astype(int)


In [8]:
X_text = df1['tweet'].values
X_emotions = df1[['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']].values
X_scores = df1[['afinn', 'pattern', 'vader', 'textblob']].values
y = df1['BinaryNumTarget'].values


In [9]:
statements = df1['tweet']

max_len_text = max(len(statement.split()) for statement in statements)

print("Maximum length of statements:", max_len_text)

Maximum length of statements: 108


In [10]:
max_len_text = max_len_text  
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_text)
sequences = tokenizer.texts_to_sequences(X_text)
X_text_pad = pad_sequences(sequences, maxlen=max_len_text)

In [11]:
embedding_dim = 100  
vocab_size = len(tokenizer.word_index) + 1

In [12]:
def create_model():
    model_text = Sequential()
    model_text.add(Embedding(vocab_size, embedding_dim, input_length=max_len_text))
    model_text.add(Bidirectional(LSTM(128))) 

    model_emotions = Sequential()
    model_emotions.add(Dense(64, input_shape=(7,)))  # 7 features for emotions 
    model_scores = Sequential()
    model_scores.add(Dense(64, input_shape=(4,)))  # 4 features for sentiment scores

    merged = concatenate([model_text.output, model_emotions.output, model_scores.output])
    merged = Dense(64, activation='relu')(merged)
    output = Dense(1, activation='sigmoid')(merged)

    model = tf.keras.models.Model(inputs=[model_text.input, model_emotions.input, model_scores.input], outputs=output)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [13]:
n_splits = 5  
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

In [14]:
for fold, (train_index, test_index) in enumerate(skf.split(X_text_pad, y)):
    print(f"Training Fold {fold+1}...")

    X_train_text, X_test_text = X_text_pad[train_index], X_text_pad[test_index]
    X_train_emotions, X_test_emotions = X_emotions[train_index], X_emotions[test_index]
    X_train_scores, X_test_scores = X_scores[train_index], X_scores[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = create_model()
    model.fit([X_train_text, X_train_emotions, X_train_scores], y_train, epochs=10, batch_size=32, verbose=1)

Training Fold 1...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training Fold 2...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training Fold 3...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training Fold 4...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training Fold 5...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
    loss, accuracy = model.evaluate([X_test_text, X_test_emotions, X_test_scores], y_test, batch_size=32)
    predictions = model.predict([X_test_text, X_test_emotions, X_test_scores])
    predictions = (predictions > 0.5).astype(int)



In [16]:
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

In [17]:
    tn, fp, fn, tp = tf.math.confusion_matrix(y_test, predictions).numpy().ravel()
    accuracy_scores.append((tp + tn) / (tp + tn + fp + fn))
    precision_scores.append(tp / (tp + fp))
    recall_scores.append(tp / (tp + fn))
    f1_scores.append(2 * tp / (2 * tp + fp + fn))

    print(f'Fold {fold+1} - Test Accuracy: {accuracy_scores[-1]:.4f}, Precision: {precision_scores[-1]:.4f}, Recall: {recall_scores[-1]:.4f}, F1-score: {f1_scores[-1]:.4f}')


Fold 5 - Test Accuracy: 0.9789, Precision: 0.9804, Recall: 0.9785, F1-score: 0.9795


In [18]:
print(f'\nAverage Test Accuracy: {np.mean(accuracy_scores):.4f}')
print(f'Average Precision: {np.mean(precision_scores):.4f}')
print(f'Average Recall: {np.mean(recall_scores):.4f}')
print(f'Average F1-score: {np.mean(f1_scores):.4f}')


Average Test Accuracy: 0.9789
Average Precision: 0.9804
Average Recall: 0.9785
Average F1-score: 0.9795


In [19]:
best_fold = np.argmax(accuracy_scores)
X_train_text, X_test_text = X_text_pad[train_index], X_text_pad[test_index]
X_train_emotions, X_test_emotions = X_emotions[train_index], X_emotions[test_index]
X_train_scores, X_test_scores = X_scores[train_index], X_scores[test_index]
y_train, y_test = y[train_index], y[test_index]

best_model = create_model()
best_model.fit([X_train_text, X_train_emotions, X_train_scores], y_train, epochs=10, batch_size=32, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1f7c4a1a950>

In [20]:
loss, accuracy = best_model.evaluate([X_test_text, X_test_emotions, X_test_scores], y_test, batch_size=32)
print(f'Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}')

# Make predictions
predictions = best_model.predict([X_test_text, X_test_emotions, X_test_scores])
predictions = (predictions > 0.5).astype(int)

# Print the classification report
print("\nClassification Report:")
print(classification_report(y_test, predictions))

Test Loss: 0.1043, Test Accuracy: 0.9776

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98     13053
           1       0.97      0.99      0.98     13786

    accuracy                           0.98     26839
   macro avg       0.98      0.98      0.98     26839
weighted avg       0.98      0.98      0.98     26839



In [21]:
best_model.save('fake_news_detection_LSTM.h5')