In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import GroupShuffleSplit, train_test_split
from tensorflow.keras.layers import Embedding, Dense, LSTM, SpatialDropout1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Train and evaluate a binary LSTM text classifier on "statement + tweet" text.
#
# Pipeline: load CSV -> build one text string per row -> hold out a test set
# grouped by statement -> fit the tokenizer on training text only -> pad ->
# train -> report accuracy / precision / recall on the held-out rows.

# Load the dataset
file_path = '/content/Truth_Seeker_Model_Dataset.csv'
dataset = pd.read_csv(file_path)

# Preprocess the text data
# NOTE: astype(str) turns missing cells into the literal string "nan".
statements = dataset['statement'].astype(str).values
tweets = dataset['tweet'].astype(str).values
labels = dataset['BinaryNumTarget'].astype(int).values

# Combine statements and tweets into one feature
combined_text = [s + " " + t for s, t in zip(statements, tweets)]

# Split the data into training and testing sets.
# The statement text is part of every sample, so if several rows share a
# statement, a plain row-wise random split would put the same statement in
# both train and test and the test metrics would mostly measure memorisation.
# Grouping by statement keeps all rows of a statement on one side of the split.
# With a group splitter, test_size is the fraction of *groups* held out, so
# the row-level split is only approximately 80/20.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(splitter.split(combined_text, labels, groups=statements))

# Tokenize the text data
max_num_words = 5000
max_sequence_length = 250

# Fit the vocabulary on training text only so no test-set words leak into
# preprocessing; unseen test words map to the OOV token.
tokenizer = Tokenizer(num_words=max_num_words, oov_token="<OOV>")
tokenizer.fit_on_texts([combined_text[i] for i in train_idx])
sequences = tokenizer.texts_to_sequences(combined_text)
word_index = tokenizer.word_index

# Pad sequences
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

X_train, X_test = padded_sequences[train_idx], padded_sequences[test_idx]
y_train, y_test = labels[train_idx], labels[test_idx]

# Build and compile the model
embedding_dim = 100

model = Sequential()
model.add(Embedding(input_dim=max_num_words, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(SpatialDropout1D(0.2))
# NOTE(review): a non-zero recurrent_dropout rules out the cuDNN LSTM kernel,
# which likely contributes to slow epochs on GPU — confirm before changing,
# since removing it alters the model's regularisation.
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
batch_size = 64
epochs = 10

# NOTE: the test split doubles as validation data here purely for per-epoch
# monitoring. No early stopping or checkpoint selection depends on it; if
# either is added, carve a separate validation split out of the training rows.
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), verbose=2)

# Predict on the test set
y_pred_prob = model.predict(X_test)
# predict() returns shape (n, 1); flatten so predictions match the 1-D y_test.
y_pred = (y_pred_prob > 0.5).astype(int).ravel()

# Calculate accuracy, precision, and recall
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f"Test Accuracy: {accuracy}")
print(f"Test Precision: {precision}")
print(f"Test Recall: {recall}")


Epoch 1/10
1678/1678 - 1194s - loss: 0.0680 - accuracy: 0.9767 - val_loss: 0.0049 - val_accuracy: 0.9993 - 1194s/epoch - 711ms/step
Epoch 2/10
1678/1678 - 1188s - loss: 0.1081 - accuracy: 0.9645 - val_loss: 0.0369 - val_accuracy: 0.9940 - 1188s/epoch - 708ms/step
Epoch 3/10
1678/1678 - 1182s - loss: 0.0201 - accuracy: 0.9968 - val_loss: 0.0253 - val_accuracy: 0.9955 - 1182s/epoch - 705ms/step
Epoch 4/10
1678/1678 - 1188s - loss: 0.0075 - accuracy: 0.9986 - val_loss: 0.0200 - val_accuracy: 0.9977 - 1188s/epoch - 708ms/step
Epoch 5/10
1678/1678 - 1186s - loss: 0.0063 - accuracy: 0.9990 - val_loss: 0.0028 - val_accuracy: 0.9996 - 1186s/epoch - 707ms/step
Epoch 6/10
1678/1678 - 1173s - loss: 0.0098 - accuracy: 0.9986 - val_loss: 0.0066 - val_accuracy: 0.9985 - 1173s/epoch - 699ms/step
Epoch 7/10
1678/1678 - 1174s - loss: 0.0020 - accuracy: 0.9996 - val_loss: 0.0020 - val_accuracy: 0.9997 - 1174s/epoch - 700ms/step
Epoch 8/10
1678/1678 - 1180s - loss: 0.0017 - accuracy: 0.9997 - val_loss: 0