# I load The dataset 

In [40]:
import pandas as pd
LIAR = pd.read_csv("liar_full_dataset_preprocessed.csv", usecols=["content", "type"])
LIAR.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12788 entries, 0 to 12787
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   type     10239 non-null  object
 1   content  12788 non-null  object
dtypes: object(2)
memory usage: 199.9+ KB


# I check the median length of the number of letters and words

In [41]:
# Calculate the median length of the content lists
median_length_symbols = LIAR['content'].apply(lambda x: len(x))
median_length_words = LIAR['content'].apply(lambda x: len(x.split()))
print(f"Median length: {median_length_words.median()}")
print(f"Median length: {median_length_symbols.median()}")

Median length: 11.0
Median length: 94.0


# I load The model

In [42]:
from tensorflow.keras.models import load_model

# Load the model
model = load_model('my_model_v2.keras')

# Verify the model architecture
model.summary()

# I Classify the type into a binary classification of fake and reliable. Then tokenize the text and pad the content to fit the model


In [43]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

# Use the existing LIAR dataframe
LIAR['type'] = LIAR['type'].map({
    'true': 1.0,
    'false': 0.0,
    'half-true': 0.0,
    'pants-fire': 0.0,
    'barely-true': 1.0,
    'mostly-true': 0.0
})

# Replace NaN values with 0.0
LIAR.fillna(0.0, inplace=True)

# Verify the changes
LIAR.info()

# Tokenization
tokenizer = Tokenizer(num_words=10000)  # Use top 10,000 words
tokenizer.fit_on_texts(LIAR['content'])

# Convert text to sequences
X_seq = tokenizer.texts_to_sequences(LIAR['content']) # word embedding

# Padding sequences to ensure equal length
X_padded = pad_sequences(X_seq, maxlen=12)  # Max length of 1800 words

# Extract labels
y = LIAR['type'].values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12788 entries, 0 to 12787
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   type     12788 non-null  float64
 1   content  12788 non-null  object 
dtypes: float64(1), object(1)
memory usage: 199.9+ KB


# I then apply our model to the test data and get the predictions

In [44]:
loss, accuracy = model.evaluate(X_padded, y)

print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Make predictions
predictions = model.predict(X_padded)
binary_predictions = (predictions > 0.5).astype("int32")

# Calculate metrics
accuracy = accuracy_score(y, binary_predictions)
f1 = f1_score(y, binary_predictions)
conf_matrix = confusion_matrix(y, binary_predictions)

print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")
print("Confusion Matrix:\n", conf_matrix)



[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.5818 - loss: 0.9645
Test Accuracy: 47.89%
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Accuracy: 0.48
F1 Score: 0.19
Confusion Matrix:
 [[5329 4129]
 [2535  795]]
