In [None]:
import pandas as pd

# Load the preprocessed corpus, keeping only the label ("type") and the
# article text ("content") columns.
data = pd.read_csv(
    "Data/955,000_rows_preprocessed.csv",
    usecols=["type", "content"],
)


In [None]:
# Quick look at the text column and the Python type of its first entry.
print(data['content'])
print(type(data['content'].iloc[0]))

# Median article length in characters.
# `.str.len()` is used instead of `.apply(len)` so that rows with missing
# content (NaN) produce NaN -- which `.median()` skips -- rather than raising
# "TypeError: object of type 'float' has no len()".
# NOTE: despite its name, `average_length` holds the median, not the mean.
average_length = data['content'].str.len().median()

print(f"Median length: {average_length}")


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Work on a copy that only contains rows with both a label and text, so the
# raw `data` frame loaded earlier is left untouched.
df = data.dropna(subset=['type', 'content']).copy()

# Collapse the source categories into a binary target:
# 1.0 for 'reliable', 'political' and 'clickbait', 0.0 for every other
# listed category.
label_map = {
    'unreliable': 0.0,
    'fake': 0.0,
    'clickbait': 1.0,
    'conspiracy': 0.0,
    'bias': 0.0,
    'hate': 0.0,
    'junksci': 0.0,
    'political': 1.0,
    'unknown': 0.0,
    'reliable': 1.0
}

# Categories that are not in `label_map` come back from .map() as NaN; they
# are assigned to class 0.0 as well. Only the label column can contain NaN at
# this point because rows with missing content were dropped above.
df['type'] = df['type'].map(label_map).fillna(0.0)

# Verify the changes
df.info()

# Tokenization: build a word index from the corpus, limited to the most
# frequent words (num_words=10000; Keras keeps the top num_words - 1).
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df['content'])

# Convert each text into a list of integer word indices.
X_seq = tokenizer.texts_to_sequences(df['content'])

# Show only a small preview. Printing all of X_seq would write every token
# sequence of the whole corpus into the cell output.
print(f"Number of sequences: {len(X_seq)}")
print([seq[:20] for seq in X_seq[:3]])

# Pad / truncate every sequence to exactly 1800 tokens so they can be stacked
# into one 2-D array (Keras defaults: padding and truncation at the front).
X_padded = pad_sequences(X_seq, maxlen=1800)

# Extract labels
y = df['type'].values
print(y)
# Recorded runtime of this cell on the full dataset: 11 min 45 sec.
# NOTE(review): the original note said 995,000 rows while the file name says
# 955,000 -- confirm which figure is correct.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the padded sequences and their labels as a test set; the
# fixed random_state keeps the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    y,
    test_size=0.2,
    random_state=0,
)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam



# Binary text classifier: embedding -> LSTM -> dropout -> dense head.
model = Sequential()

# Learn a 128-dimensional vector for each of the 10000 word indices.
model.add(Embedding(input_dim=10000, output_dim=128))

# Single LSTM layer that emits only its final state (one vector per text).
model.add(LSTM(128, return_sequences=False))

# Randomly drop half of the LSTM features during training to limit overfitting.
model.add(Dropout(0.5))

# Hidden dense layer followed by a one-unit sigmoid output for the 0/1 label.
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Train for 10 epochs with mini-batches of 600 samples, holding out 20% of the
# training data for validation.
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=600,
    validation_split=0.2,
)


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

# Loss and accuracy on the held-out test set.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Turn the model's predicted scores into hard 0/1 labels with a 0.5 cut-off.
predicted_scores = model.predict(X_test)
y_pred = (predicted_scores > 0.5).astype("int32")

# Summary metrics computed from the hard predictions.
conf_matrix = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"F1 Score: {f1:.2f}")
print("Confusion Matrix:\n", conf_matrix)
