In [22]:
import numpy as np
import pandas as pd
import keras
import tensorflow as tf
from keras import layers
import string
import re
from sklearn.model_selection import train_test_split

In [23]:
df = pd.read_csv("/content/news.csv", index_col=0)

In [24]:
df['label'] = df['label'].replace('REAL','0')
df['label'] = df['label'].replace('FAKE','1')
df['label'] = df['label'].astype('int32')


In [25]:
df.drop(columns = ['title'], inplace = True)

In [26]:
df

Unnamed: 0,text,label
8476,"Daniel Greenfield, a Shillman Journalism Fello...",1
10294,Google Pinterest Digg Linkedin Reddit Stumbleu...,1
3608,U.S. Secretary of State John F. Kerry said Mon...,0
10142,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1
875,It's primary day in New York and front-runners...,0
...,...,...
4490,The State Department told the Republican Natio...,0
8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,1
8622,Anti-Trump Protesters Are Tools of the Oligar...,1
4021,"ADDIS ABABA, Ethiopia —President Obama convene...",0


In [27]:
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [28]:
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    return tf.strings.regex_replace(
        stripped_html, f"[{re.escape(string.punctuation)}]", ""
    )


max_features = 20000
embedding_dim = 128
sequence_length = 500

In [29]:
vectorize_layer = keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

In [30]:
text_ds = tf.data.Dataset.from_tensor_slices(train_df['text'].values)
vectorize_layer.adapt(text_ds)

In [31]:
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    label = tf.expand_dims(label, -1)
    return vectorize_layer(text), label

train_ds = tf.data.Dataset.from_tensor_slices((train_df['text'].values, train_df['label'].values))
train_ds = train_ds.map(vectorize_text)

val_ds = tf.data.Dataset.from_tensor_slices((valid_df['text'].values, valid_df['label'].values))
val_ds = val_ds.map(vectorize_text)

test_ds = tf.data.Dataset.from_tensor_slices((test_df['text'].values, test_df['label'].values))
test_ds = test_ds.map(vectorize_text)

train_ds = train_ds.cache().prefetch(buffer_size=10)
val_ds = val_ds.cache().prefetch(buffer_size=10)
test_ds = test_ds.cache().prefetch(buffer_size=10)

In [32]:
inputs = keras.Input(shape=(None,), dtype="int64")

x = layers.Embedding(max_features, embedding_dim)(inputs)
x = layers.Dropout(0.5)(x)

x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.GlobalMaxPooling1D()(x)

x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

model = keras.Model(inputs, predictions)

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [33]:
epochs = 10

model.fit(train_ds, validation_data=val_ds, epochs=epochs, batch_size=64)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7bfa717320b0>

In [34]:
val_loss, val_accuracy = model.evaluate(val_ds, batch_size=64)




In [35]:
val_accuracy

0.9315789341926575