In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout


In [4]:
import zipfile
import os

zip_file_path = "/content/drive/MyDrive/fake or real news.zip"
extract_path = "/content/"

# Ensure the extract path exists
if not os.path.exists(extract_path):
    os.makedirs(extract_path)

# Unzip the specified file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    # Check if 'Fake.csv' is in the zip file and extract it
    if 'Fake.csv' in zip_ref.namelist():
        zip_ref.extract('Fake.csv', extract_path)
        print("Fake.csv extracted successfully.")
    else:
        print("Fake.csv not found in the zip archive.")
        # Handle case where file is not found inside zip, e.g., raise an error or exit.
        # For now, we'll assume it's there and proceed.

# Read the extracted CSV file
df = pd.read_csv(os.path.join(extract_path, "Fake.csv"))
df.head()

Fake.csv extracted successfully.


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [6]:
X = df['text'].astype(str).values
# Create a 'label' column since it's missing.
# Assign 0 for 'fake news' as this DataFrame is from Fake.csv
df['label'] = 0
y = df['label'].values

In [9]:
max_words = 50000
max_len = 200

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X)

X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen=max_len)

X_train, X_test, y_train, y_test = train_test_split(
    X_pad, y, test_size=0.2, random_state=42
)
print(X_train)
print(X_test)

[[1655  456    7 ... 1269   90 8589]
 [  11    6 2172 ...    2 2602 2375]
 [ 148 2855 1030 ... 2145    3   21]
 ...
 [  18 3344    4 ...  116   90 2022]
 [   6 4804 1758 ...  282  264 2022]
 [   9 1392 2163 ...   90  629   75]]
[[    0     0     0 ...     0     0     0]
 [    6  7350  1436 ...  4135    90 30840]
 [    0     0     0 ...     2  2333  5451]
 ...
 [    5    37    15 ...   861   282   264]
 [   54    42   175 ...    90   629    75]
 [ 1491    28   341 ...   400   853   256]]


In [10]:
model = Sequential([
    Embedding(max_words, 128, input_length=max_len),
    SimpleRNN(128, return_sequences=False),
    Dropout(0.3),
    Dense(1, activation="sigmoid")
])

model.compile(loss="binary_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])

model.summary()




In [11]:
history = model.fit(
    X_train, y_train,
    epochs=3,
    batch_size=128,
    validation_split=0.2
)


Epoch 1/3
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 371ms/step - accuracy: 0.9751 - loss: 0.0974 - val_accuracy: 1.0000 - val_loss: 2.0192e-04
Epoch 2/3
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 372ms/step - accuracy: 1.0000 - loss: 2.2889e-04 - val_accuracy: 1.0000 - val_loss: 1.0310e-04
Epoch 3/3
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 369ms/step - accuracy: 1.0000 - loss: 1.2540e-04 - val_accuracy: 1.0000 - val_loss: 6.2911e-05


In [12]:
loss, acc = model.evaluate(X_test, y_test)
print("Accuracy =", acc)


[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 33ms/step - accuracy: 1.0000 - loss: 6.2638e-05
Accuracy = 1.0


In [13]:
sample = ["This news is very true and confirmed"]
seq = tokenizer.texts_to_sequences(sample)
pad = pad_sequences(seq, maxlen=max_len)
print("Prediction:", model.predict(pad))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 173ms/step
Prediction: [[7.185094e-05]]


In [15]:
model.save('/content/drive/MyDrive/fake_or_real_news_model.keras')