In [2]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
news_dataset = pd.read_csv('train.csv',
                           engine='python',
                           on_bad_lines='skip')

In [9]:
news_dataset = news_dataset.fillna('')


In [10]:
news_dataset['content'] = news_dataset['author'] + ' ' + news_dataset['title']


In [11]:
port_stem = PorterStemmer()

def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content
                       if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content


In [12]:
news_dataset['content'] = news_dataset['content'].apply(stemming)


In [17]:
# separating data and label
X = news_dataset['content'].values
Y = news_dataset['label'].values

# ----- CLEAN LABELS -----
Y = pd.to_numeric(Y, errors='coerce')
Y = pd.Series(Y).fillna(0).astype(int)

# ----- CLEAN TEXT -----
X = X.astype(str)


In [18]:
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

X = vectorizer.transform(X)


In [19]:
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=2)


In [20]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


In [21]:
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()


In [22]:
model = keras.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train_dense.shape[1],)),
    layers.Dropout(0.3),

    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),

    layers.Dense(1, activation='sigmoid')
])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [23]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])


In [24]:
model.fit(X_train_dense, Y_train,
          epochs=5,
          batch_size=64,
          validation_data=(X_test_dense, Y_test))


Epoch 1/5
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 37ms/step - accuracy: 0.7716 - loss: 0.4680 - val_accuracy: 0.9857 - val_loss: 0.0502
Epoch 2/5
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 40ms/step - accuracy: 0.9958 - loss: 0.0173 - val_accuracy: 0.9875 - val_loss: 0.0456
Epoch 3/5
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 37ms/step - accuracy: 0.9992 - loss: 0.0056 - val_accuracy: 0.9875 - val_loss: 0.0488
Epoch 4/5
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 31ms/step - accuracy: 0.9997 - loss: 0.0021 - val_accuracy: 0.9872 - val_loss: 0.0489
Epoch 5/5
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 52ms/step - accuracy: 0.9997 - loss: 0.0021 - val_accuracy: 0.9869 - val_loss: 0.0502


<keras.src.callbacks.history.History at 0x7984b8ed43e0>

In [25]:
train_loss, train_acc = model.evaluate(X_train_dense, Y_train)
test_loss, test_acc = model.evaluate(X_test_dense, Y_test)

print("Training Accuracy:", train_acc)
print("Test Accuracy:", test_acc)


[1m420/420[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - accuracy: 0.9999 - loss: 9.9243e-04
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.9851 - loss: 0.0634
Training Accuracy: 0.9997764825820923
Test Accuracy: 0.9868891835212708


In [26]:
X_new = X_test[3]
X_new_dense = X_new.toarray()

prediction = model.predict(X_new_dense)
prediction = (prediction > 0.5).astype(int)

print(prediction)

if prediction[0] == 0:
    print('The news is Real')
else:
    print('The news is Fake')

print("Actual Label:", Y_test[3])


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step
[[0]]
The news is Real


KeyError: 3