In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots

import nltk
from nltk.corpus import stopwords
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
# Download the NLTK stop-word list; being the last expression, its return
# value becomes the cell output.
nltk.download('stopwords')



/kaggle/input/fake-and-real-news-dataset/True.csv
/kaggle/input/fake-and-real-news-dataset/Fake.csv
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Locations of the two CSV files of the Kaggle "fake-and-real-news" dataset.
real_news_path = "/kaggle/input/fake-and-real-news-dataset/True.csv"
fake_news_path = "/kaggle/input/fake-and-real-news-dataset/Fake.csv"

In [3]:
# Read each CSV into its own DataFrame.
real_news, fake_news = (pd.read_csv(csv_path) for csv_path in (real_news_path, fake_news_path))

In [4]:
# Preview the first three rows of the fake-news frame.
fake_news.head(3)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"


In [5]:
# Preview the first three rows of the real-news frame.
real_news.head(3)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"


In [6]:
# Work on copies so the frames loaded from disk stay untouched.
real, fake = real_news.copy(), fake_news.copy()

In [7]:
# Tag every row with its class name before the two frames are combined.
fake['Label'] = 'Fake'
real['Label'] = 'Real'

In [8]:
# Stack the labelled real and fake articles into one frame. `ignore_index=True`
# already gives the result a fresh 0..n-1 index, so no separate reset is needed
# (the former bare `news.reset_index()` call discarded its result anyway).
news = pd.concat([real, fake], axis=0, ignore_index=True)
news.head()

Unnamed: 0,title,text,subject,date,Label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",Real
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",Real
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",Real
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",Real
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",Real


In [9]:
# Report the size of the combined dataset: rows are samples, columns are features.
n_samples, n_features = news.shape
print(f"Samples available: {n_samples}\n#features of dataset: {n_features}")

Samples available: 44898
#features of dataset: 5


In [10]:
# Draw a 1,000-row subsample and keep only the article text and its label.
# A fixed `random_state` makes the subsample (and everything trained on it)
# reproducible across kernel restarts; 7 matches the seed used for the splits.
news_ds = news.sample(n=1000, random_state=7).drop(columns=['title', 'date', 'subject'])
news_ds.head(3)

Unnamed: 0,text,Label
36923,If local law enforcement begins to act like i...,Fake
38038,Obama and HUD want to give one last freebie to...,Fake
28271,Corey Lewandowski got some good news last week...,Fake


In [11]:
# Class names, ordered so that a name's position in the list is its integer id.
CLASS_NAMES = ['Fake', 'Real']
# Name -> id lookup derived from that ordering: {'Fake': 0, 'Real': 1}.
class_mapper = {name: idx for idx, name in enumerate(CLASS_NAMES)}

In [12]:
# Swap the string labels for their integer ids from `class_mapper`.
encoded_labels = news_ds['Label'].map(class_mapper)
news_ds['Label'] = encoded_labels
news_ds.head(3)

Unnamed: 0,text,Label
36923,If local law enforcement begins to act like i...,0
38038,Obama and HUD want to give one last freebie to...,0
28271,Corey Lewandowski got some good news last week...,0


In [13]:
# Number of articles per class in the full (un-sampled) dataset.
class_dist = px.histogram(
    news,
    y='Label',
    color='Label',
    title='Fake vs Real news Original dataset',
    text_auto=True,
).update_layout(showlegend=False)  # the bars are already labelled by the axis
class_dist.show()

In [14]:
# Number of articles per subject, drawn as one facet per class label.
subject_dist = px.histogram(
    news,
    x='subject',
    color='subject',
    facet_col='Label',
    title='Fake vs Real news Subject Distribution',
    text_auto=True,
).update_layout(showlegend=False)
subject_dist.show()

In [15]:
# Largest of the unique `date` values. The output is a URL, which shows the
# column is still plain text and contains entries that are not dates.
news.date.unique().max()

'https://fedup.wpengine.com/wp-content/uploads/2015/04/hillarystreetart.jpg'

In [16]:
# Unique `date` entries longer than 20 characters.
[value for value in news.date.unique() if len(value) > 20]

['https://100percentfedup.com/served-roy-moore-vietnamletter-veteran-sets-record-straight-honorable-decent-respectable-patriotic-commander-soldier/',
 'https://100percentfedup.com/video-hillary-asked-about-trump-i-just-want-to-eat-some-pie/',
 'https://100percentfedup.com/12-yr-old-black-conservative-whose-video-to-obama-went-viral-do-you-really-love-america-receives-death-threats-from-left/',
 'https://fedup.wpengine.com/wp-content/uploads/2015/04/hillarystreetart.jpg',
 'https://fedup.wpengine.com/wp-content/uploads/2015/04/entitled.jpg',
 'MSNBC HOST Rudely Assumes Steel Worker Would Never Let His Son Follow in His Footsteps…He Couldn’t Be More Wrong [Video]']

In [17]:
# Keep only rows whose `date` text is at most 20 characters long, dropping the
# over-long entries listed in the previous cell.
# `astype(str)` keeps the length check from crashing on non-string values and
# lets the cell be re-run after the column has already been converted.
has_short_date = news['date'].astype(str).str.len() <= 20
# `.copy()` gives an independent frame, so the assignment below does not write
# into a slice of the original (which triggers SettingWithCopyWarning).
news = news.loc[has_short_date].copy()
news['date'] = pd.to_datetime(news['date'], format='mixed')
news.head()

Unnamed: 0,title,text,subject,date,Label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,2017-12-31,Real
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,2017-12-29,Real
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,2017-12-31,Real
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,2017-12-30,Real
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,2017-12-29,Real


In [18]:
# Article counts over time, coloured by class label.
date_dist = px.histogram(news, x='date', color='Label')
date_dist.show()

In [19]:
# Article counts over time, coloured by subject.
# Note: this rebinds `subject_dist`, replacing the earlier per-subject figure.
subject_dist = px.histogram(news, x='date', color='subject')
subject_dist.show()

In [20]:
# Same view restricted to the rows labelled 'Real'.
real_only = news.loc[news['Label'].eq('Real')]
real_sub_dist = px.histogram(real_only, x='date', color='subject')
real_sub_dist.show()

## Text Preprocessing

In [21]:
import string

In [22]:
stop_words = stopwords.words('english')
# Built once instead of per call: a set gives constant-time membership tests
# (the list above is scanned linearly for every token), and the punctuation
# deletion table is identical for every text.
_STOP_WORD_SET = frozenset(stop_words)
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def text_preprocessing(text):
    """Lower-case `text`, drop English stop words, then strip ASCII punctuation.

    Args:
        text: Raw article text.

    Returns:
        The cleaned text as a single space-joined string.

    Note:
        Stop words are matched on whitespace-separated tokens *before*
        punctuation is removed, so a token such as "the," is kept (as "the").
        Only characters in `string.punctuation` are deleted; typographic
        characters such as ’ pass through unchanged.
    """
    kept_tokens = [token for token in text.lower().split() if token not in _STOP_WORD_SET]
    return ' '.join(kept_tokens).translate(_PUNCTUATION_TABLE).strip()

In [23]:
# Features: cleaned article text. Targets: float32 column vector of class ids.
X = news_ds['text'].apply(text_preprocessing).to_numpy()
y = news_ds['Label'].to_numpy().astype('float32').reshape(-1, 1)

# Two successive stratified 90/10 splits: first hold out the test set, then
# carve the validation set out of what remains for training.
split_options = dict(train_size=0.9, random_state=7)
train_X, test_X, train_y, test_y = train_test_split(X, y, stratify=y, **split_options)
train_X, val_X, train_y, val_y = train_test_split(train_X, train_y, stratify=train_y, **split_options)

In [24]:
# `model_name` doubles as the path the checkpoint is written to. With
# `save_best_only=True` the callback only overwrites it when the monitored
# quantity improves.
model_name = "BERTFakeNewsDetector"
model_callbacks = ModelCheckpoint(filepath=model_name, save_best_only=True)

In [25]:
bert_name = 'bert-base-uncased'
# Only constructor options belong here. `padding` and `add_special_tokens` are
# arguments of the tokenizer *call*, not of `from_pretrained`, so passing them
# here suggested a 'max_length' padding strategy that was never applied;
# `tokenize()` chooses its own padding when it invokes the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(bert_name, do_lower_case=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [26]:
def tokenize(df, with_attention_mask=False):
    """Convert a collection of texts into token-id tensors with the module-level tokenizer.

    Args:
        df: Array-like of strings that exposes `.tolist()` (e.g. a NumPy array).
        with_attention_mask: When False (the default, and the original
            behaviour) only the `input_ids` tensor is returned. When True a
            dict holding both `input_ids` and `attention_mask` is returned, so
            a model fed with it can tell padding apart from real tokens.

    Returns:
        The `input_ids` tensor, or a dict of two tensors when
        `with_attention_mask` is True.
    """
    encoded = tokenizer(df.tolist(),
                        padding=True,      # pad to the longest text in this call
                        truncation=True,   # cut texts that exceed the maximum length
                        return_tensors='tf')
    if with_attention_mask:
        return {'input_ids': encoded['input_ids'],
                'attention_mask': encoded['attention_mask']}
    return encoded.input_ids

In [27]:
# Tokenize every split; each call pads to the longest text within that split.
train_X_encoded, val_X_encoded, test_X_encoded = map(tokenize, (train_X, val_X, test_X))

In [28]:
def prepare_datasets(encoded, true_df, true_target_df, batch_size=8, shuffle=True):
    """Build a batched, prefetched `tf.data.Dataset` of (inputs, targets) pairs.

    Args:
        encoded: Tokenized inputs, sliced along their first axis.
        true_df: The un-tokenized samples; only `shape[0]` is read, to size
            the shuffle buffer.
        true_target_df: Targets aligned with `encoded`.
        batch_size: Samples per batch. Defaults to 8, the value that used to
            be hard-coded.
        shuffle: Shuffle with a full-size buffer before batching. Defaults to
            True (the previous behaviour); pass False for validation/test
            data, where sample order has no effect on the metrics.

    Returns:
        The resulting `tf.data.Dataset`.
    """
    dataset = tf.data.Dataset.from_tensor_slices((encoded, true_target_df))
    if shuffle:
        dataset = dataset.shuffle(true_df.shape[0])
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [29]:
# Wrap each (tokenized inputs, raw texts, targets) triple in a tf.data pipeline.
train_ds, val_ds, test_ds = (
    prepare_datasets(encoded, texts, targets)
    for encoded, texts, targets in (
        (train_X_encoded, train_X, train_y),
        (val_X_encoded, val_X, val_y),
        (test_X_encoded, test_X, test_y),
    )
)

In [30]:
# Load pretrained BERT with a sequence-classification head configured for a
# single output value per example (`num_labels=1`).
model = TFAutoModelForSequenceClassification.from_pretrained(
    bert_name,
    num_labels=1,
)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# No `loss` is passed to compile().
# NOTE(review): this presumably relies on the Hugging Face model's built-in
# loss — confirm.
model.compile(
    optimizer=Adam(learning_rate=1e-5),
    metrics=[
        tf.keras.metrics.BinaryAccuracy(name='Accuracy'),
        tf.keras.metrics.Precision(name='Precision'),
        tf.keras.metrics.Recall(name='Recall'),
    ],
)

# `train_ds` / `val_ds` are already batched by `prepare_datasets`, so no
# `batch_size` is given here: Keras documents that it must not be specified for
# dataset inputs, and the former `batch_size=16` did not match the real batch
# size of 8.
training_run = model.fit(train_ds,
                         validation_data=val_ds,
                         callbacks=model_callbacks,
                         epochs=5)

# Per-epoch metrics as a DataFrame (one row per epoch) for inspection and
# plotting. The History object keeps its own name so `model_history` only ever
# refers to the DataFrame.
model_history = pd.DataFrame(training_run.history)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [32]:
# Display the per-epoch training and validation metrics.
model_history

Unnamed: 0,loss,Accuracy,Precision,Recall,val_loss,val_Accuracy,val_Precision,val_Recall
0,0.181528,0.735802,0.754777,0.63369,0.034657,0.966667,0.953488,0.97619
1,0.045669,0.944444,0.941019,0.938503,0.020463,0.988889,0.976744,1.0
2,0.015262,0.992593,0.994624,0.989305,0.009228,0.988889,0.976744,1.0
3,0.0072,0.996296,0.994667,0.997326,0.006398,0.988889,0.976744,1.0
4,0.004542,1.0,1.0,1.0,0.002476,1.0,1.0,1.0


In [33]:
# Persist the model as it stands after the last epoch.
# NOTE(review): `model_name` is also the path given to ModelCheckpoint, so this
# write replaces the best-by-validation checkpoint saved during training —
# confirm that is intended, or save under a different path.
model.save(model_name)

In [34]:
# One panel per tracked quantity: (history column, panel title, grid row, grid column).
panels = [
    ('loss', 'Loss', 1, 1),
    ('Accuracy', 'Accuracy', 1, 2),
    ('Precision', 'Precision', 2, 1),
    ('Recall', 'Recall', 2, 2),
]

fig = make_subplots(rows=2, cols=2,
                    subplot_titles=tuple(panel_title for _, panel_title, _, _ in panels))

for column, panel_title, grid_row, grid_col in panels:
    # Training curve first, then its validation counterpart, on the same panel.
    for prefix, split_name in (('', 'Training'), ('val_', 'Validation')):
        fig.add_trace(
            go.Scatter(y=model_history[prefix + column],
                       mode='lines',
                       name=f'{split_name} {panel_title}'),
            row=grid_row, col=grid_col,
        )
    fig.update_xaxes(title_text='Epoch', row=grid_row, col=grid_col)
    fig.update_yaxes(title_text=panel_title, row=grid_row, col=grid_col)

fig.update_layout(title='Model Training History')

fig.show()

In [35]:
# evaluate() yields the loss followed by the compiled metrics, in the order
# they were passed to compile(): accuracy, precision, recall.
test_metrics = model.evaluate(test_ds, verbose=0)
test_loss, test_acc, test_precision, test_recall = test_metrics

print(
    f"Test Loss      : {test_loss}",
    f"Test Accuracy  : {test_acc}",
    f"Test Precision : {test_precision}",
    f"Test Recall    : {test_recall}",
    sep="\n",
)

Test Loss      : 0.003277710871770978
Test Accuracy  : 1.0
Test Precision : 1.0
Test Recall    : 1.0


In [36]:
def make_prediction(text, model=model):
    """Predict the class id (0 = Fake, 1 = Real) for a single piece of text.

    Args:
        text: One article string.
        model: Model whose `predict(...)` result exposes `.logits`; defaults to
            the module-level `model`.

    Returns:
        Array with the same shape as the logits, holding class ids limited to
        the range 0..1.
    """
    inputs = tokenize(np.array([text]))
    logits = model.predict(inputs, verbose=1).logits
    # Snap the raw score to the nearest valid class id. Clipping (instead of
    # taking the absolute value) keeps strongly negative scores at 0 rather
    # than flipping them to 1, and stops scores above 1.5 from producing an id
    # that is out of range for CLASS_NAMES.
    return np.clip(np.round(logits), 0, 1)

In [37]:
# Print five randomly chosen test articles with their true and predicted class.
for _ in range(5):
    index = np.random.randint(test_X.shape[0])

    text = test_X[index]
    # Named `true_label` rather than `real` so the `real` DataFrame created
    # earlier in the notebook is not overwritten.
    true_label = test_y[index]
    model_pred = make_prediction(text)

    # `.item()` extracts the single value explicitly; calling int() directly on
    # an array with one or more dimensions is deprecated in recent NumPy.
    true_name = CLASS_NAMES[int(true_label.item())]
    predicted_name = CLASS_NAMES[int(model_pred.item())]

    print(f"Original Text:\n\n{text}\n\nTrue: {true_name}\t\tPredicted: {predicted_name}\n{'-'*100}\n")

Original Text:

reuters  republican candidate backed president donald trump us senate seat alabama expected finish distant second primary election tuesday still enough votes advance runoff local news projected election seat vacated republican jeff sessions became us attorney general luther strange appointed seat backed trump move republican primary along former alabama supreme court chief justice roy moore projections said neither candidate captured 50 percent vote required outright victory 60 state’s 67 counties reporting moore 438 percent vote strange 317 percent official results showed strange former alabama attorney general close ties senate majority leader mitch mcconnell feuding trump lately stalled effort pass health care reform strange scored president’s surprise endorsement last week trump reiterated support strange automated phone call voters monday “he’s helping senate” trump said “he’s going get tax cuts us he’s lot things people alabama” moore effectively ousted alabama’s 