##### Copyright 2019 The TensorFlow Authors.
##### The following pipeline is loosely based on the text classification notebook by Google available here: https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/keras/text_classification_with_hub.ipynb

In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [None]:
#@title MIT License
#
# Copyright (c) 2017 François Chollet
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

In [7]:
!pip install --upgrade pip setuptools wheel
!pip install sklearn
!pip install tensorflow-hub
!pip install tensorflow-datasets
!pip install tensorflow-text
!pip install graphviz
!pip install pydotplus
!pip install pandas

Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Collecting scikit-learn
  Downloading scikit_learn-0.24.2-cp38-cp38-macosx_10_13_x86_64.whl (7.2 MB)
[K     |████████████████████████████████| 7.2 MB 3.0 MB/s eta 0:00:01
Collecting joblib>=0.11
  Downloading joblib-1.0.1-py3-none-any.whl (303 kB)
[K     |████████████████████████████████| 303 kB 3.8 MB/s eta 0:00:01
[?25hCollecting scipy>=0.19.1
  Downloading scipy-1.6.3-cp38-cp38-macosx_10_9_x86_64.whl (30.8 MB)
[K     |████████████████████████████████| 30.8 MB 3.4 MB/s eta 0:00:01    |▏                               | 122 kB 4.0 MB/s eta 0:00:08     |▎                               | 245 kB 4.0 MB/s eta 0:00:08     |▍                               | 378 kB 4.0 MB/s eta 0:00:08     |██▉                             | 2.7 MB 4.0 MB/s eta 0:00:07     |██████████████▋                 | 14.1 MB 5.2 MB/s eta 0:00:04     |█████████████████▋              | 16.9 MB 5.2 MB/s eta 0:00:03
[?25hCollecting threadpoolctl>=2.0.0
  Down

In [9]:
import os
import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import tensorflow_text

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


#from google.colab import drive
import zipfile

def split_dataset(dataset: "tf.data.Dataset", validation_data_fraction: float):
    """
    Splits a dataset of type tf.data.Dataset into a training and validation dataset using given ratio.

    The split is deterministic and interleaved: the elements are enumerated and, within every run of
    100 consecutive elements, the first round(validation_data_fraction * 100) elements go to the
    validation dataset and the remaining ones go to the training dataset. The fraction is therefore
    rounded to the nearest whole percent.
    @param dataset: the input dataset to split.
    @param validation_data_fraction: the fraction of the validation data as a float between 0 and 1.
    @return: a tuple of two tf.data.Datasets as (training, validation)
    @raise ValueError: if validation_data_fraction is not within [0, 1].
    """

    # Validate the raw fraction (not the rounded percentage) so that slightly
    # out-of-range values such as -0.004 or 1.004 are rejected as well.
    if not (0 <= validation_data_fraction <= 1):
        raise ValueError("validation data fraction must be ∈ [0,1]")
    validation_data_percent = round(validation_data_fraction * 100)

    dataset = dataset.enumerate()
    # Positions 0 .. percent-1 of each block of 100 are validation data. Strict
    # `<` / `>=` gives exactly `percent` out of 100 (the previous `<=` / `>`
    # pair leaked one extra element per block into the validation set).
    train_dataset = dataset.filter(lambda f, data: f % 100 >= validation_data_percent)
    validation_dataset = dataset.filter(lambda f, data: f % 100 < validation_data_percent)

    # remove enumeration
    train_dataset = train_dataset.map(lambda f, data: data)
    validation_dataset = validation_dataset.map(lambda f, data: data)

    return train_dataset, validation_dataset

# Report the runtime environment: library versions, execution mode and
# whether TensorFlow can see at least one GPU.
gpu_devices = tf.config.list_physical_devices("GPU")
gpu_status = "available" if gpu_devices else "NOT AVAILABLE"

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", gpu_status)

Version:  2.5.0
Eager mode:  True
Hub version:  0.12.0
GPU is NOT AVAILABLE


## Load datasets from Google drive

The datasets used are the following:
- ISOT Fake News Dataset (https://www.uvic.ca/engineering/ece/isot/datasets/fake-news/index.php)
- Getting Real about Fake News (https://www.kaggle.com/mrisdal/fake-news)




In [None]:
# Mount the Google Drive with the data. 
#drive.mount('/content/gdrive')

In [None]:
# Unpack both dataset archives into the current working directory.
for archive_path in ('./fake_or_real_news.csv.zip', './News _dataset.zip'):
    with zipfile.ZipFile(archive_path, 'r') as archive:
        # List the archive's contents before unpacking it
        archive.printdir()

        # Extract every member next to the notebook
        print('Extracting all the files now...')
        archive.extractall()
        print('Done!')



### Prepare Italian Data

In [None]:
#it_df = pd.read_excel('/content/gdrive/MyDrive/nlp_data/train.xlsx', index_col=0)

In [None]:
#it_df['label'] = pd.Categorical(it_df['Topic'])
#it_df['label'] = it_df.label.cat.codes

In [None]:
#labels = it_df['label']
#texts = it_df['Text']
#it_dataset = tf.data.Dataset.from_tensor_slices((texts, labels))


In [None]:
#Split data into training, testing, and validation sets.
#train_data_it, test_data_it = split_dataset(it_dataset, 0.4)
#test_data_it, validation_data_it = split_dataset(test_data_it, 0.4)

### Prepare Spanish Data

In [None]:
#es_df = pd.read_excel('/content/gdrive/MyDrive/nlp_data/train.xlsx', index_col=0)

In [None]:
#es_df['label'] = pd.Categorical(es_df['Topic'])
#es_df['label'] = es_df.label.cat.codes

In [None]:
#labels = es_df['label']
#texts = es_df['Text']
#es_dataset = tf.data.Dataset.from_tensor_slices((texts, labels))


In [None]:
#Split data into training, testing, and validation sets.
#train_data_es, test_data_es = split_dataset(es_dataset, 0.4)
#test_data_es, validation_data_es = split_dataset(test_data_es, 0.4)

### Prepare English Data

In [None]:
# Load the real ('True.csv') and fake ('Fake.csv') articles, keeping only the
# 'text' column of each file.
true_fake_news, fake_fake_news = (
    pd.read_csv(csv_path, usecols=['text'])
    for csv_path in ('./True.csv', './Fake.csv')
)

In [None]:
# Remove headers from positive examples
def fixer(x):
  """
  Strip the leading header from an article text.

  Everything up to and including the first occurrence of ") - " is removed
  (presumably an agency dateline such as "WASHINGTON (Reuters) - "; verify
  against the data). Text without that marker is returned unchanged.
  Non-string values (e.g. NaN produced by an empty CSV cell) are returned
  unchanged instead of raising AttributeError.
  @param x: a single cell value, normally the article text.
  @return: the text without its header, or x itself if nothing was stripped.
  """
  if not isinstance(x, str):
    return x
  _, marker, body = x.partition(") - ")
  return body if marker else x

# Strip the header from the 'text' column only, so re-running this cell after
# 'label' exists does not feed numeric labels to fixer. Label every real
# article with 1.0; the label vector is sized from the frame itself instead of
# a hard-coded row count.
true_fake_news = true_fake_news.assign(
    text=true_fake_news['text'].map(fixer),
    label=np.ones(len(true_fake_news)),
)

In [None]:
# Label every fake article with 0.0; size the label vector from the frame
# itself rather than a hard-coded row count.
fake_fake_news['label'] = np.zeros(len(fake_fake_news))

In [None]:
# Stack the fake articles and the real articles (in that order) into one frame.
all_fake_news = pd.concat(objs=[fake_fake_news, true_fake_news], axis=0)

In [None]:
# Encode the labels as integer category codes (categories are sorted, so the
# 0.0 label maps to code 0 and the 1.0 label to code 1).
all_fake_news['label'] = all_fake_news['label'].astype('category').cat.codes


In [None]:
# Select features and targets by column instead of popping them: `pop` strips
# the columns from all_fake_news, so re-running this cell would raise KeyError.
all_fake_news_x = all_fake_news['text']
all_fake_news_y = all_fake_news['label']

#Transform dataframes into Tensorflow dataset of (text, label) pairs
real_dataset = tf.data.Dataset.from_tensor_slices((all_fake_news_x, all_fake_news_y))



In [None]:
# Split data into training, testing, and validation sets: the first split
# holds out a part of the data, the second divides that held-out part into the
# test set and the validation set.
train_data_real, holdout_data_real = split_dataset(real_dataset, 0.4)
test_data_real, validation_data_real = split_dataset(holdout_data_real, 0.4)

## Explore the data 

Let's take a moment to understand the format of the data. Each example is the text of a news article and a corresponding label. The text is not preprocessed in any way, apart from the header that is stripped from the real articles. The label is an integer value of either 0 or 1, where 0 is a fake article, and 1 is a real article.

Let's print first 10 examples.

In [None]:
# Use the training split produced by split_dataset above (it is named
# train_data_real; no train_data_en is defined anywhere in this notebook).
train_data = train_data_real

In [None]:
# Pull the first 10 (text, label) pairs as one batch and display the texts.
train_examples_batch, train_labels_batch = next(iter(train_data.batch(10)))
train_examples_batch

Let's also print the first 10 labels.

## Build the model

We decided to use an LSTM network with pretrained embeddings for the detection of fake news in multiple languages.

The embedding we decided to use is the Universal Sentence Encoder (and its multilingual version) available at the Tensorflow Hub here: https://tfhub.dev/google/collections/universal-sentence-encoder/1




In [None]:
# Create an embedding layer from the selected pretrained sentence encoder.
embedding = "https://tfhub.dev/google/universal-sentence-encoder/4"
#embedding = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
hub_layer = hub.KerasLayer(
    handle=embedding,
    input_shape=[],    # every example is a single scalar value
    dtype=tf.string,   # ... of string type (the raw text)
    trainable=True,    # the encoder is trained together with the rest of the model
)

In [None]:
from tensorflow.keras.utils import plot_model

# Assemble the classifier as one ordered stack of layers.
model = tf.keras.Sequential([
    # Embedding layer: raw text in, sentence embedding out
    hub_layer,
    # Reshape each embedding to (512, 1) so the LSTM can consume it as a
    # sequence (assumes the encoder emits 512 values per example — TODO confirm)
    tf.keras.layers.Reshape(target_shape=(512, 1)),
    # Bidirectional LSTM layer with 256 units per direction
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256)),
    # Relu dense layer
    tf.keras.layers.Dense(64, activation='relu'),
    # Output layer: one unit without activation
    tf.keras.layers.Dense(1),
])

# Print the layer overview and save a diagram of the architecture.
model.summary()
plot_model(model, to_file='model_plot.png', show_layer_names=True)

In [None]:
# Compile the model with the 'Adam' optimizer. The loss is computed on raw
# logits (from_logits=True), so accuracy thresholds the output at 0.0.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[tf.metrics.BinaryAccuracy(threshold=0.0)],
)

## Train the model

Train the model for 10 epochs in mini-batches of 512 samples.

In [None]:
# Train on shuffled mini-batches and monitor the validation split produced by
# split_dataset (validation_data_real; validation_data_en is never defined).
history = model.fit(train_data.shuffle(50000).batch(512),
                    epochs=10,
                    validation_data=validation_data_real.batch(512),
                    verbose=1)

## Evaluate the model



In [None]:
# Score the held-out test split produced by split_dataset (test_data_real;
# test_data_en is never defined in this notebook).
results = model.predict(test_data_real.batch(512), verbose=2)

#Evaluate the model for loss and binary accuracy
model.evaluate(test_data_real.batch(512))



In [None]:
# Load a second, independent dataset to validate the trained model on data it
# has not seen in any form.
external_test_data = pd.read_csv(filepath_or_buffer='./fake_or_real_news.csv')


In [None]:
# Prepare the external dataset for validation: replace each label by its
# integer category code (categories are sorted, so codes follow the sorted
# order of the distinct label values).
# NOTE(review): the code-to-class mapping must match the training labels
# (0 = fake, 1 = real) — confirm against the label values in the CSV.
external_test_data['label'] = external_test_data['label'].astype('category').cat.codes

In [None]:
# Evaluate the trained model on the external dataset.
texts, labels = external_test_data['text'], external_test_data['label']
external_dataset = tf.data.Dataset.from_tensor_slices((texts, labels))
model.evaluate(external_dataset.batch(512))


In [None]:
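# Experiment log (pasted output, kept as comments so the cell can run): "nada raro" (nothing unusual)
# 22/22 - 27s
# 22/22 [==============================] - 28s 1s/step - loss: 0.0348 - binary_accuracy: 0.9916
# loss: -8.484
# binary_accuracy: -8.478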

In [None]:
# Experiment log (pasted output, kept as comments so the cell can run): og lstm 64
# 22/22 - 29s
# 22/22 [==============================] - 29s 1s/step - loss: 0.1349 - binary_accuracy: 0.9819
# loss: -6.970
# binary_accuracy: -6.970

In [None]:
# Experiment log (pasted output, kept as comments so the cell can run): lstm 256
# 22/22 - 31s
# 22/22 [==============================] - 31s 1s/step - loss: 0.0859 - binary_accuracy: 0.9903
# loss: -9.479
# binary_accuracy: -9.479

## Results:

* For the English dataset we achieved a binary accuracy of 99.03% with a loss of 0.085. This is a very good result. For the validation with an external dataset we achieved a binary accuracy of 62.76% with a loss of 3.37. This is a reasonably good result since **TODO: complete this justification.**
* For the Spanish dataset: **TODO** — results not yet reported.