<a href="https://colab.research.google.com/github/PazSheimy/Fake-News-Detention-Using-Machine-Learning/blob/main/Fake_News_Detection_with_RoBERTa_Milestone_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install transformers and PyTorch Lightning libraries

!pip install transformers
!pip install git+https://github.com/PytorchLightning/pytorch-lightning.git@master --upgrade

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 5.5 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 271 kB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 7.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 26.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 33.8 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  

In [None]:
# Import required libraries

import pandas as pd #data manipulation: merging, reshaping, selecting, data cleaning
import re #deal with pattern matching
import sklearn #use to buil machine learning models
import nltk #natural language processing
from sklearn.model_selection import train_test_split #model_selection is metod forsetting 
#a blueprint to analyze data and then using it to measure new data.
#train_test_split is a function in sklearn that splits arrays or other data
# into two subsets: one for training and one for testing. This
#function makes random partitions for the two subsets, so there is no need
#to divide the dataset manually
from google.colab import drive


from transformers import RobertaTokenizer #RoBERTa is derived from the GPT-2 tokenizer,
#using byte-level Byte-Pair Encoding (this tokenizer has been trained to treat spaces
# like parts of the token (like sentencepiece), so a word will be encoded differently depending on whether
# it is at the beginning of the sentence (without a space) or not)
import torch #open source machine learning library, provide wide range of algorithm for deep learning
#
from torch.utils.data import DataLoader, TensorDataset
import pytorch_lightning as pl

In [None]:
# Attach Google Drive to the Colab filesystem under /content/gdrive.
# force_remount=True asks for a fresh mount even when one already exists.

drive.mount(mountpoint='/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
# Read the fake-news and real-news datasets from the mounted Google Drive.
# Absolute paths are used so the reads do not depend on the current working
# directory happening to be the parent of the '/content/gdrive' mount point.

DATA_DIR = "/content/gdrive/MyDrive/fake news data"
fake_news = pd.read_csv(f"{DATA_DIR}/Khilnani_LP_fake_news.csv")
real_news = pd.read_csv(f"{DATA_DIR}/Khilnani_LP_real_news.csv")
fake_news.head()  # display the first rows of the fake-news frame as a sanity check

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


# Label, shuffle, and clean the data

In [None]:
# Tag every row with its class (fake news -> 1, real news -> 0), then stack
# the two frames on top of each other to form a single dataset.

for frame, class_id in ((fake_news, 1), (real_news, 0)):
    frame["label"] = class_id
data = pd.concat([fake_news, real_news], axis="index")

In [None]:
# Randomly shuffle the concatenated dataframe and renumber the rows from 0.
# A fixed random_state makes the shuffle repeatable across kernel restarts,
# so a Restart & Run All reproduces the same row order.

data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Narrow the frame down to the article body and its class label.

data = data.loc[:, ["text", "label"]]
data.head()

Unnamed: 0,text,label
0,Via: MRCTV,1
1,"Just recently, a USA Today sports columnist sa...",1
2,The news that Russia interfered in our electio...,1
3,In the first year of President Obama s term in...,1
4,CAIRO (Reuters) - Egypt s Foreign Minister Sam...,0


In [None]:
# Clean the text

# Download the NLTK "stopwords" corpus; clean_text reads it through nltk.corpus.stopwords.
nltk.download("stopwords")
def clean_text(text, stopwords=None):
  """Lower-case ``text``, strip punctuation and drop stop words.

  Args:
    text: Raw article text. ``None`` and NaN (what pandas yields for an empty
      CSV cell) are treated as an empty document instead of raising.
    stopwords: Optional iterable of lower-case words to remove. When omitted,
      NLTK's English stop-word list is loaded once and reused on later calls.

  Returns:
    The remaining words joined by single spaces.
  """
  # Missing values have no .lower(); map them to an empty document.
  if text is None or (isinstance(text, float) and text != text):
    return ""
  if stopwords is None:
    # Reading the corpus on every call is slow when this function is applied
    # row by row, so the default list is cached on the function object.
    cached = getattr(clean_text, "_english_stopwords", None)
    if cached is None:
      cached = frozenset(nltk.corpus.stopwords.words('english'))
      clean_text._english_stopwords = cached
    stopwords = cached
  elif not isinstance(stopwords, (set, frozenset)):
    stopwords = frozenset(stopwords)  # constant-time membership tests
  text = text.lower() # Convert to lower case
  # Punctuation is removed before the stop-word filter, so contractions such as
  # "don't" become "dont" and are compared in that form.
  text = re.sub(r'[^\w\s]', '', text) # Remove everything except word characters and whitespace
  return " ".join(word for word in text.split() if word not in stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Run every article body through clean_text and store the result back in "text".
data["text"] = data["text"].map(clean_text)

In [None]:
x = data["text"].values
y = data["label"].values

# Split into training and validation sets (train_test_split's default sizes).
# random_state makes the partition repeatable; stratify=y keeps the fake/real
# proportions the same in both subsets.

train_data, val_data, train_labels, val_labels = train_test_split(
    x, y, random_state=42, stratify=y
)

In [None]:
# Load pre-trained RobertaTokenizer
# The 'roberta-base' checkpoint name is the same one used when the RobertaModel
# is loaded further down in this notebook.

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
# Tokenize the articles.
# Both splits go through the tokenizer with the same settings: PyTorch tensors
# as output, padding enabled, and truncation to at most 64 tokens.

tokenizer_options = dict(return_tensors="pt", padding=True, truncation=True, max_length=64)
train_tokens = tokenizer(list(train_data), **tokenizer_options)
val_tokens = tokenizer(list(val_data), **tokenizer_options)

In [None]:
# Create lists of tensors, one list per split: [input_ids, attention_mask, labels].
# Use the GPU when one is present and fall back to the CPU otherwise, so this
# cell does not fail on a runtime without CUDA.

device = "cuda" if torch.cuda.is_available() else "cpu"
trn = [train_tokens["input_ids"].to(device), train_tokens["attention_mask"].to(device), torch.tensor(train_labels).to(device)]
val = [val_tokens["input_ids"].to(device), val_tokens["attention_mask"].to(device), torch.tensor(val_labels).to(device)]

In [None]:
# Dataloader class

BATCH_SIZE = 32
class ClassificationData(pl.LightningDataModule):
    """LightningDataModule wrapping pre-built training and validation tensors.

    Args:
        trn: Sequence of tensors for the training split; they are unpacked
            into a TensorDataset, so all must share the same first dimension.
        val: Sequence of tensors for the validation split, same layout as trn.
        batch_size: Number of samples per batch. Defaults to BATCH_SIZE.
    """
    def __init__(self, trn, val, batch_size=BATCH_SIZE):
        super().__init__()

        # Reshuffle the training samples every epoch; validation order stays fixed.
        self.trn = DataLoader(TensorDataset(*trn), batch_size=batch_size, shuffle=True)
        self.val = DataLoader(TensorDataset(*val), batch_size=batch_size)

    def train_dataloader(self):
        """Return the DataLoader over the training split."""
        return self.trn

    def val_dataloader(self):
        """Return the DataLoader over the validation split."""
        return self.val

dls = ClassificationData(trn, val)

In [None]:
# Load pre-trained RobertaModel
# transformers and PyTorch Lightning are already installed by the first cell of
# this notebook; reinstalling them here, after both have been imported, is
# unnecessary, so this cell only loads the encoder weights.

from transformers import RobertaModel
roberta_model = RobertaModel.from_pretrained('roberta-base')

Collecting git+https://github.com/PytorchLightning/pytorch-lightning.git@master
  Cloning https://github.com/PytorchLightning/pytorch-lightning.git (to revision master) to /tmp/pip-req-build-hf8x270g
  Running command git clone -q https://github.com/PytorchLightning/pytorch-lightning.git /tmp/pip-req-build-hf8x270g
  Running command git submodule update --init --recursive -q
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone


Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# The RobertaClassifier class

import torch
import pytorch_lightning as pl

class RobertaClassifier(pl.LightningModule):
    """RoBERTa encoder followed by a two-layer classification head.

    Args:
        dropout_p: Dropout probability applied after the hidden layer.
        hid_dim: Width of the hidden layer. It is also the input width of
            linear_1, so it must match the size of the encoder's output vectors.
        output_dim: Number of output classes.
        roberta: Encoder module to fine-tune. Defaults to the module-level
            ``roberta_model``.
    """
    def __init__(self, dropout_p, hid_dim, output_dim, roberta=None):
        super().__init__()
        self.roberta = roberta_model if roberta is None else roberta
        self.dropout = torch.nn.Dropout(dropout_p)
        self.linear_1 = torch.nn.Linear(hid_dim, hid_dim)
        self.linear_2 = torch.nn.Linear(hid_dim, output_dim)
        # NLLLoss expects log-probabilities, which forward() produces via log_softmax.
        self.loss = torch.nn.NLLLoss()

    def forward(self, input_ids, attention_mask):
        """Return per-class log-probabilities for each sequence in the batch."""
        # Element 0 of the encoder output is indexed as [:, 0] below, i.e. the
        # vector at the first position of every sequence is used as its summary.
        # (assumes that output is laid out as batch x sequence x hidden - TODO confirm)
        encoded = self.roberta(input_ids, attention_mask=attention_mask)[0]
        first_token = encoded[:, 0]
        hidden = self.dropout(torch.relu(self.linear_1(first_token)))
        return torch.log_softmax(self.linear_2(hidden), dim=1)

    def _step_loss(self, batch):
        """Compute the loss for one batch of (input_ids, attention_mask, labels)."""
        pred = self(batch[0], batch[1])
        return self.loss(pred, batch[2].view(-1))

    def training_step(self, batch, ix):
        """Return the training loss for ``batch`` and log it as ``train_loss``."""
        loss = self._step_loss(batch)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, ix):
        """Return the validation loss for ``batch`` and log it as ``val_loss``."""
        loss = self._step_loss(batch)
        # Without logging, the validation loss is computed and then discarded.
        self.log("val_loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Optimise every parameter (encoder and head) with Adam at lr=1e-5."""
        return torch.optim.Adam(self.parameters(), lr=1e-5)

m = RobertaClassifier(0.5, 768, 2)

In [None]:
# Train the model

# dls is the object of the dataloader class
# Choose the accelerator from what is actually available instead of assuming a
# GPU. accelerator=/devices= replaces the older gpus= argument, which newer
# PyTorch Lightning releases no longer accept.
device = "cuda" if torch.cuda.is_available() else "cpu"
t = pl.Trainer(
    max_epochs=1,
    accelerator="gpu" if device == "cuda" else "cpu",
    devices=1,
)
t.fit(m.to(device), dls)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Missing logger folder: /content/lightning_logs

  | Name     | Type         | Params
------------------------------------------
0 | roberta  | RobertaModel | 124 M 
1 | dropout  | Dropout      | 0     
2 | linear_1 | Linear       | 590 K 
3 | linear_2 | Linear       | 1.5 K 
4 | loss     | NLLLoss      | 0     
------------------------------------------
125 M     Trainable params
0         Non-trainable params
125 M     Total params
500.951   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [None]:
# Predict on a single validation batch (the model outputs log-probabilities)
val_batch = next(iter(dls.val))

device = "cuda" if torch.cuda.is_available() else "cpu"
m.to(device)
m.eval()  # switch off dropout so the predictions are deterministic

with torch.no_grad():  # inference only: no autograd graph is needed
    val_pred = m(val_batch[0].to(device), val_batch[1].to(device)) # m is the model created
val_label = val_pred.argmax(dim=1).cpu().numpy()  # index of the most likely class

# reshape(-1) flattens whatever batch length was returned, so a final batch
# shorter than BATCH_SIZE does not raise.
val_true = val_batch[2].reshape(-1).cpu().numpy()

In [None]:
# Calculate precision, recall and F1-score

# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.metrics` is available as an attribute.
import sklearn.metrics

# Use one averaging mode for all three metrics so the numbers are comparable
# (previously F1 was macro-averaged while precision and recall were computed
# for the positive class only).
f_score = sklearn.metrics.f1_score(val_true, val_label, average="macro")
precision_score = sklearn.metrics.precision_score(val_true, val_label, average="macro")
recall_score = sklearn.metrics.recall_score(val_true, val_label, average="macro")

print(f_score, precision_score, recall_score)

0.9684729064039409 1.0 0.9444444444444444
