<a href="https://colab.research.google.com/github/Sandeep0076/Fake-News-Detection-System-for-Covid-19/blob/main/Model%20Deployment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  Features with predictions and f1 score
I have not made UI yet


In [None]:
# Install the packages needed by the model code below.
# transformers is constrained to the 3.4.x series (compatible-release specifier).
!pip install transformers~=3.4.0
# boilerpy3 supplies the `extractors` module used by get_text_from_url.
!pip install boilerpy3
# Disabled alternatives, kept for reference:
#!pip install transformers
#pip install pytorch-lightning



In [None]:
# Mount Google Drive at /content/Gdrive; the dataset CSV and the saved model
# checkpoints referenced later in this notebook are read from that mount.
from google.colab import drive
drive.mount("/content/Gdrive")

Drive already mounted at /content/Gdrive; to attempt to forcibly remount, call drive.mount("/content/Gdrive", force_remount=True).


## Imports

In [None]:
%%writefile lstm_bert_model.py
# This cell is written to lstm_bert_model.py, which app.py imports as a module,
# so the Drive mount is repeated here at import time.
from google.colab import drive
drive.mount("/content/Gdrive")
# CSV with the news articles; used as the default `file_location` of FakeNewsDataset.
file_location="/content/Gdrive/MyDrive/covid19_news_dataset.csv"
# Bare string literal: it is evaluated and discarded, so it has no runtime effect.
"""## Imports"""
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support as score

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,Dataset,random_split,SubsetRandomSampler

import transformers
from transformers import  BertTokenizer, RobertaModel, BertModel, AdamW, AutoConfig,get_linear_schedule_with_warmup

import time 
from datetime import  date
import warnings
import collections 
from operator import truediv
from boilerpy3 import extractors
from urllib.parse import quote,unquote,urlparse
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup

warnings.filterwarnings('ignore')

## Classes
# Base bert class for training and classification of the model
class Bert_TextClassification_Model(nn.Module):
    """BERT encoder followed by a single linear classification layer.

    The number of output classes is taken from the module-level ``NUM_CLASSES``.
    """

    def __init__(self):
        super().__init__()
        self.num_classes = NUM_CLASSES
        self.bert_path = 'bert-base-uncased'
        # Gradient checkpointing is enabled to reduce memory usage. The original
        # author reports that, together with deleting unused variables, this
        # brought consumption down from more than 11 GB to about 4 GB.
        bert_config = AutoConfig.from_pretrained(self.bert_path,
                                                 gradient_checkpointing=True)
        # Set by the original author with the intent of conserving memory.
        bert_config.use_bfloat16 = True
        self.config = bert_config
        self.bert = transformers.BertModel.from_pretrained(self.bert_path,
                                                           config=bert_config)
        # 768 input features feed the classification layer.
        self.out = nn.Linear(768, self.num_classes)

    def forward(self, ids, mask, token_type_ids):
        """Return the class logits computed from BERT's pooled output."""
        encoder_outputs = self.bert(ids,
                                    attention_mask=mask,
                                    token_type_ids=token_type_ids)
        # The encoder output is unpacked as exactly two values; only the second
        # (pooled) one is classified.
        _, pooled_out = encoder_outputs
        logits = self.out(pooled_out)
        return logits

# Takes the fine-tuned BERT encoder (without its last classification layer) and adds an LSTM and a linear layer on top of it to achieve higher accuracy
class RoBERT_Model(nn.Module):
    """Fine-tuned BERT encoder followed by an LSTM over per-chunk embeddings.

    A document arrives as several token chunks. BERT yields one pooled vector of
    size 768 per chunk, the vectors of one document are fed to the LSTM as a
    sequence, and the LSTM's final hidden state (size 100) is classified by a
    linear layer. The number of classes comes from the module-level
    ``NUM_CLASSES``.
    """

    def __init__(self, bertFineTuned):
        super().__init__()
        self.num_classes = NUM_CLASSES
        # Previously trained encoder; it is reused with the weights it already has.
        self.bertFineTuned = bertFineTuned
        self.lstm = nn.LSTM(768, 100, num_layers=1, bidirectional=False)
        self.out = nn.Linear(100, self.num_classes)

    def forward(self, ids, mask, token_type_ids, num_chunks):
        """Classify a batch of documents.

        ``ids``, ``mask`` and ``token_type_ids`` contain the chunks of all
        documents concatenated along the first dimension. ``num_chunks`` lists
        how many consecutive chunks belong to each document.
        """
        _, pooled_out = self.bertFineTuned(
            ids, attention_mask=mask, token_type_ids=token_type_ids)

        # Regroup the flat list of chunk embeddings into one tensor per document.
        per_document = pooled_out.split_with_sizes(num_chunks)
        chunk_counts = torch.LongTensor([len(doc_chunks) for doc_chunks in per_document])
        del pooled_out

        # Documents differ in their number of chunks, so they are padded to a
        # common length (filler value -99) and then packed with their true
        # lengths before being handed to the LSTM.
        padded = nn.utils.rnn.pad_sequence(per_document,
                                           padding_value=-99,
                                           batch_first=True)
        time_major = padded.transpose(0, 1)  # (B, L, D) -> (L, B, D)
        packed = nn.utils.rnn.pack_padded_sequence(time_major,
                                                   chunk_counts.cpu().numpy(),
                                                   batch_first=False,
                                                   enforce_sorted=False)

        # Only the final hidden state is used; outputs and cell state are dropped.
        _, (h_t, _) = self.lstm(packed)
        final_hidden = h_t.view(-1, 100)
        return self.out(final_hidden)

# Train and Evaluate - Bert Model
def train_eval_bert_model(data_loader, model, optimizer, device, mode, scheduler=None):
    """Run one pass of the BERT classifier over ``data_loader``.

    Every chunk of every document is treated as its own example with its own
    target.

    Args:
        data_loader: yields batches; a batch is a list of per-document dicts
            with the keys "ids", "mask", "token_type_ids" and "targets", each
            holding one tensor whose first dimension indexes the chunks.
        model: called as ``model(ids=..., mask=..., token_type_ids=...)``.
        optimizer: stepped once per batch in 'train' mode; unused in 'eval' mode.
        device: device the batch tensors are moved to.
        mode: 'train' or 'eval'.
        scheduler: optional learning-rate scheduler, stepped after the optimizer.

    Returns:
        In 'train' mode the list of per-batch losses. In 'eval' mode a tuple
        ``(probabilities, targets, losses)`` where the first two are NumPy
        arrays concatenated over all batches (empty arrays when the loader
        yields nothing).

    Raises:
        ValueError: if ``mode`` is neither 'train' nor 'eval'.
    """
    if mode not in ('train', 'eval'):
        # Fail immediately instead of printing and crashing later on an
        # unbound result variable.
        raise ValueError(f"wrong mode given: {mode!r} (expected 'train' or 'eval')")

    training = mode == 'train'
    if training:
        model.train()
        t0 = time.time()
    else:
        model.eval()

    losses = []
    target_res = []
    output_res = []
    for batch_idx, batch in enumerate(data_loader):
        # Each entry of `batch` is one document; concatenate the chunks of all
        # documents into flat tensors and move them to the target device.
        ids = torch.cat([data["ids"] for data in batch]).to(device, dtype=torch.long)
        mask = torch.cat([data["mask"] for data in batch]).to(device, dtype=torch.long)
        token_type_ids = torch.cat(
            [data["token_type_ids"] for data in batch]).to(device, dtype=torch.long)
        targets = torch.cat([data["targets"] for data in batch]).to(device, dtype=torch.long)

        if training:
            # Reset gradients left over from the previous step.
            optimizer.zero_grad()
            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)

            # Release the inputs early; they consume quite a bit of memory.
            del ids, mask, token_type_ids

            loss = loss_fun(outputs, targets)
            loss.backward()
            # Kept from the original implementation: cast parameters to float32
            # before the optimizer step.
            model.float()
            optimizer.step()
            if scheduler:
                scheduler.step()
            losses.append(loss.item())
            del loss
            if batch_idx % 1000 == 0:
                print(
                    f"batch index = {batch_idx} / {len(data_loader)} ({100 * batch_idx / len(data_loader):.2f}%), loss = {np.mean(losses[-10:]):.4f}, time = {(time.time()-t0)//60} minutes ")
                t0 = time.time()
        else:
            with torch.no_grad():
                outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
                del ids, mask, token_type_ids
                loss = loss_fun(outputs, targets)
                losses.append(loss.item())
                del loss

            target_res.append(targets.cpu().detach().numpy())
            output_res.append(torch.softmax(
                outputs, dim=1).cpu().detach().numpy())

    if training:
        return losses
    if not output_res:
        # Nothing was evaluated: return empty results instead of failing.
        return np.array([]), np.array([]), losses
    # Concatenate once after the loop rather than on every batch.
    return np.concatenate(output_res), np.concatenate(target_res), losses

# Train and Evaluate - RoBert Model
def train_eval_robert_model(data_loader, model, optimizer, device, mode, scheduler=None):
    """Run one pass of the BERT+LSTM (RoBERT) model over ``data_loader``.

    Unlike the plain BERT pass, each document is a single example: only the
    target of its first chunk is used, and the number of chunks per document is
    handed to the model so it can regroup the chunk embeddings.

    Args:
        data_loader: yields batches; a batch is a list of per-document dicts
            with the keys "ids", "mask", "token_type_ids", "targets" and "len".
        model: called as ``model(ids=..., mask=..., token_type_ids=...,
            num_chunks=...)``.
        optimizer: stepped once per batch in 'train' mode; unused in 'eval' mode.
        device: device the batch tensors are moved to.
        mode: 'train' or 'eval'.
        scheduler: optional learning-rate scheduler, stepped after the optimizer.

    Returns:
        In 'train' mode the list of per-batch losses. In 'eval' mode a tuple
        ``(probabilities, targets, losses)`` where the first two are NumPy
        arrays concatenated over all batches (empty arrays when the loader
        yields nothing).

    Raises:
        ValueError: if ``mode`` is neither 'train' nor 'eval'.
    """
    if mode not in ('train', 'eval'):
        # Fail immediately instead of printing and crashing later on an
        # unbound result variable.
        raise ValueError(f"wrong mode given: {mode!r} (expected 'train' or 'eval')")

    training = mode == 'train'
    if training:
        model.train()
        t0 = time.time()
    else:
        model.eval()

    losses = []
    target_res = []
    output_res = []
    for batch_idx, batch in enumerate(data_loader):
        # Flatten the chunks of all documents in the batch.
        ids = torch.cat([data["ids"] for data in batch]).to(device, dtype=torch.long)
        mask = torch.cat([data["mask"] for data in batch]).to(device, dtype=torch.long)
        token_type_ids = torch.cat(
            [data["token_type_ids"] for data in batch]).to(device, dtype=torch.long)
        # One target per document: the target of its first chunk.
        targets = torch.stack(
            [data["targets"][0] for data in batch]).to(device, dtype=torch.long)
        # Plain Python ints, e.g. [5] for a single document with five chunks.
        num_chunks = [x.item() for x in torch.cat([data['len'] for data in batch])]

        if training:
            # Reset gradients left over from the previous step.
            optimizer.zero_grad()
            outputs = model(ids=ids, mask=mask,
                            token_type_ids=token_type_ids, num_chunks=num_chunks)
            del ids, mask, token_type_ids, num_chunks
            loss = loss_fun(outputs, targets)
            loss.backward()
            # Kept from the original implementation: cast parameters to float32
            # before the optimizer step.
            model.float()
            optimizer.step()
            if scheduler:
                scheduler.step()
            losses.append(loss.item())
            del loss
            if batch_idx % 1000 == 0:
                print(
                    f"batch index = {batch_idx} / {len(data_loader)} ({100*batch_idx / len(data_loader):.2f}%), loss = {np.mean(losses[-10:]):.4f}, time = {(time.time()-t0)//60} minutes")
                t0 = time.time()
        else:
            with torch.no_grad():
                outputs = model(ids=ids, mask=mask,
                                token_type_ids=token_type_ids, num_chunks=num_chunks)
                loss = loss_fun(outputs, targets)
                losses.append(loss.item())
            target_res.append(targets.cpu().detach().numpy())
            output_res.append(torch.softmax(
                outputs, dim=1).cpu().detach().numpy())

    if training:
        return losses
    if not output_res:
        # Nothing was evaluated: return empty results instead of failing.
        return np.array([]), np.array([]), losses
    # Concatenate once after the loop rather than on every batch.
    return np.concatenate(output_res), np.concatenate(target_res), losses

# To create a dataloader with variable-size input
def my_collate1(batches):
    """Collate documents with a variable number of chunks.

    Each element of ``batches`` is a dict mapping a field name to a list of
    tensors. The lists are stacked into one tensor per field, and the documents
    are returned as a plain list so they need not share a shape.
    """
    collated = []
    for sample in batches:
        stacked = {}
        for field, tensors in sample.items():
            stacked[field] = torch.stack(tensors)
        collated.append(stacked)
    return collated

# Loss function
def loss_fun(outputs, targets):
    """Return the cross-entropy loss of the logits ``outputs`` against ``targets``."""
    return nn.CrossEntropyLoss()(outputs, targets)

# For predictions and accuracy
def evaluate(target, predicted):
    """Compare predicted class scores with the true labels.

    Args:
        target: sequence of true class indices.
        predicted: sequence of per-class score vectors; the predicted class of
            an example is the index of its highest score.

    Returns:
        Dict with the keys "Accuracy", "No. of examples", "True prediction" and
        "False prediction". "Accuracy" is 0.0 when there are no predictions.
    """
    nb_prediction = len(predicted)
    true_prediction = sum(
        1 for i, scores in enumerate(predicted) if np.argmax(scores) == target[i])
    false_prediction = nb_prediction - true_prediction
    # An empty evaluation set would otherwise divide by zero.
    accuracy = true_prediction / nb_prediction if nb_prediction else 0.0
    return {
        "Accuracy": accuracy,
        "No. of examples": len(target),
        "True prediction": true_prediction,
        "False prediction": false_prediction,
    }
#----------------------------------------------
# can join evaluate and f1 score and remove accuracy 

# Precision (P) is defined as the number of true positives (TP) over
# the number of true positives plus the number of false positives (FP): P = TP / (TP + FP).
def get_f1_precision_recall(y_test,y_pred):
  """Print per-class precision, recall and F-score of ``y_pred`` against ``y_test``.

  Note: this changes NumPy's global print options so that floats are shown
  with two decimals.
  """
  np.set_printoptions(formatter={'float_kind': "{:.2f}".format})

  # The fourth value returned by the scorer (support) is not reported.
  precision, recall, fscore, _support = score(y_test, y_pred)
  report = (
      ('precision: ', precision),
      ('recall   : ', recall),
      ('fscore   : ', fscore),
  )
  for caption, values in report:
    print('{}{}'.format(caption, values))

# For plotting confusion matrix
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print a short notice and draw the confusion matrix ``cm`` with matplotlib.

    With ``normalize=True`` every row is divided by its row sum before
    plotting and cell values are shown with two decimals; otherwise the raw
    integer counts are shown. ``classes`` supplies the tick labels.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    cell_format = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text, the others black text.
    half_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            text_color = "white" if value > half_max else "black"
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

# To create, clean and divide the dataset into chunks
class FakeNewsDataset(Dataset):
    """Dataset of news articles, cleaned and split into overlapping token chunks.

    Every item is one article. Its text (title, body and base URL joined by
    spaces) is tokenized and cut into chunks of ``chunk_len`` token ids, so that
    an article longer than one chunk can be processed chunk by chunk.

    Args:
        tokenizer: tokenizer providing ``encode_plus``.
        max_len: stored on the instance; not used by the methods of this class.
        chunk_len: number of token ids per chunk, special tokens included.
        overlap_len: number of tokens repeated at the start of the next chunk.
        max_size_dataset: if set (and non-zero), keep at most this many articles.
        file_location: CSV file read in 'train_eval' mode; it must provide the
            columns title, body, label and url.
        min_len: articles whose body has at most this many words are dropped.
        mode: 'train_eval' reads the CSV, 'test' uses ``testdf``.
        testdf: DataFrame with the columns title, body, label and url
            (used in 'test' mode only).
    """

    def __init__(self, tokenizer, max_len, chunk_len=200, overlap_len=50, max_size_dataset=None,
                  file_location=file_location, min_len=10, mode='train_eval', testdf=None):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.overlap_len = overlap_len
        self.chunk_len = chunk_len
        self.min_len = min_len
        self.max_size_dataset = max_size_dataset
        self.testdf = testdf
        self.mode = mode
        self.data, self.label,self.label_count,self.label_mapping= self.process_data(file_location,mode,testdf )

    def process_data(self, file_location,mode,testdf):
        """Load, filter, encode and clean the articles.

        Returns:
            Tuple ``(texts, labels, label_count, label_mapping)``: the cleaned
            article texts, their integer-encoded labels, the count of every
            original label, and a dict from original label to its integer code.

        Raises:
            ValueError: if ``mode`` is neither 'train_eval' nor 'test'.
        """
        if mode == 'train_eval':
            # Dataset for training and evaluation: rows without body or title are dropped.
            df = pd.read_csv(file_location)
            train_raw = df[df.body.notnull()]
            train_raw = train_raw[train_raw.title.notnull()]
        elif mode == 'test':
            # Dataset for predicting a single article.
            train_raw = testdf.copy()
        else:
            # Previously an unknown mode fell through and failed on an unbound variable.
            raise ValueError(f"unknown mode {mode!r} (expected 'train_eval' or 'test')")

        train_raw['base_url'] = get_base_url (train_raw.url)
        # Keep only articles whose body has more than `min_len` words.
        train_raw = train_raw.assign(len_txt=train_raw.body.apply(lambda x: len(x.split())))
        train_raw = train_raw[train_raw.len_txt > self.min_len]

        train_raw = train_raw[['title','body', 'label','base_url']]
        train_raw.reset_index(inplace=True, drop=True)
        label_count = train_raw['label'].value_counts()
        LE = LabelEncoder()
        train_raw['label'] = LE.fit_transform(train_raw['label'])
        label_mapping = dict(zip(LE.classes_, range(len(LE.classes_))))

        train = train_raw.copy()
        del train_raw
        if (self.max_size_dataset):
            # Positional slice: `.loc[0:n]` is label-inclusive and kept n + 1 rows.
            train = train.iloc[:self.max_size_dataset]
        # Shuffle the rows.
        train = train.reindex(np.random.permutation(train.index))

        train['body'] = train.body.apply(self.clean_txt)
        train['title'] = train.title.apply(self.clean_txt)
        train['text'] = train[['title','body', 'base_url']].agg(' '.join, axis=1)
        return train['text'].values, train['label'].values,label_count,label_mapping

    def clean_txt(self, text):
        """Normalize ``text``: strip apostrophes and non-word characters, lowercase it and collapse whitespace."""
        # Remove apostrophes first so that contractions are not split in two.
        text = re.sub("'", "", text)
        # Replace every run of non-word characters with a single space.
        text = re.sub("(\\W)+", " ", text)
        # Lowercase, then drop any character outside the allowed set.
        text = re.sub(r'[^A-Z-a-z. \d/,]', '', text.lower())
        text = ' '.join(text.split())
        return text

    def long_terms_tokenizer(self, data_tokenize, targets):
        """Split one tokenized article into overlapping chunks.

        Args:
            data_tokenize: result of ``encode_plus`` for the article; it holds
                the first chunk plus the overflowing tokens.
            targets: integer label of the article; repeated for every chunk.

        Returns:
            Dict with the keys 'ids', 'mask', 'token_type_ids' and 'targets'
            (each a list with one tensor per chunk) and 'len' (a one-element
            list holding the number of chunks).
        """
        input_ids_list = []
        attention_mask_list = []
        token_type_ids_list = []
        targets_list = []

        previous_input_ids = data_tokenize["input_ids"].reshape(-1)
        previous_attention_mask = data_tokenize["attention_mask"].reshape(-1)
        previous_token_type_ids = data_tokenize["token_type_ids"].reshape(-1)

        # Tokens that did not fit into the first chunk.
        overflowing_ids = data_tokenize.get("overflowing_tokens").reshape(-1)  # added .reshape(-1)
        targets = torch.tensor(targets, dtype=torch.int)

        # The first chunk is used exactly as the tokenizer produced it.
        input_ids_list.append(previous_input_ids)
        attention_mask_list.append(previous_attention_mask)
        token_type_ids_list.append(previous_token_type_ids)
        targets_list.append(targets)

        if overflowing_ids.nelement() != 0 :
            overflowing_ids = torch.tensor(overflowing_ids, dtype=torch.long)
            # Candidate end positions inside `overflowing_ids`, spaced by the
            # number of new tokens per chunk: chunk_len minus the overlap minus
            # the two special tokens (148 with the defaults 200 and 50).
            idxs = range(len(overflowing_ids) + self.chunk_len)
            idxs = idxs[(self.chunk_len - self.overlap_len - 2)
                        ::(self.chunk_len - self.overlap_len - 2)]
            # Last `overlap_len` tokens of the first chunk, excluding its final token.
            input_ids_first_overlap = previous_input_ids[-(
                    self.overlap_len + 1):-1]
            # Ids 101 and 102 are placed at the start and end of every extra chunk.
            start_token = torch.tensor([101], dtype=torch.long)
            end_token = torch.tensor([102], dtype=torch.long)

            for i, idx in enumerate(idxs):
                if i == 0:
                    # Overlap from the first chunk followed by the first new tokens.
                    input_ids = torch.cat(
                        (input_ids_first_overlap, overflowing_ids[:idx]))
                elif i == len(idxs):
                    # NOTE(review): enumerate never yields i == len(idxs), so
                    # this branch is never taken.
                    input_ids = overflowing_ids[idx:]
                elif previous_idx >= len(overflowing_ids):
                    # All overflowing tokens have been consumed.
                    break
                else:
                    # Middle chunks: step back `overlap_len` tokens from the
                    # previous end position and read up to the current one.
                    input_ids = overflowing_ids[(previous_idx - self.overlap_len):idx]

                previous_idx = idx

                # Build the attention mask and token type ids matching the chunk.
                nb_token = len(input_ids) + 2
                attention_mask = torch.ones(self.chunk_len, dtype=torch.long)  # chunk len = 200
                attention_mask[nb_token:self.chunk_len] = 0
                token_type_ids = torch.zeros(self.chunk_len, dtype=torch.long)
                input_ids = torch.cat((start_token, input_ids, end_token))
                # Pad chunks shorter than chunk_len with zeros.
                if self.chunk_len - nb_token > 0:
                    padding = torch.zeros(
                        self.chunk_len - nb_token, dtype=torch.long)
                    input_ids = torch.cat((input_ids, padding))

                input_ids_list.append(input_ids)
                attention_mask_list.append(attention_mask)
                token_type_ids_list.append(token_type_ids)
                targets_list.append(targets)
        return ({
            'ids': input_ids_list,
            'mask': attention_mask_list,
            'token_type_ids': token_type_ids_list,
            'targets': targets_list,
            'len': [torch.tensor(len(targets_list), dtype=torch.long)]
        })

    def __getitem__(self, idx):
        """Tokenize article ``idx`` and return its chunks (see ``long_terms_tokenizer``)."""
        body = str(self.data[idx])
        targets = int(self.label[idx])
        # Truncate to one chunk and ask for the remaining tokens, which
        # long_terms_tokenizer turns into the following chunks.
        data = self.tokenizer.encode_plus(
            body,
            truncation=True,
            max_length=self.chunk_len,
            padding='longest',
            add_special_tokens=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_overflowing_tokens=True,
            return_tensors='pt')
        long_token = self.long_terms_tokenizer(data, targets)
        return long_token

    def __len__(self):
        """ Return data length """
        return self.label.shape[0]

# For extracting base url from complete url
def get_base_url (long_url):
  """Return the network location (host part) of every URL in ``long_url``.

  Each URL is percent-decoded before it is parsed. Entries that are not
  strings, such as NaN or None from a missing CSV cell, yield an empty string
  instead of raising.

  Args:
      long_url: iterable of URLs.

  Returns:
      List with one network location per input entry, in the same order.
  """
  ls = []
  for item in long_url:
      if not isinstance(item, (str, bytes)):
          # Missing URL: there is nothing to parse.
          ls.append('')
          continue
      ls.append(urlparse(unquote(item)).netloc)
  return ls

# For extracting article from the URL 
def get_text_from_url(url):
  """Download ``url`` and return its article text as a one-row DataFrame.

  The page is fetched with a browser-like User-Agent. If the response is OK
  the article text is extracted with boilerpy3's ArticleExtractor; otherwise
  ``get_body_from_soup`` is used as a fallback.

  Args:
      url: address of the article.

  Returns:
      DataFrame with the columns title, body, label and url. On success it has
      one row whose title is 'news' and whose label is 'fake' (fixed values,
      presumably placeholders so the row has the columns FakeNewsDataset
      expects). If the page cannot be fetched or parsed, a message is printed
      and an empty DataFrame with the same columns is returned.
  """
  columns = ['title','body','label','url']
  try:
    extractor = extractors.ArticleExtractor()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
    }
    # A timeout keeps an unresponsive server from blocking the caller forever.
    resp = requests.get(url, headers=headers, timeout=30)
    if resp.ok:
        doc = extractor.get_content(resp.text)
    else:
        doc = get_body_from_soup(url)
  except Exception:
    # Previously a bare `except` was followed by returning an unbound variable.
    print("Please use different URL")
    return pd.DataFrame(columns=columns)
  # Build the frame in one step; DataFrame.append no longer exists in pandas 2.x.
  return pd.DataFrame(
      [{'title': 'news', 'body': doc, 'label': 'fake', 'url': url}],
      columns=columns)

# Fallback used by get_text_from_url when the HTTP response is not OK: collect the page text with BeautifulSoup
def get_body_from_soup(url):
  """Fetch ``url`` and return the text found directly inside selected HTML tags.

  Every text node whose parent tag is one of body, title, p, div, article or
  h1-h4 is kept. The kept nodes are concatenated, each followed by one space.

  Args:
      url: address of the page.

  Returns:
      The concatenated text (an empty string if nothing matched).
  """
  # A timeout keeps an unresponsive server from blocking the caller forever.
  res = requests.get(url, timeout=30)
  soup = BeautifulSoup(res.content, 'html.parser')
  want = ['body','title','p','div','article','h1','h2','h3','h4']
  # Join once instead of growing a string inside a loop.
  return ''.join(
      '{} '.format(t) for t in soup.find_all(text=True) if t.parent.name in want)

def predict_article(url,model,model_roberta):
  """Predict the label of the news article found at ``url``.

  The article is downloaded, checked for Covid-19 related keywords, split
  into chunks by FakeNewsDataset and classified with ``model_roberta``.

  Args:
      url: address of the article.
      model: fine-tuned BERT classifier; accepted for interface compatibility
          but not used here.
      model_roberta: BERT+LSTM model that produces the prediction.

  Returns:
      The predicted label, looked up in the module-level
      ``dataset.label_mapping``, or an explanatory message when the article is
      not Covid-19 related or is too short to classify.
  """
  testarticle = get_text_from_url(url)

  # Check whether the article is related to Covid-19 at all.
  cov_rel_words = 'covid|corona|covid-19|covid19|coronavirus|pandemic|crisis|disease|quarantine|distancing|immunity|symptoms|viruses|epidemic|lockdown|vaccine'
  # na=False treats a missing body as "not related"; any() also copes with an
  # empty frame, which Series.bool() does not.
  related = testarticle['body'].str.lower().str.contains(cov_rel_words, na=False)
  if not related.any():
    return 'Please enter url related to Coronavirus'

  # FakeNewsDataset drops articles whose body has at most MIN_LEN words, which
  # would leave nothing to classify.
  if len(str(testarticle['body'].iloc[0]).split()) <= MIN_LEN:
    return 'Article text is too short to classify. Please try with a different URL'

  testdataset = FakeNewsDataset(
                            tokenizer=bert_tokenizer,
                            min_len=MIN_LEN,
                            max_len=MAX_LEN,
                            chunk_len=CHUNK_LEN,
                            max_size_dataset=1,
                            overlap_len=OVERLAP_LEN,
                            mode='test',
                            testdf=testarticle
                            )

  test_data_loader=DataLoader(testdataset,
                                batch_size=1,
                                sampler=None,
                                collate_fn=my_collate1)

  # Inference only: the evaluation pass never touches the optimizer, so none
  # is created.
  output, _target, _val_losses = train_eval_robert_model(
      test_data_loader, model_roberta, None, device, 'eval')
  # Index of the most probable class for the (single) article.
  y_pred = np.argmax(output, axis=1).flatten()[0]
  # Invert label -> index into index -> label.
  pred_label = {index: label for label, index in dataset.label_mapping.items()}

  return pred_label[y_pred]


# Check if Graphic card is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
## HyperParameters
TRAIN_BATCH_SIZE = 1
EPOCH = 2
# Fraction of the articles held out for validation.
DATASET_SPLIT = .25
shuffle_dataset = True
# Seed for the train/validation split below.
random_seed = 42
# MIN_LEN=249
MIN_LEN = 10
MAX_LEN = 50
CHUNK_LEN = 200
OVERLAP_LEN = 50
# None means the whole dataset is used.
max_size_dataset =None
NUM_CLASSES = 4
lr = 1e-5
# Defining tokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# Extracting Dataset
# NOTE: this runs at import time; predict_article relies on `dataset.label_mapping`.
dataset = FakeNewsDataset(
                          tokenizer=bert_tokenizer,
                          min_len=MIN_LEN,
                          max_len=MAX_LEN,
                          chunk_len=CHUNK_LEN,
                          max_size_dataset=max_size_dataset,
                          overlap_len=OVERLAP_LEN,
                          mode='train_eval',
                          testdf=None
                          )

dataset_size = len(dataset)
print('Total articles :' + str(dataset_size))

# Because the labels are imbalanced, we split the data set in a stratified fashion, using this as the class labels.
# Dividing into training and validation indices. The split is seeded with
# `random_seed` so that it is reproducible between runs.
train_idx, valid_idx = train_test_split(
                                        np.arange(dataset_size),
                                        test_size=DATASET_SPLIT,
                                        shuffle=True,
                                        random_state=random_seed,
                                        stratify=dataset.label)

# Creating data indices for training and validation splits:
#Map-style datasets
#A map-style dataset is one that implements the __getitem__() and __len__() protocols,
# and represents a map from (possibly non-integral) indices/keys to data samples.

# Creating PT data samplers and loaders:
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

train_data_loader=DataLoader(
                              dataset,
                              batch_size=TRAIN_BATCH_SIZE,
                              sampler=train_sampler,
                              collate_fn=my_collate1)

valid_data_loader=DataLoader(
                              dataset,
                              batch_size=TRAIN_BATCH_SIZE,
                              sampler=valid_sampler,
                              collate_fn=my_collate1)
## Load Trained Model
# The string below is disabled code kept for reference; it is never executed.
'''model = Bert_TextClassification_Model()
model.load_state_dict(torch.load('/content/Gdrive/MyDrive/finetuned_BERT_Model-2020-12-14-2.pt'))
model.to(device)
model.eval()
model_robert = RoBERT_Model(bertFineTuned=list(model.children())[0]).to(device)
model_robert.load_state_dict(torch.load('/content/Gdrive/MyDrive/RoBERT_Model-2020-12-14-2.pt'))
model_robert.to(device)
model_robert.eval()
'''

Overwriting lstm_bert_model.py


## User Interface

In [None]:
# Quietly install the packages used to serve the app (streamlit) and to expose it (pyngrok).
!pip -q install streamlit
!pip -q install pyngrok
# Copy the stylesheet and the folder with the result images from Drive into the
# current working directory (app.py reads the images from /content/fake_news,
# so this presumably runs with /content as the working directory).
!cp '/content/Gdrive/MyDrive/style.css' .
!cp -r '/content/Gdrive/MyDrive/fake_news' .

In [None]:
%%writefile app.py
from google.colab import drive
drive.mount("/content/Gdrive")
import streamlit as st
import torch
from lstm_bert_model import Bert_TextClassification_Model, RoBERT_Model, predict_article
from PIL import Image 
import validators

# Use the GPU when one is available and fall back to the CPU otherwise, the
# same way lstm_bert_model chooses its device; a hard-coded "cuda" fails on a
# CPU-only runtime as soon as the models are moved to the device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
st.set_option('deprecation.showfileUploaderEncoding',False)
st.title("Fake News Detector")

# For loading Stylesheet for the UI
def local_css(file_name):
    """Inline the stylesheet stored in ``file_name`` into the Streamlit page."""
    with open(file_name) as stylesheet:
        css_rules = stylesheet.read()
    st.markdown(f'<style>{css_rules}</style>', unsafe_allow_html=True)

def remote_css(url):
    """Add a <link> tag that loads the stylesheet at ``url`` into the Streamlit page."""
    link_tag = f'<link href="{url}" rel="stylesheet">'
    st.markdown(link_tag, unsafe_allow_html=True)
# For loading the saved model
@st.cache()
def load_model():
  """Load the fine-tuned BERT classifier and the BERT+LSTM model from Drive.

  Both checkpoints are mapped onto the module-level ``device`` while loading,
  so weights saved on a GPU can also be restored on a different device.

  Returns:
      Tuple ``(model, model_robert)``, both on ``device`` and in eval mode.
  """
  model = Bert_TextClassification_Model()
  model.load_state_dict(torch.load(
      '/content/Gdrive/MyDrive/finetuned_BERT_Model-2021-03-03-1.pt',
      map_location=device))
  model.to(device)
  model.eval()
  # The first child module of the classifier is its BERT encoder; it is reused
  # as the base of the BERT+LSTM model.
  model_robert = RoBERT_Model(bertFineTuned=list(model.children())[0]).to(device)
  model_robert.load_state_dict(torch.load(
      '/content/Gdrive/MyDrive/RoBERT_Model-2021-03-04-2.pt',
      map_location=device))
  model_robert.to(device)
  model_robert.eval()
  return model, model_robert

with st.spinner('Loading Model Into Memory......'):
  model, model_robert = load_model()
local_css("style.css")
remote_css('https://fonts.googleapis.com/icon?family=Material+Icons')

#labels = {'High probability fake ': 0, 'High probability real': 1, 'fake': 2, 'real': 3}

High_prob_fake_img = '/content/fake_news/High_prob_fake.jpg'
High_prob_real_img = '/content/fake_news/High_prob_real.jpg'
fake_img = '/content/fake_news/fake.jpg'
real_img = '/content/fake_news/real.jpg'

# Image displayed for every label that predict_article can return.
# Note that the 'High probability fake ' label ends with a space.
label_images = {
    'fake': fake_img,
    'real': real_img,
    'High probability fake ': High_prob_fake_img,
    'High probability real': High_prob_real_img,
}

st.text('Please enter the URL')
user_input_url = st.text_input("")
button_clicked = st.button("OK")
if button_clicked:
  if not validators.url(user_input_url):
    # The input is not a valid URL.
    st.write('Please enter valid URL')
  else:
    # Extract the article text and predict its label.
    result = predict_article(user_input_url,model=model,model_roberta=model_robert)
    image_path = label_images.get(result)
    if image_path is not None:
      st.image(image_path,width=None)
    else:
      # The URL is valid but no label was produced for it.
      st.write('Could not find any relevant information related to Covid-19. Please try with a different URL')

Overwriting app.py


In [None]:
# Open an ngrok tunnel to local port 80 and print the tunnel (its public URL),
# then start the Streamlit app on the same port with standard output discarded.
from pyngrok import ngrok
public_url = ngrok.connect(port='80')
print (public_url)
!streamlit run --server.port 80 app.py >/dev/null

NgrokTunnel: "http://b2f12056bf86.ngrok.io" -> "http://localhost:80"
2021-03-31 11:51:29.568 An update to the [server] config option section was detected. To have these changes be reflected, please restart streamlit.
2021-03-31 11:51:30.733935: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-03-31 11:51:35.769 NumExpr defaulting to 2 threads.
