In [71]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import tqdm
import torch
from sklearn.model_selection import train_test_split
import torch.nn as nn
from unidecode import unidecode
import random
import torch.nn.functional as F
import contractions
from torch.nn.utils.rnn import pack_padded_sequence,pad_packed_sequence
from torchtext.data import Field,TabularDataset,LabelField,BucketIterator
from torchtext import data
import re
import spacy
from spacy.tokenizer import Tokenizer

In [6]:
df=pd.read_csv("real_nd_fake_news/news.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [9]:
df.drop(["Unnamed: 0"],axis=1,inplace=True)

In [11]:
df["label"]=(df["label"] == "FAKE").astype("int")

In [12]:
df.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",1
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,1
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,0
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,0


In [13]:
df["titletext"]=df["title"]+". "+df["text"]

In [26]:
df["titletext"]=df["titletext"].apply(lambda x:x.lower())


Unnamed: 0,title,text,label,titletext
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",1,you can smell hillary’s fear. daniel greenfiel...
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,1,watch the exact moment paul ryan committed pol...
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,0,kerry to go to paris in gesture of sympathy. u...
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1,bernie supporters on twitter erupt in anger ag...
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,0,the battle of new york: why this primary matte...


In [28]:
def expand_contractions(text):
    expanded_words=[]
    for word in text.split():
        expanded_words.append(contractions.fix(word))

    expanded_text=" ".join(expanded_words)
    return expanded_text

In [29]:
df["titletext"]=df["titletext"].apply(lambda x:expand_contractions(x))

In [36]:
df["titletext"]=df["titletext"].apply(lambda x: re.sub("[^A-Z a-z 0-9]+",'',x))

In [37]:
df["titletext"]=df["titletext"].apply(lambda x:([t for t in x.split() if not t.isdigit()]))

In [40]:
df["titletext"]=df["titletext"].apply(lambda x:" ".join(x))

In [41]:
df["titletext"]=df["titletext"].apply(lambda x: re.sub(r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9+._-]+\.[a-zA-Z0-9+._-]+)',"",x))   

In [42]:
df["titletext"]=df["titletext"].apply(lambda x: re.sub(r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9+._-]+\.[a-zA-Z0-9+._-]+)',"",x))   

In [43]:
df["titletext"]=df["titletext"].apply(lambda x:re.sub(r"(http|https|ftp)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])","",str(x)))

In [45]:
df["titletext"]=df["titletext"].apply(lambda x:unidecode(x))

In [47]:
df.drop(["title","text"],axis=1,inplace=True)

In [61]:
df.to_csv("real_nd_fake_news/real_train.csv",index=False)

In [62]:
df.head()

Unnamed: 0,label,titletext
0,1,you can smell hillarys fear daniel greenfield ...
1,1,watch the exact moment paul ryan committed pol...
2,0,kerry to go to paris in gesture of sympathy yo...
3,1,bernie supporters on twitter erupt in anger ag...
4,0,the battle of new york why this primary matter...


In [63]:
nlp=spacy.load("en_core_web_sm")
tokenizer=Tokenizer(nlp.vocab)

In [64]:
def spacy_tokenizer(x):
    return [tok.text for tok in tokenizer(x)]

In [65]:
TEXT=Field(tokenize=spacy_tokenizer,include_lengths=True)
LABEL=LabelField(dtype=torch.float)

In [66]:
fields=[("label",LABEL),("text",TEXT)]

In [69]:
training_data=TabularDataset(path="real_nd_fake_news/real_train.csv",format="csv",fields=fields,skip_header=False)

print(vars(training_data.examples[5]))

{'label': '0', 'text': ['the', 'battle', 'of', 'new', 'york', 'why', 'this', 'primary', 'matters', 'it', 'is', 'primary', 'day', 'in', 'new', 'york', 'and', 'frontrunners', 'hillary', 'clinton', 'and', 'donald', 'trump', 'are', 'leading', 'in', 'the', 'polls', 'trump', 'is', 'now', 'vowing', 'to', 'win', 'enough', 'delegates', 'to', 'clinch', 'the', 'republican', 'nomination', 'and', 'prevent', 'a', 'contested', 'convention', 'but', 'sensted', 'cruz', 'rtexas', 'bernie', 'sanders', 'dvt', 'and', 'ohio', 'gov', 'john', 'kasich', 'and', 'are', 'not', 'giving', 'up', 'just', 'yet', 'a', 'big', 'win', 'in', 'new', 'york', 'could', 'tip', 'the', 'scales', 'for', 'both', 'the', 'republican', 'and', 'democratic', 'frontrunners', 'in', 'this', 'years', 'race', 'for', 'the', 'white', 'house', 'clinton', 'and', 'trump', 'have', 'each', 'suffered', 'losses', 'in', 'recent', 'contests', 'shifting', 'the', 'momentum', 'to', 'their', 'rivals', 'we', 'have', 'won', 'eight', 'out', 'of', 'the', 'last'

In [72]:
train_data,valid_data=training_data.split(split_ratio=0.7,random_state=random.seed(2020))

In [73]:
len(train_data),len(valid_data)

(4435, 1901)

In [74]:
TEXT.build_vocab(train_data,min_freq=3)

In [75]:
LABEL.build_vocab(train_data)

In [77]:
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [78]:
print("Size of text vocabulary:",len(TEXT.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

Size of text vocabulary: 32059
Size of LABEL vocabulary: 3


In [146]:
train_iteration,valid_iteration=BucketIterator.splits((train_data,valid_data),
                                    batch_size=32,
                                    sort_key=lambda x:len(x.text),
                                    sort=True,
                                    sort_within_batch=True,
                                    device=device)

In [147]:
len(train_iteration),len(valid_iteration)

(139, 60)

In [125]:
for batch in train_iteration:
    text,text_len=batch.text
#     label,label_len=batch.label
    print(len(text_len),len(batch.label))
    print(text_len)
    print(batch.label)
    break

128 128
tensor([23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22,
        22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21,
        21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19,
        19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 17,
        17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15,
        15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12,
        11, 10, 10, 10, 10, 10,  9,  9,  8,  8,  8,  8,  7,  6,  6,  4,  4,  4,
         2,  1])
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1.,
        0., 0., 0., 0., 1., 0., 1., 1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1.,
        1., 1., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1.,

In [119]:
# df["titletext"][0]

In [126]:
train,test=train_test_split(df,test_size=0.2,random_state=0)

In [127]:
len(train),len(test)

(5068, 1267)

In [130]:
train_iter,test_iter=BucketIterator.splits((train,test),
                                          device=device,
                                          sort_within_batch=True,
                                          sort_key=True,
                                          batch_size=32)

In [131]:
len(train_iter),len(test_iter)

(159, 40)

In [156]:
class Classifier(nn.Module):
    def __init__(self,vocab_size,embedding_dim,hidden_dim,output_dim,n_layers,bidirectional,dropout):
        super(Classifier,self).__init__()
        self.embedding=nn.Embedding(vocab_size,embedding_dim)
        self.lstm=nn.LSTM(embedding_dim,hidden_dim,num_layers=n_layers,bidirectional=bidirectional,batch_first=True)
        
        self.fc=nn.Linear(2*hidden_dim,output_dim)
        self.act=nn.Sigmoid()    
            
    def forward(self,text,text_len):
        embedding=self.embedding(text)
        
        packed_embedd=pack_padded_sequence(embedding,text_len)
        packed_output,(hidden,cell)=self.lstm(packed_embedd)
        
        
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
        
        #concat the final forward and backward hidden state
        hidden=torch.cat((hidden[-2,:,:],hidden[-1,:,:]),dim=1)
        
        dense_output=self.fc(hidden)
        
        #final activation functions
        outputs=self.act(dense_output)
        
        return outputs
        

In [157]:
size_of_vocab=len(TEXT.vocab)
embedding_dim=100
num_hidden_nodes=32
num_output_nodes=1
num_layers=2
bidirection=True
dropout=0.2


model=Classifier(size_of_vocab,embedding_dim,num_hidden_nodes,num_output_nodes,num_layers,bidirection,dropout)

In [158]:
model

Classifier(
  (embedding): Embedding(32059, 100)
  (lstm): LSTM(100, 32, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (act): Sigmoid()
)

In [135]:
optimizer=torch.optim.Adam(model.parameters(),lr=0.01)
criterion=nn.BCELoss()

In [136]:
def binary_accuracy(pred,y):
    rounded_pred=torch.round(pred)
    correct=(rounded_pred==y).float()
    acc=correct.sum() / len(correct)
    return acc

In [159]:
def train(model, iterator, optimizer, criterion):
    
    #initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    #set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        #resets the gradients after every batch
        optimizer.zero_grad()   
        
        #retrieve text and no. of words
        text, text_lengths = batch.text   
        
        #convert to 1D tensor
        predictions = model(text, text_lengths).squeeze()
        
        #compute the loss
        loss = criterion(predictions, batch.label)        
        
        #compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)   
        
        #backpropage the loss and compute the gradients
        loss.backward()       
        
        #update the weights
        optimizer.step()      
        
        #loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [160]:
def evaluate(model,iterator,criterion):
    
    epoch_loss=0
    
    epoch_acc=0
    
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            #text or no. of words
            text,text_len=batch.text
            
            #convert to 1d array
            predictions=model(text,text_len).squeeze()
            
            #compute loss and accuracy
            
            loss=criterion(predictions,batch.label)
            acc=binary_accuracy(predictions,batch.label)
            
            #keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
            

In [163]:

N_EPOCHS = 5
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    #train the model
    train_loss, train_acc = train(model, train_iteration, optimizer, criterion)
    
    #evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iteration, criterion)
    
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'real_nd_fake_news/saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

KeyboardInterrupt: 