In [17]:
import os 
os.environ['KERAS_BACKEND'] = 'torch'
import sys 
sys.path.append('../')

import pandas as pd
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification

import keras
from keras.layers import TorchModuleWrapper

In [3]:
# Read both source CSVs in one pass: Fake.csv first, then True.csv.
fake_df, real_df = map(pd.read_csv, ('./data/Fake.csv', './data/True.csv'))

# Sanity check on the fake-news frame: leading rows, then rows per subject.
fake_preview = fake_df.head()
print(fake_preview)
subject_counts = fake_df['subject'].value_counts()
print(subject_counts)

                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  
0  December 31, 2017  
1  December 31, 2017  
2  December 30, 2017  
3  December 29, 2017  
4  December 25, 2017  
subject
News               9050
politics           6841
left-news          4459
Government News    1570
US_News             783
Middle-east 

In [4]:
# Label convention used throughout: 0 = fake article, 1 = real article.
fake_df['label'] = 0
real_df['label'] = 1

# ignore_index=True gives the combined frame a fresh, unique index; without it
# the index labels of fake_df and real_df are both kept and can collide.
df = pd.concat([fake_df, real_df], axis=0, ignore_index=True)
# The model input is the headline followed by the article body.
df['text'] = df['title'] + ' ' + df['text']

# A fixed seed makes the split identical on every Restart & Run All, and
# stratifying on the label keeps the fake/real ratio the same in each split.
SPLIT_SEED = 42
# First cut: 80% train+val / 20% test.
train, test = train_test_split(
    df, test_size=0.2, random_state=SPLIT_SEED, stratify=df['label']
)
# Second cut: 20% of the remainder becomes validation (64/16/20 overall).
train, val = train_test_split(
    train, test_size=0.2, random_state=SPLIT_SEED, stratify=train['label']
)

In [5]:
def tokenize_data(texts, labels, tokenizer, max_length=128):
    """Tokenize a collection of texts and attach their labels.

    Args:
        texts: Object exposing ``.tolist()`` that yields the raw strings
            (the notebook passes a pandas Series).
        labels: Object exposing ``.tolist()`` that yields one label per text.
        tokenizer: Callable invoked with the list of texts plus the
            ``truncation``, ``padding``, ``max_length`` and
            ``return_tensors`` keyword options.
        max_length: Length limit forwarded to the tokenizer.

    Returns:
        The object returned by ``tokenizer``, with an additional
        ``'labels'`` entry holding the labels as a ``torch`` tensor.
    """
    tokenizer_options = dict(
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt",
    )
    text_list = texts.tolist()
    batch = tokenizer(text_list, **tokenizer_options)

    # Store the targets next to the token tensors so one object carries both.
    label_list = labels.tolist()
    batch['labels'] = torch.tensor(label_list)
    return batch

In [6]:
class tokenizedDataset(Dataset):
    """Map-style dataset over a mapping of index-aligned, pre-tokenized fields."""

    def __init__(self, encodings):
        # Mapping of field name -> indexable container; must include 'input_ids',
        # which __len__ uses to determine the number of samples.
        self.encodings = encodings

    def __len__(self):
        input_ids = self.encodings['input_ids']
        return len(input_ids)

    def __getitem__(self, idx):
        # Slice every field at the same position to assemble one sample.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        return sample


In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize each split in train -> val -> test order with the same tokenizer.
train_encodings, val_encodings, test_encodings = (
    tokenize_data(split['text'], split['label'], tokenizer)
    for split in (train, val, test)
)

# Wrap every set of encodings in a Dataset so a DataLoader can index it.
train_dataset, val_dataset, test_dataset = map(
    tokenizedDataset,
    (train_encodings, val_encodings, test_encodings),
)

In [42]:
# Pretrained BERT encoder with a two-way classification head (labels 0 and 1).
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=2
)
# Use the GPU when one is available and fall back to CPU otherwise, so the
# notebook still runs on machines without CUDA instead of raising here.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [55]:
class Classifer(keras.Model):
    """Keras model that wraps a torch module and forwards batches to it.

    Args:
        model: The torch module to wrap; it is stored inside a
            ``TorchModuleWrapper`` as ``self.model``.
    """

    def __init__(self, model):
        super().__init__()
        self.model = TorchModuleWrapper(model)

    def call(self, inputs):
        """Move a batch onto the wrapped module's device and run it.

        Args:
            inputs: Mapping of keyword name -> tensor. Every entry is
                forwarded to the wrapped module as a keyword argument
                (so a ``'labels'`` entry, if present, is forwarded too).

        Returns:
            Whatever the wrapped module returns for those keyword arguments.
        """
        # Take the device from the wrapped module's own parameters rather than
        # from a notebook-level global, so the class works for whichever module
        # it was constructed with and regardless of cell execution order.
        device = next(self.model.module.parameters()).device
        # Build a new dict instead of overwriting entries of the caller's
        # batch, which would silently move the caller's tensors as well.
        on_device = {key: value.to(device) for key, value in inputs.items()}
        return self.model(**on_device)

In [48]:
classifer = Classifer(model)
classifer.summary()
classifer.compile(
    # NOTE(review): Adam() runs with the Keras default learning rate; fine-tuning
    # a pretrained BERT usually calls for a much smaller value — confirm before
    # starting a real training run.
    optimizer=keras.optimizers.Adam(),
    # The classification head emits raw, unnormalized scores, so the loss must
    # apply the softmax itself; the default from_logits=False would treat those
    # scores as probabilities.
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.SparseCategoricalAccuracy()]
)

In [62]:
# One batch size shared by both loaders so they cannot drift apart.
BATCH_SIZE = 32

# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Validation only measures the model, so keep a fixed order: results are then
# comparable batch-by-batch across runs and shuffling adds nothing.
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [59]:
# Smoke test: push a single training batch through the wrapped model and print
# the result to confirm the data pipeline and the forward pass work together.
for batch in train_loader:
    out = classifer(batch)
    print(out)
    # Only the first batch is needed; stop before iterating the whole loader.
    break

SequenceClassifierOutput(loss=tensor(0.7203, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0909, -0.2192],
        [-0.1132, -0.0588],
        [ 0.1423, -0.3546],
        [-0.1873,  0.0442],
        [-0.1585, -0.3127],
        [-0.2364, -0.0462],
        [-0.0224, -0.2365],
        [-0.0529, -0.0844],
        [-0.0209, -0.3082],
        [ 0.1284, -0.0803],
        [-0.0448,  0.1449],
        [-0.0776, -0.0340],
        [ 0.1214, -0.0387],
        [ 0.1586,  0.0493],
        [-0.1997, -0.1001],
        [ 0.0618,  0.0485],
        [ 0.2774, -0.1057],
        [ 0.0538,  0.0725],
        [-0.1297, -0.2052],
        [-0.1642, -0.2206],
        [ 0.1444, -0.2110],
        [ 0.0444,  0.0609],
        [ 0.1077, -0.2176],
        [-0.2211, -0.0356],
        [-0.0419,  0.0767],
        [-0.1746,  0.0142],
        [ 0.1728, -0.2108],
        [ 0.0185, -0.1811],
        [-0.0279, -0.0167],
        [ 0.0717, -0.0328],
        [-0.0215, -0.0668],
        [ 0.1143, -0.1177]], devic