In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import tensorflow as tf
import seaborn as sns

In [2]:
from transformers import BertTokenizer, BertForSequenceClassification
from torchtext.data import LabelField ,Field, TabularDataset, BucketIterator, Iterator
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [3]:
df = pd.read_csv('../../data/cleandata/cleaned.csv')

df.head()

Unnamed: 0,text,category
0,donald trump sends embarrassing new year eve m...,1
1,drunk bragging trump staffer started russian c...,1
2,sheriff david clarke internet joke threatening...,1
3,trump obsessed obama coded website image chris...,1
4,pope francis called donald trump christmas spe...,1


In [4]:
df = df.sample(frac = 1)#.reset_index(drop=True)
df.head()

Unnamed: 0,text,category
3503,prominent holocaust attorney file massive laws...,1
43936,powerful hurricane fuel demand island nation c...,0
29589,trump moving u era bilateral trade white house...,0
27989,democrat amass support force showdown trump su...,0
30224,trump pack trade team veteran steel war chinaw...,0


In [5]:
import os

In [6]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(
    df,
    test_size=0.10,
    random_state=42,    
)

train, val = train_test_split(
    train,
    test_size=0.10,
    random_state=42,    
)

train.to_csv("train.csv", index=False)
test.to_csv("test.csv", index=False)
val.to_csv("val.csv", index=False)

In [19]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print('here')
# Model parameter
MAX_SEQ_LEN = 128
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

# Fields
print('here2')
category_field = LabelField(dtype=torch.float)
text_field = Field(use_vocab=False, tokenize=tokenizer.encode, lower=False, include_lengths=False, batch_first=True,
                   fix_length=MAX_SEQ_LEN, pad_token=PAD_INDEX, unk_token=UNK_INDEX)
fields = [('text', text_field), ('category', category_field)]

# TabularDataset
print('here3')
train, valid, test = TabularDataset.splits(
    path='/Users/admin/Desktop/moreML/FakeNewsDetection/testing_algorithms/Isita',
    train='train.csv', validation='val.csv', test='test.csv', format='CSV', fields=fields, skip_header=True)

Token indices sequence length is longer than the specified maximum sequence length for this model (516 > 512). Running this sequence through the model will result in indexing errors


here
here2
here3


In [13]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [20]:
# Iterators

train_iter = BucketIterator(dataset = train, batch_size=16, sort_key=lambda x: len(x.text),
                            device=device, train=True, sort=True, sort_within_batch=True)
valid_iter = BucketIterator(valid, batch_size=16, sort_key=lambda x: len(x.text),
                            device=device, train=True, sort=True, sort_within_batch=True)
test_iter = Iterator(test, batch_size=16, device=device, train=False, shuffle=False, sort=False)



In [28]:
class BERT(nn.Module):

    def __init__(self):
        super(BERT, self).__init__()

        options_name = "bert-base-uncased"
        self.encoder = BertForSequenceClassification.from_pretrained(options_name)

    def forward(self, text, label):
        loss, text_fea = self.encoder(text, labels=label)[:2]

        return loss, text_fea
    def save_checkpoint(save_path, model, valid_loss):

        if save_path == None:
            return

        state_dict = {'model_state_dict': model.state_dict(),
                      'valid_loss': valid_loss}

        torch.save(state_dict, save_path)
        print(f'Model saved to ==> {save_path}')

    def load_checkpoint(load_path, model):

        if load_path==None:
            return

        state_dict = torch.load(load_path, map_location=device)
        print(f'Model loaded from <== {load_path}')

        model.load_state_dict(state_dict['model_state_dict'])
        return state_dict['valid_loss']


    def save_metrics(save_path, train_loss_list, valid_loss_list, global_steps_list):
        if(True):
            print('i die')
        if save_path == None:
            return

        state_dict = {'train_loss_list': train_loss_list,
                      'valid_loss_list': valid_loss_list,
                      'global_steps_list': global_steps_list}

        torch.save(state_dict, save_path)
        print(f'Model saved to ==> {save_path}')


    def load_metrics(load_path):

        if load_path==None:
            return

        state_dict = torch.load(load_path, map_location=device)
        print(f'Model loaded from <== {load_path}')

        return state_dict['train_loss_list'], state_dict['valid_loss_list'], state_dict['global_steps_list']
    
    def train(model,
          optimizer,
          criterion = nn.BCELoss(),
          train_loader = train_iter,
          valid_loader = valid_iter,
          num_epochs = 5,
          eval_every = len(train_iter) // 2,
          best_valid_loss = float("Inf")):
    
        # initialize running values
        running_loss = 0.0
        valid_running_loss = 0.0
        global_step = 0
        train_loss_list = []
        valid_loss_list = []
        global_steps_list = []

        # training loop
        model.train()
        for epoch in range(num_epochs):
            for (labels, text), _ in train_loader:
                labels = labels.type(torch.LongTensor)           
                labels = labels.to(device)
                text = text.type(torch.LongTensor)  
                text = text.to(device)
                output = model(text, labels)
                loss, _ = output

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # update running values
                running_loss += loss.item()
                global_step += 1

                # evaluation step
                if global_step % eval_every == 0:
                    model.eval()
                    with torch.no_grad():                    

                        # validation loop
                        for (labels, text), _ in valid_loader:
                            labels = labels.type(torch.LongTensor)           
                            labels = labels.to(device)
                            text = text.type(torch.LongTensor)  
                            text = text.to(device)
                            output = model(text, labels)
                            loss, _ = output

                            valid_running_loss += loss.item()

                    # evaluation
                    average_train_loss = running_loss / eval_every
                    average_valid_loss = valid_running_loss / len(valid_loader)
                    train_loss_list.append(average_train_loss)
                    valid_loss_list.append(average_valid_loss)
                    global_steps_list.append(global_step)

                    # resetting running values
                    running_loss = 0.0                
                    valid_running_loss = 0.0
                    model.train()

                    # print progress
                    print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                          .format(epoch+1, num_epochs, global_step, num_epochs*len(train_loader),
                                  average_train_loss, average_valid_loss))

                    # checkpoint
#                     if best_valid_loss > average_valid_loss:
#                         best_valid_loss = average_valid_loss
#                         save_checkpoint(file_path + '/' + 'model.pt', model, best_valid_loss)
#                         save_metrics(file_path + '/' + 'metrics.pt', train_loss_list, valid_loss_list, global_steps_list)

#         save_metrics(file_path + '/' + 'metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
        print('Finished Training!')

model = BERT().to(device)
optimizer = optim.Adam(model.parameters(), lr=2e-5)

train(model=model, optimizer=optimizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

TypeError: 'TabularDataset' object is not callable