Code Source:
https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f 

In [3]:
import pandas as pd
import warnings
pd.set_option("display.max_columns", None)
warnings.filterwarnings("ignore")

data = pd.read_csv('train_twt.csv')
data = data.drop(['Unnamed: 0', "screen_name", "class_type"], axis =1)
dic = {
    'bot': 1,
    'human':0
}
data['account.type'] = data['account.type'].map(dic)
data

Unnamed: 0,text,account.type
0,YEA now that note GOOD,1
1,Listen to This Charming Man by The Smiths htt...,0
2,wish i can i would be seeing other hoes on the...,1
3,The decade in the significantly easier schedul...,1
4,"""Theim class=\""alignnone size-full wp-image-60...",1
...,...,...
20707,Met on the Abversion of our science for the co...,1
20708,Land for their during the opportunity to the p...,1
20709,@TayandYou doesn't have a clue. You're right. ...,1
20710,Me And My Bestie https://t.co/vPq2iDkWZm,0


In [4]:
import re
for word in range(len(data)):
    cap = data["text"][word]
    word_list = cap.split(' ')
    regex = '(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-&?=%.]+'
    url_list = re.findall(regex, cap)
    for i in word_list.copy():
        if i in url_list:
            word_list.remove(i) 
    new_caption = " ".join(word_list)
    data["text"][word] = new_caption
data

Unnamed: 0,text,account.type
0,YEA now that note GOOD,1
1,Listen to This Charming Man by The Smiths,0
2,wish i can i would be seeing other hoes on the...,1
3,The decade in the significantly easier schedul...,1
4,"""Theim class=\""alignnone size-full wp-image-60...",1
...,...,...
20707,Met on the Abversion of our science for the co...,1
20708,Land for their during the opportunity to the p...,1
20709,@TayandYou doesn't have a clue. You're right. ...,1
20710,Me And My Bestie,0


In [5]:
#take out mentions
for word in range(len(data)):
    cap = data["text"][word]
    word_list = cap.split(' ')
    for i in word_list.copy():
        if i.startswith('@') or i.startswith('#'):
            word_list.remove(i) 
    new_caption = " ".join(word_list)
    data["text"][word] = new_caption
data

Unnamed: 0,text,account.type
0,YEA now that note GOOD,1
1,Listen to This Charming Man by The Smiths,0
2,wish i can i would be seeing other hoes on the...,1
3,The decade in the significantly easier schedul...,1
4,"""Theim class=\""alignnone size-full wp-image-60...",1
...,...,...
20707,Met on the Abversion of our science for the co...,1
20708,Land for their during the opportunity to the p...,1
20709,doesn't have a clue. You're right. We love our...,1
20710,Me And My Bestie,0


In [6]:
for word in range(len(data)):
    pattern = '[0-9]'
    tweet_clean = data['text'][word].split(' ')
    tweet_clean = [re.sub(pattern, '', i) for i in tweet_clean]
    new_tweet = " ".join(tweet_clean)
    data['text'][word] = new_tweet
data

Unnamed: 0,text,account.type
0,YEA now that note GOOD,1
1,Listen to This Charming Man by The Smiths,0
2,wish i can i would be seeing other hoes on the...,1
3,The decade in the significantly easier schedul...,1
4,"""Theim class=\""alignnone size-full wp-image- \...",1
...,...,...
20707,Met on the Abversion of our science for the co...,1
20708,Land for their during the opportunity to the p...,1
20709,doesn't have a clue. You're right. We love our...,1
20710,Me And My Bestie,0


In [7]:
for word in range(len(data)):
    cap = data["text"][word]
    new_string = re.sub(r'[^\w\s]', '', cap)
    data["text"][word] = new_string
data

Unnamed: 0,text,account.type
0,YEA now that note GOOD,1
1,Listen to This Charming Man by The Smiths,0
2,wish i can i would be seeing other hoes on the...,1
3,The decade in the significantly easier schedul...,1
4,Theim classalignnone sizefull wpimage srchttp...,1
...,...,...
20707,Met on the Abversion of our science for the co...,1
20708,Land for their during the opportunity to the p...,1
20709,doesnt have a clue Youre right We love our cou...,1
20710,Me And My Bestie,0


In [8]:
data['text'] = data['text'].map(lambda x: x.lower())
data.to_csv("second_dataset_cleantwts.csv")
data

Unnamed: 0,text,account.type
0,yea now that note good,1
1,listen to this charming man by the smiths,0
2,wish i can i would be seeing other hoes on the...,1
3,the decade in the significantly easier schedul...,1
4,theim classalignnone sizefull wpimage srchttp...,1
...,...,...
20707,met on the abversion of our science for the co...,1
20708,land for their during the opportunity to the p...,1
20709,doesnt have a clue youre right we love our cou...,1
20710,me and my bestie,0


In [9]:
import numpy as np
np.random.seed(112)
df_train, df_val, df_test = np.split(data.sample(frac=1, random_state=42), 
                                     [int(.8*len(data)), int(.9*len(data))])

print(len(df_train),len(df_val), len(df_test))
type(df_train['text'][0])

16569 2071 2072


str

In [10]:
from transformers import BertTokenizer
import torch


tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = df['account.type'].values.tolist()
        self.texts = [tokenizer(str(text), 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['text'].values.tolist()]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

2022-11-14 18:27:09.358277: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 2)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

In [12]:
from torch.optim import Adam
from tqdm import tqdm
import torch
import torchvision 

def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=1, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=1)

    use_cuda = torch.cuda.is_available()
    print('cuda', use_cuda)
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:
            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0
            total_tp_train, total_tn_train, total_fp_train, total_fn_train = 0, 0, 0, 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output_train = model(input_id, mask)
                
                batch_loss = criterion(output_train, train_label.long())
                total_loss_train += batch_loss.item()
                
                acc = (output_train.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                tp = (output_train.argmax(dim=1) == 1 and train_label == 1).sum().item()
                tn = (output_train.argmax(dim=1) == 0 and train_label == 0).sum().item()
                fp = (output_train.argmax(dim=1) == 1 and train_label == 0).sum().item()
                fn = (output_train.argmax(dim=1) == 0 and train_label == 1).sum().item()

                total_tp_train += tp
                total_tn_train += tn
                total_fp_train += fp
                total_fn_train += fn

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0
            total_tp_val, total_tn_val, total_fp_val, total_fn_val = 0, 0, 0, 0


            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output_val = model(input_id, mask)

                    batch_loss = criterion(output_val, val_label.long())
                    total_loss_val += batch_loss.item()
                    
                    acc = (output_val.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc

                    tp = (output_val.argmax(dim=1) == 1 and val_label == 1).sum().item()
                    tn = (output_val.argmax(dim=1) == 0 and val_label == 0).sum().item()
                    fp = (output_val.argmax(dim=1) == 1 and val_label == 0).sum().item()
                    fn = (output_val.argmax(dim=1) == 0 and val_label == 1).sum().item()

                    total_tp_val += tp
                    total_tn_val += tn
                    total_fp_val += fp
                    total_fn_val += fn
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}'
                )
            print(f'Train Accuracy: {(total_tp_train + total_tn_train) / (total_tp_train + total_tn_train + total_fp_train + total_fn_train): .3f}')
            print(f'Train Precision: {total_tp_train / (total_tp_train + total_fp_train): .3f}')
            print(f'Train Recall: {total_tp_train / (total_tp_train + total_fn_train): .3f}')

            print(f'Val Accuracy: {(total_tp_val + total_tn_val) / (total_tp_val + total_tn_val + total_fp_val + total_fn_val): .3f}')
            print(f'Val Precision: {total_tp_val / (total_tp_val + total_fp_val): .3f}')
            print(f'Val Recall: {total_tp_val / (total_tp_val + total_fn_val): .3f}')
                  
EPOCHS = 1
model = BertClassifier()
LR = 1e-6
print(df_train)
              
train(model, df_train, df_val, LR, EPOCHS)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


                                                    text  account.type
19517                    closing the store in an hour uf             0
19536  the section on the static keyword in the think...             0
18938  hey lets be clear but when you talk about the ...             1
12315                how about i cook you some burgersuf             0
6267   pervert alert birthday is april st how old is ...             1
...                                                  ...           ...
307    in the coming the programme of the spirit of t...             1
19274          fake woke people need to be executed asap             0
17690                                        photos shop             0
12580  me fuck im voting for him was id be making law...             1
19602  i gotta start praying to her dad is why i gott...             1

[16569 rows x 2 columns]
cuda False


100%|██████████| 16569/16569 [35:28:27<00:00,  7.71s/it]      


Epochs: 1 | Train Loss:  0.558                 | Train Accuracy:  0.679                 | Val Loss:  0.466                 | Val Accuracy:  0.755
Train Accuracy:  0.679
Train Precision:  0.726
Train Recall:  0.573
Val Accuracy:  0.755
Val Precision:  0.793
Val Recall:  0.686


In [13]:
def evaluate(model, test_data):

    test = Dataset(test_data)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=1, shuffle = False)
    
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_tp, total_tn, total_fp, total_fn = 0, 0, 0, 0

    with torch.no_grad():

        for test_input, test_label in test_dataloader:
              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)

              tp = (output.argmax(dim=1) == 1 and test_label == 1).sum().item()
              tn = (output.argmax(dim=1) == 0 and test_label == 0).sum().item()
              fp = (output.argmax(dim=1) == 1 and test_label == 0).sum().item()
              fn = (output.argmax(dim=1) == 0 and test_label == 1).sum().item()

              total_tp += tp
              total_tn += tn
              total_fp += fp
              total_fn += fn
    
    print(f'Test Accuracy: {(total_tp + total_tn) / (total_tp + total_tn + total_fp + total_fn): .3f}')
    print(f'Test Precision: {total_tp / (total_tp + total_fp): .3f}')
    print(f'Test Recall: {total_tp / (total_tp + total_fn): .3f}')

    
evaluate(model, df_test)

Test Accuracy:  0.748
Test Precision:  0.791
Test Recall:  0.683
