In [1]:
!pip install transformers torch seaborn

In [2]:
import transformers
from transformers import BertModel, BertTokenizer, get_linear_schedule_with_warmup
import torch

import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam

In [3]:
from tqdm.notebook import tqdm

In [4]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries
# used below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [5]:

%matplotlib inline
%config InlineBackend.figure_format='retina'

# Plot styling shared by every figure in the notebook.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams['figure.figsize'] = 12, 8

# Seed every RNG the run depends on. Seeding NumPy alone does not cover
# PyTorch, which draws the dropout masks and initialises new layers.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the first GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

In [6]:
# DATA_PATH = "../input/sarcbalanced/train-balanced-sarcasm.csv" 
# df = pd.read_csv(DATA_PATH,encoding="ISO-8859-1")
# df.columns = ["target", "text", "author","subreddit","score","ups","downs","date","created_utc","parent_comment"]

In [7]:
# Read the cleaned, balanced sarcasm CSV, discard the 'Unnamed: 0' column
# (presumably a saved index — confirm), and name the remaining columns.
DATA_PATH = "../input/sarcbalancedcleaned/train-balanced-sarcasm-cleaned.csv"
df = pd.read_csv(DATA_PATH, encoding="ISO-8859-1").drop(columns=['Unnamed: 0'])
df.columns = [
    "target",
    "comment",
    "author",
    "subreddit",
    "score",
    "ups",
    "downs",
    "date",
    "created_utc",
    "parent_comment",
    "text",
    "parent_cleaned_comment",
]

In [8]:
# Display names for the classes.
# NOTE(review): presumably position i names target value i — confirm against the data.
class_names = ['not_sarc', 'sarc']

In [9]:
# Checkpoint name used for both the tokenizer and the BERT encoder.
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'

In [10]:
# Load the tokenizer that belongs to that checkpoint.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [11]:
# Short example used in the next cells to inspect the tokenizer's output.
sample_txt = 'Are you serious??'

In [12]:
# Split the sample into tokens, then map each token to its vocabulary id.
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

In [13]:
print(f' Sentence: {sample_txt}')
print(f'   Tokens: {tokens}')
print(f'Token IDs: {token_ids}')

In [14]:
# The next four cells show the tokenizer's special tokens with their ids:
# separator, classification, padding and unknown.
tokenizer.sep_token, tokenizer.sep_token_id

In [15]:
tokenizer.cls_token, tokenizer.cls_token_id

In [16]:
tokenizer.pad_token, tokenizer.pad_token_id

In [17]:
tokenizer.unk_token, tokenizer.unk_token_id

In [18]:
encoding = tokenizer.encode_plus(
    sample_txt,
    max_length=32,
    add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
    return_token_type_ids=False,
    pad_to_max_length=True,
    return_attention_mask=True,
    return_tensors="pt",  # Return PyTorch tensors
)

encoding.keys()


In [19]:
print(len(encoding['input_ids'][0]))
encoding['input_ids'][0]

In [20]:
print(len(encoding['attention_mask'][0]))
encoding['attention_mask']

In [21]:
tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

In [22]:
df['target']
df["target"] = pd.to_numeric(df["target"])
df.head()

In [23]:
MAX_LEN = 60

In [24]:
class SarcDataset(Dataset):
    """Torch dataset that tokenizes one comment per item.

    Args:
        df: DataFrame with a ``text`` column (the comment) and a ``target``
            column (integer class label).
        tokenizer: Object exposing ``encode_plus`` with the Hugging Face
            tokenizer keyword arguments used in ``__getitem__``.
        max_len: Fixed sequence length every example is padded or truncated to.
    """

    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, item):
        """Return the ``item``-th example (by position) as a dict of tensors.

        Keys: ``"text"`` (str), ``"input_ids"`` and ``"attention_mask"``
        (1-D tensors of length ``max_len``), ``"targets"`` (0-dim long tensor).
        """
        # Positional lookup: correct for any index (e.g. the shuffled index a
        # train/test split leaves behind), not only a fresh 0..n-1 one, and it
        # raises IndexError past the end as the sequence protocol expects.
        # str() keeps missing values (NaN floats) from breaking the tokenizer
        # and from mixing types inside a collated batch.
        text = str(self.df["text"].iloc[item])
        target = self.df["target"].iloc[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",  # replaces the deprecated pad_to_max_length=True
            truncation=True,  # explicit cut-off so every example has max_len ids
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors="pt",
        )
        return {
            "text": text,
            # The tokenizer returns (1, max_len) tensors; drop the batch axis.
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "targets": torch.tensor(target, dtype=torch.long)
        }


In [25]:
# Hold out 30% of the rows for testing, then take 10% of the remainder for
# validation (roughly 63% / 7% / 30% train / val / test). Both splits reuse
# RANDOM_SEED so they are repeatable.
df_train, df_test = train_test_split(df, test_size=0.3, random_state=RANDOM_SEED)
df_train, df_val = train_test_split(df_train, test_size=0.1, random_state=RANDOM_SEED)

In [26]:
# Self-assignments: these lines leave all three frames unchanged.
df_train = df_train
df_test = df_test
df_val = df_val

In [27]:
df_train.dtypes

In [28]:
# Row counts of the three splits.
print(len(df_train))
print(len(df_val))
print(len(df_test))

In [29]:
# Renumber each split 0..n-1 (dropping the old index) so that label-based and
# positional row lookups agree.
df_train.reset_index(inplace=True, drop=True)
df_val.reset_index(inplace=True, drop=True)
df_test.reset_index(inplace=True, drop=True)

In [30]:
def create_data_loader(df, tokenizer, MAX_LEN, batch_size, shuffle=False, num_workers=1):
    """Wrap a DataFrame in a ``SarcDataset`` and return a batching ``DataLoader``.

    Args:
        df: DataFrame handed to ``SarcDataset``.
        tokenizer: Tokenizer handed to ``SarcDataset``.
        MAX_LEN: Maximum sequence length handed to ``SarcDataset``.
        batch_size: Number of examples per batch.
        shuffle: Reshuffle the examples at every epoch. Defaults to ``False``,
            which is what this helper always did before the option existed.
        num_workers: Worker processes used for loading. Defaults to ``1``,
            the value that used to be hard-coded.

    Returns:
        A ``torch.utils.data.DataLoader`` over the dataset.
    """
    ds = SarcDataset(df, tokenizer, MAX_LEN)
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)

In [31]:
# One loader per split, all built with the same batch size and MAX_LEN.
BATCH_SIZE = 200
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [32]:
class SarcClassifier(nn.Module):
    """Pre-trained BERT encoder followed by dropout and a linear output layer."""

    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        # The output layer maps the encoder's hidden size to one score per class.
        hidden_size = self.bert.config.hidden_size
        self.out = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the output layer's scores for the encoder's pooled output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = encoded["pooler_output"]
        return self.out(self.drop(pooled))

In [33]:
# Build the classifier with one output per class and move it to `device`.
model = SarcClassifier(len(class_names)).to(device)
# model.to(device)

In [34]:
len(class_names)

In [35]:
# Number of full passes over the training loader.
EPOCHS = 5

optimizer = Adam(model.parameters(), lr=2e-5, weight_decay=1e-5)
# The scheduler is stepped once per batch in train_epoch, so the schedule
# spans (batches per epoch) x EPOCHS steps.
total_steps = len(train_data_loader) * EPOCHS

# Linear learning-rate schedule with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)


In [36]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning one score per class for each example.
        data_loader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"targets"`` tensors.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler, stepped once per batch.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``. ``accuracy`` is a 0-dim float64 tensor;
        ``mean_loss`` is the mean of the per-batch losses (NaN when the
        loader yields no batches).
    """
    model = model.train()

    losses = []
    # A tensor accumulator (rather than the int 0) keeps `.double()` valid
    # even when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    # Discard gradients left behind by an interrupted earlier run so they do
    # not leak into the first optimisation step.
    optimizer.zero_grad()

    for d in tqdm(data_loader):
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Predicted class = index of the highest score in each row.
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)

        loss.backward()
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
        # Clip the global gradient norm before the update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    mean_loss = np.mean(losses) if losses else np.nan
    return correct_predictions.double() / n_examples, mean_loss


In [37]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without updating its weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning one score per class for each example.
        data_loader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"targets"`` tensors.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``. ``accuracy`` is a 0-dim float64 tensor;
        ``mean_loss`` is the mean of the per-batch losses (NaN when the
        loader yields no batches).
    """
    model = model.eval()

    losses = []
    # A tensor accumulator (rather than the int 0) keeps `.double()` valid
    # even when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():
        for d in tqdm(data_loader):
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Predicted class = index of the highest score in each row.
            _, preds = torch.max(outputs, dim=1)

            loss = loss_fn(outputs, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    mean_loss = np.mean(losses) if losses else np.nan
    return correct_predictions.double() / n_examples, mean_loss


In [38]:
%%time

# Per-epoch metrics keyed by name, plus the best validation accuracy seen so far.
history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    # One optimisation pass over the training split.
    train_acc, train_loss = train_epoch(
        model, train_data_loader, loss_fn, optimizer, device, scheduler, len(df_train)
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')

    # Score the updated weights on the validation split.
    val_acc, val_loss = eval_model(model, val_data_loader, loss_fn, device, len(df_val))
    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    # Write a checkpoint whenever validation accuracy improves.
    improved = val_acc > best_accuracy
    if improved:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc

In [39]:
# Score the model on the held-out test split.
# NOTE(review): this evaluates `model` as it stands after the final epoch; the
# 'best_model_state.bin' checkpoint written during training is not reloaded
# here — confirm that is intended.
test_acc, test_loss = eval_model(model, test_data_loader, loss_fn, device, len(df_test))
print(f"Test loss: {test_loss:.4f} Test Accuracy: {test_acc:.4f}")

In [40]:
def confusion_mat(model, data_loader, loss_fn, device, n_examples):
    """Collect the true and predicted labels for every example in ``data_loader``.

    Despite the name, no matrix is built here: the two label vectors are
    returned so the caller can compute whatever metrics it needs.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning one score per class for each example.
        data_loader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"targets"`` tensors.
        loss_fn: Unused; kept so existing calls keep working.
        device: Device the model inputs are moved to.
        n_examples: Unused; kept so existing calls keep working.

    Returns:
        ``(lbllist, predlist)``: two 1-D CPU tensors holding the true labels
        and the predicted labels, in loader order.
    """
    model = model.eval()

    # Gather per-batch results and concatenate once at the end; concatenating
    # inside the loop re-copies everything collected so far on every batch.
    # The empty seed tensors keep the result well-defined (and int64) even
    # when the loader yields no batches.
    pred_chunks = [torch.zeros(0, dtype=torch.long, device='cpu')]
    label_chunks = [torch.zeros(0, dtype=torch.long, device='cpu')]

    with torch.no_grad():
        for d in tqdm(data_loader):
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Predicted class = index of the highest score in each row.
            _, preds = torch.max(outputs, dim=1)

            pred_chunks.append(preds.view(-1).cpu())
            # Labels are only stored, never fed to the model, so they go
            # straight to the CPU.
            label_chunks.append(d["targets"].view(-1).cpu())

    return torch.cat(label_chunks), torch.cat(pred_chunks)

In [41]:
# True labels and predicted labels for the whole test split.
lbllist, predlist = confusion_mat(model, test_data_loader, loss_fn, device, len(df_test))

In [42]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_auc_score 
# Raw counts, with true labels passed first and predictions second.
conf_mat=confusion_matrix(lbllist, predlist)
# Counts are plotted as-is; the commented-out division would normalise them.
cm = conf_mat #/ np.sum(conf_mat)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, 
                              display_labels=class_names)
disp.plot()

In [43]:
import seaborn as sn
# Label rows and columns with the class names instead of a hard-coded range(2).
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
plt.figure(figsize = (10,10))
# fmt="g" prints full counts; the default annotation format keeps only two
# significant digits, which turns large counts into scientific notation.
sn.heatmap(df_cm, annot=True, fmt="g")

In [44]:
from sklearn.metrics import classification_report

# Per-class metrics for the test split, labelled with class_names.
print(classification_report(lbllist, predlist, target_names=class_names))

In [45]:
# Store the test metrics alongside the per-epoch training history.
history['test_acc'].append(test_acc)
history['test_loss'].append(test_loss)

In [46]:
# Prefix shared by every artifact written below.
NAME = "bert-emoji"

# NOTE(review): this pickles the whole model object, so loading it later
# presumably needs the SarcClassifier definition importable — confirm, and
# consider saving model.state_dict() as well.
torch.save(model, f"{NAME}.pth")
import pickle

# The accuracies in `history` are tensors on the training device. Move them to
# the CPU first so the pickle can also be opened on a machine without a GPU;
# non-tensor entries (the losses) are stored unchanged.
history_cpu = defaultdict(
    list,
    {
        key: [value.cpu() if torch.is_tensor(value) else value for value in values]
        for key, values in history.items()
    },
)

# One pickle per artifact: metric history, true labels, predicted labels.
for suffix, payload in (
    ("results", history_cpu),
    ("lbllist", lbllist),
    ("predlist", predlist),
):
    with open(f'{NAME}-{suffix}.pickle', 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)