# BERT

In [3]:
# Import
import pandas as pd
import nltk
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import BertTokenizer
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report as clsr
from sklearn.metrics import confusion_matrix, cohen_kappa_score, plot_confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.manifold import TSNE

In [4]:
# Read the cleaned corpus once, then pull the document text and the four
# label columns out of the frame as individual Series.
df = pd.read_csv("../data/all_clean.csv")
(
    text,
    sp500_mean_label,
    sp500_last_label,
    usdx_mean_label,
    usdx_last_label,
) = (
    df[column]
    for column in (
        "text",
        "sp500_mean_label",
        "sp500_last_label",
        "usdx_mean_label",
        "usdx_last_label",
    )
)

In [3]:
# Two-way mapping between the raw label values and contiguous integer class
# ids. The distinct labels are sorted before being numbered: iterating a bare
# set() gives an arbitrary order (for strings it can differ between
# interpreter runs), which would silently change which id each label gets.
# NOTE(review): sorting assumes all label values are mutually comparable
# (e.g. all strings or all numbers) -- confirm against the CSV.
label_list = sorted(set(usdx_mean_label))
label_to_index = {label: index for index, label in enumerate(label_list)}
index_to_label = {index: label for index, label in enumerate(label_list)}

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode every document into a list of token ids (special tokens included),
# cut off at 128 ids. The loop variable is deliberately NOT called `text`:
# reusing that name would rebind the `text` Series to the last document, so
# re-running this cell would iterate over single characters instead.
input_ids = [
    tokenizer.encode(sentence,
                     add_special_tokens=True,
                     max_length=128,
                     truncation=True)
    for sentence in text
]

In [5]:
# Bring every encoded document to exactly 128 ids: shorter ones are filled at
# the end with 0, longer ones are cut at the end.
input_ids = pad_sequences(
    input_ids,
    maxlen=128,
    dtype="long",
    padding="post",
    truncating="post",
    value=0,
)

In [6]:
# One mask row per document: 1 wherever the token id is positive, 0 elsewhere.
attention_masks = [
    [1 if token_id > 0 else 0 for token_id in sent]
    for sent in input_ids
]

In [7]:
# Translate each raw label value into its integer class id.
labels = list(map(label_to_index.__getitem__, usdx_mean_label))

In [8]:
# Split ids, masks and labels in ONE call so the three stay row-aligned by
# construction, instead of relying on two separate calls sharing a seed.
# For each input the function returns its train part followed by its test part.
(
    train_input_ids, test_input_ids,
    train_attention_masks, test_attention_masks,
    train_labels, test_labels,
) = train_test_split(
    input_ids, attention_masks, labels,
    test_size=0.2, random_state=42,
)

In [9]:
# Turn each train/test piece into a torch tensor, pair by pair.
train_input_ids, test_input_ids = (
    torch.tensor(train_input_ids),
    torch.tensor(test_input_ids),
)
train_labels, test_labels = (
    torch.tensor(train_labels),
    torch.tensor(test_labels),
)
train_attention_masks, test_attention_masks = (
    torch.tensor(train_attention_masks),
    torch.tensor(test_attention_masks),
)

In [10]:
batch_size = 32

# Bundle each split's (ids, masks, labels) tensors into a dataset.
train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)

# Training batches are drawn through a RandomSampler; the test split is read
# through a SequentialSampler.
train_sampler = RandomSampler(train_data)
test_sampler = SequentialSampler(test_data)

train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
test_dataloader = DataLoader(test_data, batch_size=batch_size, sampler=test_sampler)

In [11]:
# Pretrained BERT with a classification head sized to the number of distinct
# labels. Attentions and hidden states are not returned by default; the
# training loop asks for hidden states explicitly per call.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_list),
    output_attentions=False,
    output_hidden_states=False,
)
# Move to the GPU when there is one, otherwise stay on the CPU. A bare
# model.cuda() raises on CPU-only machines, and the device is resolved here
# because the notebook only defines `device` in a later cell.
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [12]:
# Choose the compute device once: CUDA when torch reports it, CPU otherwise.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU')

There are 1 GPU(s) available.
We will use the GPU


In [13]:
# AdamW over every model parameter; the two hyper-parameters are named so
# they are easy to find and tune.
learning_rate = 1e-5
adam_epsilon = 1e-8
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon)



In [14]:
# Shared t-SNE projector used to reduce each layer's sentence vectors to 2-D.
# The seed is fixed so the layouts are repeatable between runs.
dim_reducer = TSNE(n_components=2, random_state=42)


def visualize_layerwise_embeddings(hidden_states, masks, labels, epoch, title, layers_to_visualize):
    """Plot a 2-D t-SNE view of mask-averaged embeddings for selected layers.

    For every requested layer the token vectors of each sample are averaged
    over the positions where ``masks`` is non-zero, projected to two
    dimensions with the module-level ``dim_reducer`` and drawn as a scatter
    plot coloured by ``labels``. All panels share one figure, which is saved
    as ``vis<epoch>.png`` in the current working directory.

    Args:
        hidden_states: sequence indexable by layer number; each entry is a CPU
            tensor that is summed over ``dim=1`` -- assumed to be
            (n_samples, seq_len, hidden_size), TODO confirm with the caller.
        masks: CPU tensor, assumed (n_samples, seq_len), non-zero on the
            positions that should count towards the average.
        labels: CPU tensor with one label per sample (any shape that flattens
            to n_samples).
        epoch: value used in the figure title and in the output file name.
        title: name of the sub-directory created under ``/tmp/plots``.
        layers_to_visualize: layer indices into ``hidden_states`` to plot.

    Returns:
        None. The figure is written to disk as a side effect.
    """
    import os  # local import: keeps this cell runnable without touching the import cell

    # Plain-Python replacement for the IPython-only `!mkdir -p` shell escape;
    # nothing is passed through a shell, so `title` cannot inject commands.
    os.makedirs(os.path.join("/tmp/plots", str(title)), exist_ok=True)

    num_layers = len(layers_to_visualize)
    n_cols = 3
    # Enough rows to hold every requested layer (3x3 for the usual nine).
    n_rows = max(1, -(-num_layers // n_cols))

    plt.figure(figsize=(30, 20))
    plt.suptitle("Epoch " + str(epoch) + " Embeddings", fontsize=30, y=0.92)

    labels = labels.numpy().reshape(-1)

    # Per-sample count of real tokens; clamped so an all-zero mask row cannot
    # cause a division by zero.
    token_counts = masks.sum(dim=1, keepdim=True).clamp(min=1)

    for i, layer_i in enumerate(layers_to_visualize):
        layer_embeds = hidden_states[layer_i]

        # Zero out masked (padding) positions BEFORE summing, so the result is
        # a true mean over real tokens rather than a sum over all positions
        # divided by the real-token count.
        token_weights = masks.unsqueeze(-1).to(layer_embeds.dtype)
        layer_averaged_hidden_states = torch.div((layer_embeds * token_weights).sum(dim=1), token_counts)
        layer_dim_reduced_embeds = dim_reducer.fit_transform(layer_averaged_hidden_states.detach().numpy())

        points = pd.DataFrame.from_dict({'x': layer_dim_reduced_embeds[:, 0],
                                         'y': layer_dim_reduced_embeds[:, 1],
                                         'label': labels})

        ax = plt.subplot(n_rows, n_cols, i + 1)
        sns.scatterplot(data=points, x='x', y='y', hue='label', ax=ax)
        ax.set_title("layer " + str(layer_i + 1), fontsize=20)

    plt.subplots_adjust(wspace=0.4, hspace=0.4)
    plt.savefig('vis' + str(epoch) + '.png')

In [None]:
from tqdm import tqdm

num_epochs = 10
average_loss = []  # mean training loss of each epoch, in order
for epoch_i in range(num_epochs):
    print("")
    print('Epoch {}'.format(epoch_i + 1))
    print('Training...')

    total_loss = 0
    model.train()

    # Per-batch pieces are collected in lists and concatenated ONCE per epoch.
    # Re-concatenating the full history on every batch copies all earlier
    # data again each step and makes the epoch progressively slower.
    mask_batches, label_batches = [], []
    layer_batches = None  # one list of per-batch tensors for each kept layer

    # Wrapping the loader lets tqdm advance and close the bar by itself.
    for batch in tqdm(train_dataloader, position=0):
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        model.zero_grad()
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels,
                        output_hidden_states=True,
                        return_dict=True)

        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        # Cap the gradient norm at 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Skip entry 0 of the hidden-state tuple and keep the rest.
        # detach() drops the autograd history so only the values are stored.
        hidden_states = outputs.hidden_states[1:]
        mask_batches.append(b_input_mask.detach().cpu())
        label_batches.append(b_labels.detach().cpu().view(-1, 1))

        if layer_batches is None:
            layer_batches = [[] for _ in hidden_states]
        for per_layer, layer_hidden_state_batch in zip(layer_batches, hidden_states):
            per_layer.append(layer_hidden_state_batch.detach().cpu())

    avg_train_loss = total_loss / len(train_dataloader)
    average_loss.append(avg_train_loss)
    print("Average training loss: {0:.2f}".format(avg_train_loss))

    # Cast to float so the plotting helper receives the same dtypes as when
    # these tensors were grown from float `torch.zeros` seeds.
    train_masks = torch.cat(mask_batches).float()
    train_ys = torch.cat(label_batches).float()
    train_hidden_states = tuple(torch.cat(per_layer) for per_layer in layer_batches)

    visualize_layerwise_embeddings(hidden_states=train_hidden_states,
                                   masks=train_masks,
                                   labels=train_ys,
                                   epoch=epoch_i,
                                   title='train_data',
                                   layers_to_visualize=[0, 1, 2, 3, 4, 8, 9, 10, 11])


Epoch 1
Training...


 17%|█▋        | 29/168 [00:48<07:24,  3.20s/it]

In [None]:
# Put the model in evaluation mode and score the test split batch by batch,
# collecting predicted and true class ids in two flat lists.
model.eval()
preds_test, labels_test = [], []
for batch in test_dataloader:
    b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

    # No gradients are needed for inference.
    with torch.no_grad():
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)

    batch_logits = outputs[0].detach().cpu().numpy()
    batch_truth = b_labels.to('cpu').numpy()

    # Predicted class = position of the largest logit in each row.
    preds_test.extend(np.argmax(batch_logits, axis=1).flatten())
    labels_test.extend(batch_truth.flatten())

In [None]:
# Headline test-set scores; precision, recall and F1 are macro-averaged.
for metric_name, metric_value in (
    ("Accuracy:", accuracy_score(labels_test, preds_test)),
    ("Precision:", precision_score(labels_test, preds_test, average="macro")),
    ("Recall:", recall_score(labels_test, preds_test, average="macro")),
    ("Macro F1:", f1_score(labels_test, preds_test, average="macro")),
):
    print(metric_name, metric_value)

In [None]:
# Training-loss curve: one blue dot per epoch, joined by a line.
plt.plot(average_loss, "-ob")
plt.xlabel("Epoch")
plt.ylabel("Average Loss")
# `plt.sivefig` was a typo for `savefig`; it raised AttributeError, so neither
# the figure nor the CSV below was ever written.
plt.savefig('avgloss.png')

# Keep the raw per-epoch numbers next to the figure.
pd.DataFrame(average_loss).to_csv("avgloss.csv")

In [None]:
# Map integer class ids back to the original label values.
labels_test_txt = [index_to_label[index] for index in labels_test]
preds_test_txt = [index_to_label[index] for index in preds_test]


def plot_matrix(y_true, y_pred, title='', labels=None):
    """Draw an annotated confusion-matrix heatmap and save it as ``cfm.png``.

    Every cell shows its count and its share of all samples. For a 2x2 matrix
    the cells are additionally tagged True Neg / False Pos / False Neg /
    True Pos; larger matrices are annotated with count and share only.

    Args:
        y_true: true class labels.
        y_pred: predicted class labels, same length as ``y_true``.
        title: title placed on the heatmap.
        labels: optional list of label values forwarded to
            ``confusion_matrix`` to select and order the classes.

    Returns:
        The matplotlib Axes that holds the heatmap.
    """
    cf_matrix = confusion_matrix(y_true, y_pred, labels=labels)
    flat_counts = cf_matrix.flatten()

    group_counts = ["{0:0.0f}".format(value) for value in flat_counts]
    group_percentages = ["{0:.2%}".format(value) for value in
                         flat_counts / np.sum(cf_matrix)]

    if cf_matrix.shape == (2, 2):
        group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
        annotations = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
                       zip(group_names, group_counts, group_percentages)]
    else:
        # The four binary tags have no meaning for more than two classes.
        annotations = [f"{v2}\n{v3}" for v2, v3 in
                       zip(group_counts, group_percentages)]
    annotations = np.asarray(annotations).reshape(cf_matrix.shape)

    ax = sns.heatmap(cf_matrix, annot=annotations, fmt='', cmap='Blues')
    # Set the title separately: chaining `.set(...)` onto the heatmap call
    # would return the result of `.set`, not the Axes.
    ax.set(title=title)
    plt.savefig('cfm' + '.png')

    return ax


# Class order follows the integer ids, so row/column i of the matrix is class i.
class_labels = [index_to_label[index] for index in sorted(index_to_label)]

# `cnf_matrix` is the raw count matrix (an ndarray), which is what the
# following heatmap cell transposes and plots.
cnf_matrix = confusion_matrix(labels_test_txt, preds_test_txt, labels=class_labels)
plot_matrix(labels_test_txt, preds_test_txt, labels=class_labels);


In [None]:
# Second view of the confusion matrix: transposed, so rows are predicted
# labels and columns are true labels, annotated with integer counts.
plt.grid(False)
ax = sns.heatmap(
    cnf_matrix.T,
    square=True,
    annot=True,
    fmt='d',
    cbar=False,
    xticklabels=['Down', 'Up'],
    yticklabels=['Down', 'Up'],
    cmap='summer_r',
)
ax.set_xlabel('true label')
ax.set_ylabel('predicted label')
# NOTE(review): the file name looks inherited from another notebook -- confirm
# it is the intended output name.
plt.savefig('reddit-nb-cm.png')

In [None]:
# Per-class precision / recall / F1 table, with class ids 0 and 1 shown as
# 'Down' and 'Up'.
report = clsr(labels_test, preds_test, target_names=['Down', 'Up'])
print(report)