In [None]:
import sys



    

In [None]:

!pip install intel-openmp
def pad_sents(sents, pad_token):
    """ Pad list of sentences according to the longest sentence in the batch.
    @param sents (list[list[int]]): list of sentences, where each sentence
                                    is represented as a list of words
    @param pad_token (int): padding token
    @returns sents_padded (list[list[int]]): list of sentences where sentences shorter
        than the max length sentence are padded out with the pad_token, such that
        each sentence in the batch now has equal length.
        Output shape: (batch_size, max_sentence_length)
        The input sentences are not modified; an empty batch returns an empty list.
    """
    if len(sents) == 0:
        # max() over an empty sequence raises ValueError, so short-circuit here.
        return []

    max_len = max(len(s) for s in sents)

    # Copy each sentence and append exactly as many pad tokens as it is short of max_len.
    return [list(s) + [pad_token] * (max_len - len(s)) for s in sents]

def sents_to_tensor(tokenizer, sents, device):
    """
    Tokenize a batch of raw sentences and pack it into padded tensors.

    Rows of every returned tensor follow the order of `sents`; this function does not sort.

    :param tokenizer: BertTokenizer
    :param sents: list[str], list of sentences (NOTE: untokenized, continuous sentences)
    :param device: torch.device
    :return: sents_tensor: torch.Tensor, shape(batch_size, max_sent_length), token ids
    :return: masks_tensor: torch.Tensor, shape(batch_size, max_sent_length), 1 for tokens, 0 for padding
    :return: sents_lengths: torch.Tensor, shape(batch_size), token count of each sentence before padding
    """
    pad = '[PAD]'

    tokenized = []
    for sentence in sents:
        tokenized.append(tokenizer.tokenize(sentence))
    lengths = list(map(len, tokenized))

    padded_batch = pad_sents(tokenized, pad)
    sents_lengths = torch.tensor(lengths, device=device)

    # A position is masked out (0) exactly when it holds the padding token.
    mask_rows = [[int(tok != pad) for tok in row] for row in padded_batch]
    masks_tensor = torch.tensor(mask_rows, dtype=torch.long, device=device)

    id_rows = []
    for row in padded_batch:
        id_rows.append(tokenizer.convert_tokens_to_ids(row))
    sents_tensor = torch.tensor(id_rows, dtype=torch.long, device=device)

    return sents_tensor, masks_tensor, sents_lengths



In [None]:
!pip install transformers
!pip install torch

from transformers import BertForSequenceClassification, BertTokenizer, AdamW
import torch
from torch import nn
import torch.nn.functional as F



In [None]:
# Define the sentiment classification model

class SentimentClassifierModel(nn.Module):
    """BERT sequence classifier that takes raw sentences and tokenizes them internally."""

    def __init__(self, bert_config, device, n_class):
        """
        :param bert_config: str, BERT configuration description
        :param device: torch.device, device that holds both the weights and the input tensors
        :param n_class: int
        """

        super(SentimentClassifierModel, self).__init__()

        self.n_class = n_class
        self.bert_config = bert_config
        self.bert = BertForSequenceClassification.from_pretrained(self.bert_config, num_labels=self.n_class)
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_config)
        self.device = device

        # forward() creates its input tensors on `device`; the weights must live on the
        # same device or the first forward pass fails with a device-mismatch error.
        self.to(device)

    def forward(self, sents):
        """
        :param sents: list[str], list of sentences (NOTE: untokenized, continuous sentences)
        :return: the output of the wrapped BertForSequenceClassification call; callers in this
                 notebook take element [0] of it as the pre-softmax scores of shape (batch_size, n_class)
        """

        sents_tensor, masks_tensor, sents_lengths = sents_to_tensor(self.tokenizer, sents, self.device)
        pre_softmax = self.bert(input_ids=sents_tensor, attention_mask=masks_tensor)

        return pre_softmax

    @staticmethod
    def load(model_path: str, device):
        """ Load the model from a file.
        @param model_path (str): path to model
        @param device (torch.device): device the restored model is placed on
        @return model (nn.Module): model with saved parameters
        """
        # Deserialize onto CPU storage first; the constructor places the model on `device`.
        params = torch.load(model_path, map_location=lambda storage, loc: storage)
        args = params['args']
        model = SentimentClassifierModel(device=device, **args)
        model.load_state_dict(params['state_dict'])

        return model

    def save(self, path: str):
        """ Save the model to a file.
        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        # 'args' holds the constructor arguments (except device) so load() can rebuild the model.
        params = {
            'args': dict(bert_config=self.bert_config, n_class=self.n_class),
            'state_dict': self.state_dict()
        }

        torch.save(params, path)

In [None]:
import pandas as pd
!pip install PyDrive


from google.colab import drive

# Google Drive mount point. It used to come from a variable defined in a cell that no longer
# exists, so the notebook failed with a NameError on a fresh kernel. The value matches the
# /content/gdrive paths read below.
pwd = '/content/gdrive'
drive.mount(pwd)

# Labeled tweets: columns `text` and `label`.
df= pd.read_csv("/content/gdrive/MyDrive/Tweets_Tagging.csv", encoding='utf-8')
# df= pd.read_csv("/content/gdrive/MyDrive/CSS/Dataset/Tweets_Tagging.csv", encoding='utf-8')
df.head()

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Unnamed: 0,text,label
0,its been already 5years serving under â€œunifo...,0.0
1,already made up my mind. its time to let go .....,-1.0
2,i miss her so much please meet me soon craving...,-1.0
3,i dont deserve this. seriously.,-1.0
4,not in the mood today. totally,-1.0


In [None]:
# Second Drive mount, this time at /content/drive.
# NOTE(review): the data and model paths used in this notebook point at /content/gdrive —
# confirm this extra mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Coerce every entry of the text column to str (non-string cells become their str() form),
# then display the frame.
list_of_text = [str(entry) for entry in df['text']]
df["text"] = list_of_text
df

Unnamed: 0,text,label
0,its been already 5years serving under â€œunifo...,0.0
1,already made up my mind. its time to let go .....,-1.0
2,i miss her so much please meet me soon craving...,-1.0
3,i dont deserve this. seriously.,-1.0
4,not in the mood today. totally,-1.0
...,...,...
1803,English is one of the most problematic languag...,0.0
1804,@theViscountBP I identify with Johor and Wilay...,0.0
1805,@addicted_marv Sure when?,0.0
1806,@AzShah_ Yes,0.0


In [None]:
# Drop every non-ASCII character from each tweet. The inner loop must walk the characters of
# `text`; walking `df.text` handed whole strings to ord() and raised a TypeError.
df.text = [''.join(ch for ch in text if ord(ch) < 128) for text in df.text]
# Remove an underscore together with the single non-space character that follows it (if any).
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal by default.
df.text = df.text.str.replace(r'_[\S]?', r'', regex=True)

TypeError: ignored

In [None]:
# Inspect the text column after the cleanup cell above.
df.text

0       its been already 5years serving under â€œunifo...
1       already made up my mind. its time to let go .....
2       i miss her so much please meet me soon craving...
3                         i dont deserve this. seriously.
4                          not in the mood today. totally
                              ...                        
1803    English is one of the most problematic languag...
1804    @theViscountBP I identify with Johor and Wilay...
1805                            @addicted_marv Sure when?
1806                                         @AzShah_ Yes
1807                      Baby we know it's for the dick.
Name: text, Length: 1808, dtype: object

In [None]:
# Every pattern below is a regular expression. regex=True is passed explicitly because
# pandas >= 2.0 defaults Series.str.replace to literal matching, which would turn this
# whole cell into a silent no-op.

# Remove URL, RT, mention(@)
df.text = df.text.str.replace(r'http(\S)+', r'', regex=True)
df.text = df.text.str.replace(r'http ...', r'', regex=True)
df.text = df.text.str.replace(r'(RT|rt)[ ]*@[ ]*[\S]+', r'', regex=True)
df.text = df.text.str.replace(r'@[\S]+', r'', regex=True)

# Remove non-ascii words or characters
# df.text = [''.join([i if ord(i) < 128 else '' for i in text]) for text in df.text]
# df.text = df.text.str.replace(r'_[\S]?',r'')

# Remove extra space
# The quantifier must be written `{2,}`: with a space inside the braces (`{2, }`) Python's
# re module matches the literal characters instead, so runs of spaces were never collapsed.
df.text = df.text.str.replace(r'[ ]{2,}', r' ', regex=True)

# Remove &, < and >
df.text = df.text.str.replace(r'&amp;?', r'and', regex=True)
df.text = df.text.str.replace(r'&lt;', r'<', regex=True)
df.text = df.text.str.replace(r'&gt;', r'>', regex=True)

# Insert space between words and punctuation marks
df.text = df.text.str.replace(r'([\w\d]+)([^\w\d ]+)', r'\1 \2', regex=True)
df.text = df.text.str.replace(r'([^\w\d ]+)([\w\d]+)', r'\1 \2', regex=True)

# Lowercased and strip
df.text = df.text.str.lower()
df.text = df.text.str.strip()

df


Unnamed: 0,text,label
0,its been already 5years serving under â € œuni...,0.0
1,already made up my mind . its time to let go ....,-1.0
2,i miss her so much please meet me soon craving...,-1.0
3,i dont deserve this . seriously .,-1.0
4,not in the mood today . totally,-1.0
...,...,...
1803,english is one of the most problematic languag...,0.0
1804,i identify with johor and wilayah because my r...,0.0
1805,sure when ?,0.0
1806,yes,0.0


In [None]:
# Number of space-separated pieces per tweet: always one more than the number of spaces.
df['text_length'] = [text.count(' ') + 1 for text in df.text]
print(df.shape)


(1808, 3)


In [None]:
# Prefix every cleaned tweet with the [CLS] marker and show the resulting column.
df['BERT_processed_text'] = df.text.radd('[CLS] ')
df['BERT_processed_text']

0       [CLS] its been already 5years serving under â ...
1       [CLS] already made up my mind . its time to le...
2       [CLS] i miss her so much please meet me soon c...
3                 [CLS] i dont deserve this . seriously .
4                   [CLS] not in the mood today . totally
                              ...                        
1803    [CLS] english is one of the most problematic l...
1804    [CLS] i identify with johor and wilayah becaus...
1805                                    [CLS] sure when ?
1806                                            [CLS] yes
1807             [CLS] baby we know it ' s for the dick .
Name: BERT_processed_text, Length: 1808, dtype: object

In [None]:
import torch
from transformers import BertTokenizer, BertForMaskedLM
# WordPiece token count of every [CLS]-prefixed tweet; used later to sort batches by length.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
df['BERT_processed_text_length'] = list(
    map(lambda sentence: len(tokenizer.tokenize(sentence)), df.BERT_processed_text)
)

In [None]:
# Missing labels are replaced with 0 and the column is cast to int.
# NOTE(review): 0 looks like the neutral code, so unlabeled tweets silently become neutral —
# confirm this is intended.
df['label'] = df['label'].fillna(0).astype(int)

In [None]:
# Inspect the per-tweet token counts.
df.BERT_processed_text_length

0       29
1       15
2       35
3        9
4        8
        ..
1803    15
1804    17
1805     4
1806     2
1807    11
Name: BERT_processed_text_length, Length: 1808, dtype: int64

In [None]:
# Raw sentiment code -> 0-based class index.
# The indices follow the order of `label_names` in the training-params cell
# (['positive', 'negative', 'neutral']), so reported class names line up with the indices the
# model is trained on. The mapping used to be derived from value_counts() order, which depends
# on class frequencies and did not match that list.
# NOTE(review): assumes the raw codes mean 1 = positive, -1 = negative, 0 = neutral — confirm
# against the annotation guide.
label_dict = {1: 0, -1: 1, 0: 2}
print(label_dict)

# Fail loudly on a code the mapping does not know rather than mislabeling it.
unknown_codes = set(df.label.unique()) - set(label_dict)
assert not unknown_codes, 'unexpected label codes: %s' % sorted(unknown_codes)

df['tweet_labels'] = [label_dict[label] for label in df.label]

# Same encoding under the column name the evaluation cell reads.
df['airline_sentiment_label'] = [label_dict[label] for label in df.label]

{0: 0}
{0: 0, 1: 1}
{0: 0, 1: 1, -1: 2}


In [None]:
# Inspect the raw label column.
df.label

0       0
1      -1
2      -1
3      -1
4      -1
       ..
1803    0
1804    0
1805    0
1806    0
1807    0
Name: label, Length: 1808, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split




In [None]:
# Define training params
# NOTE(review): label_names[i] must name the class encoded as index i by label_dict — verify
# the two stay in sync.
label_names = ['positive', 'negative', 'neutral']
model_name = 'st-sentiment'
# Use the GPU when one is attached and fall back to CPU otherwise; a hard-coded "cuda:0"
# makes every later tensor creation fail on a CPU-only runtime.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
bert_size = 'bert-base-uncased'

train_batch_size = 10 # batch size
clip_grad = 1.0 # gradient clipping
log_every = 10 # number of mini-batches before logging
max_epoch = 20 # max number of epochs
max_patience = 3 # number of iterations to wait before decaying learning rate
max_num_trial = 3 # number of trials before terminating training
lr_decay = 0.5 # learning rate decay
lr_bert = 0.00002 # BERT learning rate
lr = 0.001 # learning rate
valid_niter = 500 # perform validation after n iterations
dropout = 0.3 # dropout rate
verbose = True

prefix = model_name + '_' + bert_size
# Spelled out instead of being built from a `pwd` variable that no remaining cell defines;
# this is the same location the evaluation cells load the checkpoint from.
model_save_path = '/content/gdrive/My Drive/Colab Notebooks/' + prefix+'_model.bin'

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
training_data, validation_data = train_test_split(df, random_state=42, test_size=0.2)
print(*(len(part) for part in (df, training_data, validation_data)))

1808 1446 362


In [None]:
# Rebind df to a copy of itself and repeat the label clean-up from the earlier cell.
# training_data / validation_data were split off before this point, so they are not
# affected by anything done to df here.
df=df.copy()

df['label'] = df['label'].fillna(0).astype(int)
df

Unnamed: 0,text,label,text_length,BERT_processed_text,BERT_processed_text_length,tweet_labels,airline_sentiment_label
0,its been already 5years serving under â € œuni...,0,21,[CLS] its been already 5years serving under â ...,29,0,0
1,already made up my mind . its time to let go ....,-1,13,[CLS] already made up my mind . its time to le...,15,2,2
2,i miss her so much please meet me soon craving...,-1,25,[CLS] i miss her so much please meet me soon c...,35,2,2
3,i dont deserve this . seriously .,-1,7,[CLS] i dont deserve this . seriously .,9,2,2
4,not in the mood today . totally,-1,7,[CLS] not in the mood today . totally,8,2,2
...,...,...,...,...,...,...,...
1803,english is one of the most problematic languag...,0,14,[CLS] english is one of the most problematic l...,15,0,0
1804,i identify with johor and wilayah because my r...,0,14,[CLS] i identify with johor and wilayah becaus...,17,0,0
1805,sure when ?,0,3,[CLS] sure when ?,4,0,0
1806,yes,0,1,[CLS] yes,2,0,0


In [None]:
import pprint
pp = pprint.PrettyPrinter(indent=4)

# Inverse-frequency class weights, one entry per class index (0 .. n_class-1).
# Counting the encoded column keeps weight[i] aligned with class index i; the previous version
# counted the raw codes and iterated range(len - 1), which left out one class altogether.
train_label = dict(training_data.airline_sentiment_label.value_counts())
label_max = float(max(train_label.values()))
train_label_weight = torch.tensor(
    [label_max / train_label[i] for i in range(len(label_names))],
    device=device,
)

pp.pprint(train_label_weight)

RuntimeError: ignored

In [None]:

# Set up model and optimizer
import time
start_time = time.time()

model = SentimentClassifierModel(bert_size, device, len(label_names))
# The forward pass builds its inputs on `device`, so the weights must be there as well.
# This is done before the optimizer is created so the optimizer tracks the moved parameters.
model = model.to(device)

# Two parameter groups: the pretrained encoder uses the small BERT learning rate (the
# optimizer default below), the freshly initialised classifier head uses the larger `lr`.
optimizer = AdamW([
        {'params': model.bert.bert.parameters()},
        {'params': model.bert.classifier.parameters(), 'lr': float(lr)}
    ], lr=float(lr_bert))

print('Use device: %s' % device, file=sys.stderr)
print('Done! time elapsed %.2f sec' % (time.time() - start_time), file=sys.stderr)
print('-' * 80, file=sys.stderr)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# Util functions for training
import math
import logging
import pickle
import numpy as np
import torch
import pandas as pd
import sys
from docopt import docopt
from sklearn.metrics import accuracy_score, matthews_corrcoef, confusion_matrix, \
    f1_score, precision_score, recall_score, roc_auc_score

import matplotlib
matplotlib.use('agg')
from matplotlib import pyplot as plt

def batch_iter(data, batch_size, shuffle=False, bert=None):
    """ Yield batches of sentences and labels reverse sorted by length (largest to smallest).
    @param data (dataframe): dataframe with BERT_processed_text (str), BERT_processed_text_length (int)
        and a label column. The 0-based class-index column ('airline_sentiment_label' or
        'tweet_labels') is used when present; otherwise the raw 'label' column is used.
    @param batch_size (int): batch size
    @param shuffle (boolean): whether to randomly shuffle the dataset
    @param bert (str): unused; kept so existing calls keep working
    @yields (sents, targets): list[str] of sentences and the list of their integer targets
    """
    batch_num = math.ceil(data.shape[0] / batch_size)

    if shuffle:
        data = data.sample(frac=1)

    # Classification losses need targets in [0, n_class). The raw `label` column can hold
    # negative sentiment codes, so the encoded column is preferred whenever it exists.
    label_col = next(
        (col for col in ('airline_sentiment_label', 'tweet_labels') if col in data.columns),
        'label',
    )

    for i in range(batch_num):
        # Positional slice of the (possibly shuffled) frame, then longest sentence first.
        examples = data.iloc[i * batch_size: (i + 1) * batch_size]
        examples = examples.sort_values(by='BERT_processed_text_length', ascending=False)
        sents = list(examples.BERT_processed_text)

        targets = list(examples[label_col].values)
        yield sents, targets
        
def validation(model, df_val, bert_size, loss_func, device):
    """ validation of model during training.
    @param model (nn.Module): the model being trained
    @param df_val (dataframe): validation dataset with BERT_processed_text, BERT_processed_text_length
        and a label column. The 0-based class-index column ('airline_sentiment_label' or
        'tweet_labels') is used when present; otherwise the raw 'label' column is used.
    @param bert_size (str): unused; kept so existing calls keep working
    @param loss_func(nn.Module): loss function
    @param device (torch.device)
    @return avg loss value across validation dataset
    """
    was_training = model.training
    model.eval()

    df_val = df_val.sort_values(by='BERT_processed_text_length', ascending=False)

    # Targets must be class indices in [0, n_class); the raw `label` column can hold negative
    # codes, so the encoded column is preferred whenever it exists.
    label_col = next(
        (col for col in ('airline_sentiment_label', 'tweet_labels') if col in df_val.columns),
        'label',
    )

    ProcessedText_BERT = list(df_val.BERT_processed_text)
    InformationType_label = list(df_val[label_col])

    val_batch_size = 32

    n_batch = int(np.ceil(df_val.shape[0]/val_batch_size))

    total_loss = 0.

    try:
        with torch.no_grad():
            for i in range(n_batch):
                sents = ProcessedText_BERT[i*val_batch_size: (i+1)*val_batch_size]
                targets = torch.tensor(InformationType_label[i*val_batch_size: (i+1)*val_batch_size],
                                       dtype=torch.long, device=device)
                batch_size = len(sents)
                pre_softmax = model(sents)[0]
                batch_loss = loss_func(pre_softmax, targets)
                # Weight by batch size so the final division gives a per-example average
                # even when the last batch is smaller.
                total_loss += batch_loss.item()*batch_size
    finally:
        # Put the model back into training mode even if a batch raised.
        if was_training:
            model.train()

    return total_loss/df_val.shape[0]

def plot_confusion_matrix(y_true, y_pred, classes, normalize=False, title=None, path='cm', cmap=plt.cm.Reds):
    """
    Compute the confusion matrix, pickle it to `path`, and draw it as an annotated heat map.
    Normalization can be applied by setting `normalize=True`.

    @param y_true: true class labels
    @param y_pred: predicted class labels
    @param classes: tick labels for both axes, in class-index order
    @param normalize (bool): divide every row by its sum (per-true-class rates)
    @param title (str): plot title; a default is chosen when empty
    @param path (str): file the (possibly normalized) matrix is pickled to
    @param cmap: matplotlib colormap for the heat map
    @return ax: the matplotlib Axes the matrix was drawn on
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    # `with` closes the file deterministically; a bare open() left the handle dangling.
    with open(path, 'wb') as cm_file:
        pickle.dump(cm, cm_file)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    # Cells darker than half the maximum get white text so the number stays readable.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax


In [None]:
# Train: put the model in training mode and reset all bookkeeping for the loop below.
model.train()

# The loss object is also written to disk so the evaluation cells can reload it.
cn_loss = torch.nn.CrossEntropyLoss(reduction='mean')
torch.save(cn_loss, 'loss_func')

# Progress counters
epoch = train_iter = 0
# Early-stopping state
patience = num_trial = 0
# Running loss sums and example counts (per log window / per validation window)
report_loss = cum_loss = 0
report_examples = cum_examples = 0
# Validation loss recorded at every validation step
hist_valid_scores = list()

In [None]:
! ls 

gdrive	loss_func  sample_data


In [None]:
# Check the configured training batch size.
train_batch_size

10

In [None]:



import time

train_time = begin_time = time.time()
print('Begin Maximum Likelihood training...')

# Inside a notebook, exit(0) acts on the kernel instead of simply ending this cell, so the
# loops are terminated with this flag and ordinary `break`s.
stop_training = False

# Training loop
while not stop_training:
    epoch += 1
    for sents, targets in batch_iter(training_data, batch_size=train_batch_size, shuffle=True, bert='base'):  # for each epoch
        train_iter += 1
        optimizer.zero_grad()
        batch_size = len(sents)
        pre_softmax = model(sents)[0]

        # Calculate loss and gradient function
        loss = cn_loss(pre_softmax, torch.tensor(targets, dtype=torch.long, device=device))
        loss.backward()

        # Apply the gradient-norm clipping configured as `clip_grad` (it was defined but never used).
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

        # Next step
        optimizer.step()

        # The loss is a batch mean; multiply by the batch size so sums can be averaged per example.
        batch_losses_val = loss.item() * batch_size
        report_loss += batch_losses_val
        cum_loss += batch_losses_val

        report_examples += batch_size
        cum_examples += batch_size

        if train_iter % log_every == 0:
            print('epoch %d, iter %d, avg. loss %.2f, '
                'cum. examples %d, speed %.2f examples/sec, '
                'time elapsed %.2f sec' % (epoch, train_iter,
                    report_loss / report_examples,
                    cum_examples,
                    report_examples / (time.time() - train_time),
                    time.time() - begin_time), file=sys.stderr)

            train_time = time.time()
            report_loss = report_examples = 0.

        # perform validation
        if train_iter % valid_niter == 0:
            print('epoch %d, iter %d, cum. loss %.2f, cum. examples %d' % (epoch, train_iter,
                cum_loss / cum_examples,
                cum_examples), file=sys.stderr)

            cum_loss = cum_examples = 0.

            print('begin validation ...', file=sys.stderr)

            validation_loss = validation(model, validation_data, bert_size, cn_loss, device)   # dev batch size can be a bit larger

            print('validation: iter %d, loss %f' % (train_iter, validation_loss), file=sys.stderr)

            is_better = len(hist_valid_scores) == 0 or validation_loss < min(hist_valid_scores)
            hist_valid_scores.append(validation_loss)

            if is_better:
                patience = 0
                print('save currently the best model to [%s]' % model_save_path, file=sys.stderr)

                model.save(model_save_path)

                # also save the optimizers' state
                torch.save(optimizer.state_dict(), model_save_path + '.optim')
            elif patience < int(max_patience):
                patience += 1
                print('hit patience %d' % patience, file=sys.stderr)

                if patience == int(max_patience):
                    num_trial += 1
                    print('hit #%d trial' % num_trial, file=sys.stderr)
                    if num_trial == max_num_trial:
                        print('early stop!', file=sys.stderr)
                        stop_training = True
                        break

                    # decay lr, and restore from previously best checkpoint
                    print('load previously best model and decay learning rate to %f%%' %
                        (float(lr_decay)*100), file=sys.stderr)

                    # Compute the decayed rates from the rates in use right now. Restoring the
                    # optimizer state below resets each group's lr to its value at checkpoint
                    # time, which would otherwise cancel earlier decays.
                    decayed_lrs = [group['lr'] * float(lr_decay) for group in optimizer.param_groups]

                    # load model
                    params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
                    model.load_state_dict(params['state_dict'])
                    model = model.to(device)

                    print('restore parameters of the optimizers', file=sys.stderr)
                    optimizer.load_state_dict(torch.load(model_save_path + '.optim'))

                    # set new lr
                    for param_group, new_lr in zip(optimizer.param_groups, decayed_lrs):
                        param_group['lr'] = new_lr

                    # reset patience
                    patience = 0

    # Checked once per epoch. Previously this sat inside the validation branch, so it was only
    # evaluated when a validation step happened to fall inside the final epoch.
    if not stop_training and epoch >= int(max_epoch):
        print('reached maximum number of epochs!', file=sys.stderr)
        stop_training = True

Begin Maximum Likelihood training...


RuntimeError: ignored

In [None]:
# Inspect the training split.
training_data

Unnamed: 0,text,label,text_length,BERT_processed_text,BERT_processed_text_length
900,do u think it ' s accurate ? ðÿ ˜±,0.0,10,[CLS] do u think it ' s accurate ? ðÿ ˜±,12
1800,at first i thought it was the metal wire that ...,0.0,47,[CLS] at first i thought it was the metal wire...,50
1128,the future of pan - african fintech collaborat...,1.0,11,[CLS] the future of pan - african fintech coll...,13
964,is it because i malay ?,0.0,6,[CLS] is it because i malay ?,7
394,thk some of em find wtc too far ... weiyi not ...,0.0,32,[CLS] thk some of em find wtc too far ... weiy...,45
...,...,...,...,...,...
1130,super proud to be part of a team consistently ...,1.0,36,[CLS] super proud to be part of a team consist...,41
1294,was so excited to try this # vegetarian # chic...,1.0,46,[CLS] was so excited to try this # vegetarian ...,52
860,looking for a christmas feast this year w / o ...,0.0,24,[CLS] looking for a christmas feast this year ...,27
1459,checked out ' s stock of # sangria in ready ...,1.0,40,[CLS] checked out ' s stock of # sangria in ...,46


In [None]:
# The assignment and the first import had been fused onto one line and the remaining imports
# were indented, which is a syntax error; each statement now sits on its own line at column 0.
df['label'] = df['label'].fillna(0).astype(int)

import numpy as np
import pickle
from sklearn.metrics import accuracy_score, matthews_corrcoef, confusion_matrix, \
    f1_score, precision_score, recall_score, roc_auc_score
import matplotlib
matplotlib.use('agg')
from matplotlib import pyplot as plt

In [None]:
def plot_confusion_matrix(y_true, y_pred, classes, normalize=False, title=None, path='cm', cmap=plt.cm.Reds):
    """
    Compute the confusion matrix, pickle it to `path`, and draw it as an annotated heat map.
    Normalization can be applied by setting `normalize=True`.

    NOTE(review): a function with this name is also defined in the training-utilities cell;
    whichever definition runs last is the one in effect.

    @param y_true: true class labels
    @param y_pred: predicted class labels
    @param classes: tick labels for both axes, in class-index order
    @param normalize (bool): divide every row by its sum (per-true-class rates)
    @param title (str): plot title; a default is chosen when empty
    @param path (str): file the (possibly normalized) matrix is pickled to
    @param cmap: matplotlib colormap for the heat map
    @return ax: the matplotlib Axes the matrix was drawn on
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    # `with` closes the file deterministically; a bare open() left the handle dangling.
    with open(path, 'wb') as cm_file:
        pickle.dump(cm, cm_file)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    # Cells darker than half the maximum get white text so the number stays readable.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [None]:
    # Evaluate the best checkpoint on the held-out split and write predictions, metrics and
    # confusion matrices to files named after `prefix`.
    print('load best model...')

    model = SentimentClassifierModel.load('/content/gdrive/My Drive/Colab Notebooks/' + prefix + '_model.bin', device)

    model.to(device)

    model.eval()

    df_test = validation_data

    df_test = df_test.sort_values(by='BERT_processed_text_length', ascending=False)

    test_batch_size = 32

    n_batch = int(np.ceil(df_test.shape[0]/test_batch_size))

    cn_loss = torch.load('loss_func', map_location=lambda storage, loc: storage).to(device)

    ProcessedText_BERT = list(df_test.BERT_processed_text)
    InformationType_label = list(df_test.airline_sentiment_label)

    test_loss = 0.
    prediction = []
    prob = []

    softmax = torch.nn.Softmax(dim=1)

    with torch.no_grad():
        for i in range(n_batch):
            sents = ProcessedText_BERT[i*test_batch_size: (i+1)*test_batch_size]
            targets = torch.tensor(InformationType_label[i * test_batch_size: (i + 1) * test_batch_size],
                                   dtype=torch.long, device=device)
            # Weight each batch mean by the real batch size. The former hard-coded 10 skewed the
            # average, because the loss is divided by the total number of examples below.
            batch_size = len(sents)

            pre_softmax = model(sents)[0]
            batch_loss = cn_loss(pre_softmax, targets)
            test_loss += batch_loss.item()*batch_size
            prob_batch = softmax(pre_softmax)
            prob.append(prob_batch)

            prediction.extend([t.item() for t in list(torch.argmax(prob_batch, dim=1))])

    prob = torch.cat(tuple(prob), dim=0)
    loss = test_loss/df_test.shape[0]

    # `with` blocks close every output file deterministically.
    with open(prefix+'_test_prediction', 'wb') as out_file:
        pickle.dump([label_names[i] for i in prediction], out_file)
    with open(prefix + '_test_prediction_prob', 'wb') as out_file:
        pickle.dump(prob.data.cpu().numpy(), out_file)

    accuracy = accuracy_score(df_test.airline_sentiment_label.values, prediction)
    matthews = matthews_corrcoef(df_test.airline_sentiment_label.values, prediction)

    precisions = {}
    recalls = {}
    f1s = {}
    aucrocs = {}

    # One-vs-rest metrics: class i is the positive class, everything else is negative.
    for i in range(len(label_names)):
        prediction_ = [1 if pred == i else 0 for pred in prediction]
        true_ = [1 if label == i else 0 for label in df_test.airline_sentiment_label.values]
        f1s.update({label_names[i]: f1_score(true_, prediction_)})
        precisions.update({label_names[i]: precision_score(true_, prediction_)})
        recalls.update({label_names[i]: recall_score(true_, prediction_)})
        aucrocs.update({label_names[i]: roc_auc_score(true_, list(t.item() for t in prob[:, i]))})

    metrics_dict = {'loss': loss, 'accuracy': accuracy, 'matthews coef': matthews, 'precision': precisions,
                         'recall': recalls, 'f1': f1s, 'aucroc': aucrocs}

    with open(prefix+'_evaluation_metrics', 'wb') as out_file:
        pickle.dump(metrics_dict, out_file)

    # plot_confusion_matrix pickles the matrix to `path`. The figure is saved under a separate
    # .png name; saving it to the same path overwrote the pickle that had just been written.
    cm = plot_confusion_matrix(list(df_test.airline_sentiment_label.values), prediction, label_names, normalize=False,
                          path=prefix+'_test_confusion_matrix', title='confusion matrix for test dataset')
    plt.savefig(prefix+'_test_confusion_matrix.png', format='png')
    cm_norm = plot_confusion_matrix(list(df_test.airline_sentiment_label.values), prediction, label_names, normalize=True,
                          path=prefix+'_test_normalized_confusion_matrix', title='normalized confusion matrix for test dataset')
    plt.savefig(prefix+'_test_normalized_confusion_matrix.png', format='png')

    print('loss: %.2f' % loss)
    print('accuracy: %.2f' % accuracy)
    print('matthews coef: %.2f' % matthews)
    print('-' * 80)
    for i in range(len(label_names)):
        print('precision score for %s: %.2f' % (label_names[i], precisions[label_names[i]]))
        print('recall score for %s: %.2f' % (label_names[i], recalls[label_names[i]]))
        print('f1 score for %s: %.2f' % (label_names[i], f1s[label_names[i]]))
        print('auc roc score for %s: %.2f' % (label_names[i], aucrocs[label_names[i]]))
        print('-' * 80)

load best model...


FileNotFoundError: ignored

In [None]:

import pandas as pd
# %cd /content/drive/My Drive/CSSinsta/
# st_df = pandas.read_csv('/content/gdrive/MyDrive/2020tweets.xlsx', index_col=0, encoding='latin-1')
# st_df= pd.read_csv("/content/gdrive/MyDrive/Tweets_Tagging.csv", encoding='utf-8')
# Load the 2020 tweets spreadsheet from the mounted Drive and preview the first rows.
st_df= pd.read_excel("/content/gdrive/MyDrive/2020tweets.xlsx")

# df = pd.read_csv('2020tweets.xlsx')
st_df.head()

In [None]:
# Copy the `tweet` column to `text`, the column name the cleaning cells below work on.
st_df['text']=st_df['tweet']

In [None]:
# The original `tweet` column is no longer needed once it has been copied to `text`.
st_df = st_df.drop(columns=['tweet'])
st_df

In [None]:
# Every pattern below is a regular expression. regex=True is passed explicitly because
# pandas >= 2.0 defaults Series.str.replace to literal matching, which would turn this
# whole cell into a silent no-op.

# Remove URL, RT, mention(@)

st_df['text'] = st_df.text

st_df.text = st_df.text.str.replace(r'http(\S)+', r'', regex=True)
st_df.text = st_df.text.str.replace(r'http ...', r'', regex=True)
st_df.text = st_df.text.str.replace(r'(RT|rt)[ ]*@[ ]*[\S]+', r'', regex=True)
st_df.text = st_df.text.str.replace(r'@[\S]+', r'', regex=True)

# Remove non-ascii words or characters
st_df.text = [''.join([i if ord(i) < 128 else '' for i in text]) for text in st_df.text]
st_df.text = st_df.text.str.replace(r'_[\S]?', r'', regex=True)

# Remove extra space
# The quantifier must be written `{2,}`: with a space inside the braces (`{2, }`) Python's
# re module matches the literal characters instead, so runs of spaces were never collapsed.
st_df.text = st_df.text.str.replace(r'[ ]{2,}', r' ', regex=True)

# Remove &, < and >
st_df.text = st_df.text.str.replace(r'&amp;?', r'and', regex=True)
st_df.text = st_df.text.str.replace(r'&lt;', r'<', regex=True)
st_df.text = st_df.text.str.replace(r'&gt;', r'>', regex=True)

# Insert space between words and punctuation marks
st_df.text = st_df.text.str.replace(r'([\w\d]+)([^\w\d ]+)', r'\1 \2', regex=True)
st_df.text = st_df.text.str.replace(r'([^\w\d ]+)([\w\d]+)', r'\1 \2', regex=True)

# Lowercased and strip
st_df.text = st_df.text.str.lower()
st_df.text = st_df.text.str.strip()

# Number of space-separated pieces per tweet
st_df['text_length'] = [len(text.split(' ')) for text in st_df.text]
print(st_df.shape)


In [None]:
# Inspect the cleaned frame.
st_df

In [None]:
# Prefix every cleaned tweet with the [CLS] marker and show the resulting column.
st_df['BERT_processed_text'] = st_df.text.radd('[CLS] ')
st_df.BERT_processed_text

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Count tokens of the [CLS]-prefixed text, as the training data does for this column; the
# count used to be taken from the un-prefixed `text` column.
st_df['BERT_processed_text_length'] = [len(tokenizer.tokenize(sent)) for sent in st_df.BERT_processed_text]
st_df.BERT_processed_text_length

In [None]:
# Inspect the frame with the new token-length column.
st_df

In [None]:
# Restore the saved checkpoint and place the model on `device`.
# NOTE(review): nothing in this cell calls model.eval(); confirm the model is switched to
# evaluation mode before it is used for prediction.
model = SentimentClassifierModel.load('/content/gdrive/My Drive/Colab Notebooks/' + prefix + '_model.bin', device)

model.to(device)

In [None]:
# Order tweets from longest to shortest token count; predictions made later follow this order.
st_df = st_df.sort_values(by='BERT_processed_text_length', ascending=False)

In [None]:
# Inspect the sorted frame.
st_df

# Disabled experiment: drop the last 5000 rows.
#st_df.drop(st_df.tail(5000).index,inplace=True)



In [None]:
# Reload the loss object that the training cell saved to 'loss_func' and move it to `device`.
# NOTE(review): this unpickles a whole module object; newer PyTorch releases may refuse that
# unless weights_only=False is passed — confirm against the installed version.
cn_loss = torch.load('loss_func', map_location=lambda storage, loc: storage).to(device)

In [None]:
# Plain Python list of the [CLS]-prefixed tweets, in the sorted order of st_df.
ProcessedText_BERT = list(st_df.BERT_processed_text)

In [None]:
# Displays the whole list. NOTE(review): consider a slice such as [:5] to keep the output short.
ProcessedText_BERT

In [None]:
# Softmax over dim 1, applied below to the model's pre-softmax scores.
softmax = torch.nn.Softmax(dim=1)

In [None]:
# Class-index -> name lookup for decoding predictions.
# The order is derived from label_dict (raw code -> class index) so that labels[i] names the
# class the model was trained to emit as index i. A hand-written list in raw-code order does
# not line up with those indices.
# NOTE(review): assumes the raw codes mean -1 = negative, 0 = neutral, 1 = positive — confirm.
_sentiment_names = {-1: 'negative', 0: 'neutral', 1: 'positive'}
labels = [
    name
    for _, name in sorted((index, _sentiment_names[raw]) for raw, index in label_dict.items())
]

In [None]:
# Smoke test on the first two sentences. Only the last expression of a cell is displayed, so
# the bare names in the middle of this cell are evaluated without being shown.
sents = ProcessedText_BERT[:2]
sents


# First element of the model output: the pre-softmax scores.
pre_softmax = model(sents)[0]
pre_softmax

prob = softmax(pre_softmax)
prob
# Index of the highest-probability class for each sentence.
label_indexes = [t.item() for t in list(torch.argmax(prob, dim=1))]

In [None]:

# Decode the predicted class of the second smoke-test sentence.
prediction = labels[label_indexes[1]]
prediction


In [None]:

# Predict a class index for every tweet, in the order of ProcessedText_BERT.
predictions = []
inference_batch_size = 32  # one forward pass over the whole dataset can exhaust memory

# A freshly loaded model is in training mode; eval() switches dropout off so that the
# predictions are deterministic.
model.eval()
with torch.no_grad():
  for start in range(0, len(ProcessedText_BERT), inference_batch_size):
    sents = ProcessedText_BERT[start:start + inference_batch_size]
    pre_softmax = model(sents)[0]
    prob = softmax(pre_softmax)
    predictions.extend([t.item() for t in list(torch.argmax(prob, dim=1))])
print(predictions)


In [None]:
# Translate the predicted class indices into their names.
[labels[pred_val] for pred_val in predictions]