In [1]:
import numpy as np
import pandas as pd
import re, pickle, nltk
# nltk.download('stopwords')
# from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm
# from sklearn.ensemble import RandomForestClassifier
import os
current_path = os.getcwd()
import sentiment_custom_for_training as sent
from transformers import pipeline
import torch


In [3]:
def data_load(df_own_path, df_twt_path_train, df_twt_path_test):

    df_twt_train = pd.read_csv(df_twt_path_train, sep=",")
    df_twt_test = pd.read_csv(df_twt_path_test, sep=",")
    df_own = pd.read_table(df_own_path, sep=" ")
    x_own_raw, y_own_raw = df_own.Sentence.values, df_own.Label.values

    df_twt_train["sentiment"].replace(to_replace="negative", value=0, inplace=True)
    df_twt_train["sentiment"].replace(to_replace="neutral", value=2, inplace=True)
    df_twt_train["sentiment"].replace(to_replace="positive", value=1, inplace=True)
    df_twt_test["sentiment"].replace(to_replace="negative", value=0, inplace=True)
    df_twt_test["sentiment"].replace(to_replace="neutral", value=2, inplace=True)
    df_twt_test["sentiment"].replace(to_replace="positive", value=1, inplace=True)

    x_raw, y_raw = list(df_twt_train.text.values) + (list(x_own_raw)), list(df_twt_train.sentiment.values) + (list(y_own_raw))
    documents = sent.pre_processing(x_raw)

    test_texts, test_labels = sent.pre_processing(list(df_twt_test.text.values)), list(df_twt_test.sentiment.values)

    train_texts, train_labels = documents, y_raw
    train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)
    
    return train_texts, val_texts, train_labels, val_labels, test_texts, test_labels

df_own_path = current_path + "/sentiment_data_model/sentiment_data/sentiment_data.txt"
df_twt_path_train, df_twt_path_test = current_path + "/sentiment_data_model/sentiment_data/train.csv", current_path + "/sentiment_data_model/sentiment_data/test.csv"

train_texts, val_texts, train_labels, val_labels, test_texts, test_labels = data_load(df_own_path, df_twt_path_train, df_twt_path_test)

In [4]:
from transformers import BertTokenizer
# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# # Print the original sentence.
# print('Original: ', train_texts[0])
# # Print the sentence split into tokens
# print('Tokenized: ', tokenizer.tokenize(train_texts[0]))
# # Print the sentence mapped to token ids.
# print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(train_texts[0])))

Loading BERT tokenizer...


In [5]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [6]:
np.shape(train_labels)

(22030,)

In [7]:
print(len(train_encodings))
# print(train_encodings)
print(len(train_encodings.input_ids))


3
22030


In [8]:
import torch

class OwnData(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = OwnData(train_encodings, train_labels)
val_dataset = OwnData(val_encodings, val_labels)
test_dataset = OwnData(test_encodings, test_labels)

In [9]:
test_dataset[0]

{'input_ids': tensor([  101,  2197,  5219,  1997,  1996,  2154,  8299,  1056,  9148, 25856,
          2594,  4012,  6163,  9351,  2232,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'labels': tensor(2)}

In [13]:
from torch.utils.data import DataLoader
from transformers import BertForSequenceClassification, AdamW
import torch.nn as nn

loss_fn = nn.CrossEntropyLoss()

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

# from transformers import BertTokenizer
# # Load the BERT tokenizer.
# print('Loading BERT tokenizer...')
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model.to(device)
model.train()
epoch_num = 5

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(epoch_num):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # print(labels.size())
        # print(labels)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # print(outputs)
        # print(outputs[0])
        loss = outputs[0]
        loss.backward()
        optim.step()

    model.eval()
    # Tracking variables
    val_accuracy = []
    val_loss = []

    # For each batch in our validation set...
    for batch_val in val_loader:
        # Load batch to GPU
        # b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
        input_ids = batch_val['input_ids'].to(device)
        attention_mask = batch_val['attention_mask'].to(device)
        labels = batch_val['labels'].to(device)

        # Compute logits
        with torch.no_grad():
            logits = model(input_ids, attention_mask)

        print("logits[0]: ", logits[0].size())
        print("logits: ", logits.size())

        # Compute loss
        loss = loss_fn(logits[0], labels)
        val_loss.append(loss.item())

        # Get the predictions
        preds = torch.argmax(logits[0], dim=1).flatten()

        # Calculate the accuracy rate
        accuracy = (preds == labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    print("epoch: ", epoch)
    print("val_loss: ", val_loss)
    print("val_accuracy: ", val_accuracy)

cpu


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [4]:
"""
Load fine-tuned model
"""

# # from transformers import BertTokenizer, BertForSequenceClassification, pipeline
# # Load the BERT tokenizer.
# print('Loading BERT tokenizer...')
# model_path = current_path + "/finetune_bert"
# tokenizer_path = model_path + "/tokenizer/tokenizer_finetune_bert"

# tokenizer = BertTokenizer.from_pretrained(tokenizer_path, do_lower_case=True)
# model = BertForSequenceClassification.from_pretrained(model_path)

Loading BERT tokenizer...


In [34]:
classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
print(classifier('yes obese'))
print(classifier('i am not obese'))
print(classifier('not really'))
print(classifier('why not'))

[{'label': 'LABEL_1', 'score': 0.9273343086242676}]
[{'label': 'LABEL_0', 'score': 0.9883109927177429}]
[{'label': 'LABEL_0', 'score': 0.9912934899330139}]
[{'label': 'LABEL_1', 'score': 0.9699274897575378}]


In [41]:
type(classifier('why not')[0]['label']=='LABEL_0')
classifier('why not')[0]['label']=='LABEL_1'

True

In [6]:
text = "why not"
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)
print(outputs)

SequenceClassifierOutput(loss=None, logits=tensor([[-0.6235,  0.5978]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)


In [7]:
text = "not really"
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)
print(outputs)

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.7551, -0.8632]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)


In [8]:
text = "i am not obese"
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)
print(outputs)

SequenceClassifierOutput(loss=None, logits=tensor([[-0.3874,  0.4531]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)


In [10]:
text = "no i am not obese"
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)
print(outputs)

SequenceClassifierOutput(loss=None, logits=tensor([[-0.4157,  0.3758]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)


In [11]:
text = "yes obese"
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)
print(outputs)

SequenceClassifierOutput(loss=None, logits=tensor([[-1.0982,  0.8526]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)


In [17]:
text = "what a sad day"
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)
print(outputs)
print(torch.argmax(outputs[0]))

SequenceClassifierOutput(loss=None, logits=tensor([[0.1410, 0.2257]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
tensor(1)


In [14]:
# save
# model_path = current_path + "/finetune_bert"
# tokenizer_path = model_path + "/tokenizer/tokenizer_finetune_bert"

# model.save_pretrained(model_path)
# tokenizer.save_pretrained(tokenizer_path)

('/home/xinsun/Dev_env/dev/test_bot/custom_component/finetune_bert/tokenizer/tokenizer_finetune_bert/tokenizer_config.json',
 '/home/xinsun/Dev_env/dev/test_bot/custom_component/finetune_bert/tokenizer/tokenizer_finetune_bert/special_tokens_map.json',
 '/home/xinsun/Dev_env/dev/test_bot/custom_component/finetune_bert/tokenizer/tokenizer_finetune_bert/vocab.txt',
 '/home/xinsun/Dev_env/dev/test_bot/custom_component/finetune_bert/tokenizer/tokenizer_finetune_bert/added_tokens.json')

In [153]:
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=True)

for test in test_loader:
        print(test)        
        input_ids = test['input_ids'].to(device)
        attention_mask = test['attention_mask'].to(device)
        labels = test['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
 

{'input_ids': tensor([[ 101, 2428, 3147, 2154,  102,    0,    0],
        [ 101, 2017, 2024, 2157,  102,    0,    0],
        [ 101, 1045, 2514, 2307,  102,    0,    0],
        [ 101, 2498, 2569,  102,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 0, 0, 0]]), 'labels': tensor([0, 1, 1, 0])}
{'input_ids': tensor([[  101,  2748,   102,     0,     0,     0,     0],
        [  101,  1045,  2572,  6517,   102,     0,     0],
        [  101,  1997,  2607,  2025, 15578,  3366,   102],
        [  101,  1045,  2572,  4452,  2053,   102,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 0, 0, 0, 0],
        [1,

In [147]:
for i in test_encodings["input_ids"]:
    print(tokenizer.decode(i))

[CLS] nothing special [SEP] [PAD] [PAD] [PAD]
[CLS] i would say no [SEP] [PAD]
[CLS] yes [SEP] [PAD] [PAD] [PAD] [PAD]
[CLS] i am sad [SEP] [PAD] [PAD]
[CLS] really cold day [SEP] [PAD] [PAD]
[CLS] i don think so [SEP] [PAD]
[CLS] i feel great [SEP] [PAD] [PAD]
[CLS] of course not obese [SEP]
[CLS] i am afraid no [SEP] [PAD]
[CLS] you are right [SEP] [PAD] [PAD]
