In [1]:
from transformers import AutoTokenizer, BertTokenizerFast,AutoModel,AutoModelForTokenClassification
from transformers import pipeline

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from torch import optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from tqdm.notebook import tqdm
robertal = "roberta-large"
bert = "dslim/bert-base-NER"
# tokenizer = AutoTokenizer.from_pretrained(robertal)
# model = AutoModelForTokenClassification.from_pretrained(robertal)

# nlp = pipeline("ner", model=model, tokenizer=tokenizer)
# example = "My name is Wolfgang and I live in Berlin"

# ner_results = nlp(example)
# print(ner_results)

# Data Preparation

In [2]:
df = pd.read_csv('data/product_ids - feature_table.csv')

In [24]:
def extract_location(text, keyword):
    start = text.index(keyword)
    end = start+len(keyword)
    return [start,end]

In [26]:
df['location']=df.apply(lambda x: extract_location(x['review_text'],x['review_keyword']),axis=1)

In [27]:
df

Unnamed: 0,feature_id,feature_name,review_id,review_text,review_keyword,location
0,1,easy to use,4,The Amazon Fire TV Omni is a big deal for Amaz...,easier to use,"[95, 108]"
1,2,price,4,The Amazon Fire TV Omni is a big deal for Amaz...,a price that's a little more premium,"[149, 185]"
2,3,performance,4,The Amazon Fire TV Omni is a big deal for Amaz...,disappointing performance,"[119, 144]"
3,4,resolution,11,The TCL 3 Series S325 is an entry-level 1080p ...,the highest supported resolution of 1080p @ 60Hz,"[541, 589]"
4,4,resolution,11,The TCL 3 Series S325 is an entry-level 1080p ...,an entry-level 1080p,"[25, 45]"
5,5,picture quality,11,The TCL 3 Series S325 is an entry-level 1080p ...,mediocre picture quality.,"[58, 83]"
6,6,smart interface,11,The TCL 3 Series S325 is an entry-level 1080p ...,The TV's Roku smart interface is the same as h...,"[591, 678]"


# Tokenizer and Model

In [7]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [40]:
class BertModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.bert = AutoModel.from_pretrained(config['model_name'])  # BERT model
        self.dropout = nn.Dropout(p=config['dropout'])
        self.config = config
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 512)
        self.fc3 = nn.Linear(512, 1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        logits = self.fc1(outputs[0])
        logits = self.fc2(self.dropout(logits))
        logits = self.fc3(self.dropout(logits)).squeeze(-1)
        return logits
    
    
config = {
    "max_length": 416,
    "padding": "max_length",
    "return_offsets_mapping": True,
    "truncation": "only_second",
    "model_name": 'bert-base-uncased',#"roberta-large",
    "dropout": 0.2,
    "lr": 1e-5,
    "test_size": 0.2,
    "seed": 1268,
    "batch_size": 8
}

model = BertModel(config)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Dataset and Dataloader

In [31]:
class NerTrainDataset(Dataset):
    def __init__(self, data,tokenizer, config):
        self.data = data
        self.tokenizer = tokenizer
        self.config = config
    
    def __len__(self):
        return len(self.data)
    
    def get_label(self, offset_mapping,location):
        labels = np.zeros(len(offset_mapping))
        start,end = location
        for i,offsets in enumerate(offset_mapping):
            s,e = offsets
            if s >= start and e <= end:
                labels[i]=1
        return labels
    
    def __getitem__(self, idx):
        example = self.data.loc[idx]
        
        tokenized = self.tokenizer(
            example["feature_name"],
            example["review_text"],
            truncation = self.config['truncation'],
            max_length = self.config['max_length'],
            padding = self.config['padding'],
            return_offsets_mapping = self.config['return_offsets_mapping']
        )
        tokenized["sequence_ids"] = tokenized.sequence_ids()
        
       # print(example['location'])
        input_ids = np.array(tokenized["input_ids"])
        attention_mask = np.array(tokenized["attention_mask"])
        token_type_ids = np.array(tokenized["token_type_ids"])
        offset_mapping = np.array(tokenized["offset_mapping"])
        sequence_ids = np.array(tokenized["sequence_ids"]).astype("float16")
        location = example['location']

        label = self.get_label(offset_mapping,location)
        

        return input_ids, attention_mask, token_type_ids, offset_mapping, sequence_ids, label

In [32]:
data = NerTrainDataset(df, tokenizer,config)
dataloader = DataLoader(data, batch_size=1, shuffle=False)

# Training

In [77]:
import torch.nn.functional as F 
def training(dl,optimizer,epoches = 5,best = 1):
    model.train()
    losses =[]
    for i in tqdm(range(epoches)):
        for batch in dl:
            input_ids = batch[0]
            attention_mask = batch[1]
            token_type_ids = batch[2]
            offset_mapping = batch[3]
            sequence_ids = batch[4]
            label = batch[5]
            
            weight_0 = 1- sum(label[0]==0)/len(label[0])
            weight_1 = 1-weight_0
            weight = torch.where(label>0,weight_1,weight_0)
            
            
            logits = model(input_ids, attention_mask, token_type_ids)
            loss = F.binary_cross_entropy_with_logits(logits,label,weight = weight)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
            
        train_loss = np.mean(losses)
        print('the train loss is:',train_loss)
        if train_loss < best:
            best = train_loss
            torch.save(model.state_dict(),'ner2.pt')
        ### END SOLUTION
#         valid_loss, valid_acc = valid_metrics(model, valid_dl)
#         print("train loss  %.3f val loss %.3f and accuracy %.3f" % (
#             train_loss, valid_loss, valid_acc))       

In [78]:
learning_rate = 0.0001
wd = 1e-5
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,weight_decay= wd)

In [79]:
training(dataloader,optimizer,epoches = 5,best = 1)

  0%|          | 0/5 [00:00<?, ?it/s]

the train loss is: 0.02050299409790427
the train loss is: 0.017183706271089078
the train loss is: 0.01958349810447427
the train loss is: 0.01862410786085852
the train loss is: 0.017452359542657433


In [52]:
batch = next(iter(dataloader))

In [80]:
model.load_state_dict(torch.load('ner2.pt'))

<All keys matched successfully>

In [81]:
# batch = next(iter(dataloader))
input_ids = batch[0]
attention_mask = batch[1]
token_type_ids = batch[2]
offset_mapping = batch[3]
sequence_ids = batch[4]
label = batch[5]

In [82]:
logits = model(input_ids, attention_mask, token_type_ids)
predicted = logits.detach().cpu().numpy()
offset_mapping = offset_mapping.numpy()
sequence_ids = sequence_ids.numpy()

In [84]:
for pred, offsets, seq_ids in zip(predicted, offset_mapping, sequence_ids):
    pred = 1 / (1 + np.exp(-pred)) # which is sigmoid function    
    start_idx = None
    end_idx = None
    
    for pred, offset, seq_id in zip(pred, offsets, seq_ids):
        if not seq_id or seq_id == 0:
            continue
    
        if pred > 0.5:
            if not start_idx:
                start_idx = offset[0]
            end_idx = offset[1]
            
        elif start_idx:
            print("Current index", f"{start_idx} {end_idx}")
            print("Word: ", text[start_idx:end_idx])
            start_idx = None

Current index 27 202
Word:   is a very basic, entry-level 4k TV. Along with the Fire TV Omni, it's one of the first Amazon-branded TVs. It's best-suited for a dark room, as the VA panel delivers deep bla
Current index 204 236
Word:  s but can't get bright enough to
