In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.metrics import f1_score
#import re
#import time
from tqdm import tqdm
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

#import nltk
#from bs4 import BeautifulSoup
import transformers
from transformers import AdamW

import torch.optim as optimizers
#from torch.utils.data import random_split
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
#set random seed
def set_seed(seed: int = 123):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = True  # type: ignore
    
set_seed(1234)

In [4]:
all_data = pd.read_csv('data/nlp_prepared_bert.csv')

# pd.read_csvでDataFrameを読み込むとデータ内の「””」が「nan」に置換されるバグ？があったので、再び[""]に戻す
all_data.html_raw = all_data.html_raw.fillna("").astype(str)

In [5]:
all_data.head()

Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state,data_type,html_compiled,html_raw
0,0,4001-5000,CH,29,publishing,young adult,"<div class=""contents""><div><span class=""bold"">...",0.0,train,<div><div><span>Mark Saggia</span> is an Itali...,Mark Saggia is an Italian writer who emigrated...
1,1,3001-4000,NL,34,fashion,ready-to-wear,"<div class=""contents""><div><h1 class=""page-anc...",0.0,train,"<div><div><h1>Hello, I am Augustinas. I am a g...","Hello, I am Augustinas. I am a graphic designe..."
2,2,19001-20000,US,30,food,spaces,"<div class=""contents""><div><p> As our society ...",0.0,train,<div><div><p> As our society begins to wake up...,As our society begins to wake up from the han...
3,3,2001-3000,US,41,technology,3d printing,"<div class=""contents""><div><p>My name is Donal...",0.0,train,<div><div><p>My name is Donald Osborne and I a...,My name is Donald Osborne and I am an entrepre...
4,4,2001-3000,GB,29,technology,diy electronics,"<div class=""contents""><div><div class=""templat...",1.0,train,<div><div><div> <figure> <img> </figure> </div...,"We all love to play, don't we! No matter the ..."


In [6]:
class HtmlDataset(Dataset):
    def __init__(self, csv_file, transform, stage):
        self.csv_file = csv_file
        self.transform = transform
        self.stage = stage
    
    def __len__(self):
        return len(self.csv_file)
    
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        html_text = self.csv_file.html_raw.iloc[idx]
        html_text = str(html_text)
        if self.stage == "train":
            label = self.csv_file.state.iloc[idx]
        elif self.stage == "eval":
            label = self.csv_file.id.iloc[idx]
            
        if self.transform:
            html_text = self.transform(html_text)
            
        return html_text, label

In [7]:
#define transform
class BERT_Tokenize(object):
    def __init__(self, model_type, max_len):
        self.max_len = max_len
        
        if model_type == "BERT":
            from transformers import BertTokenizer
            self.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
            
        elif model_type == "ALBERT":
            from transformers import AlbertTokenizer
            self.bert_tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
            
        elif model_type == "XLNET":
            from transformers import XLNetTokenizer
            self.bert_tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
        
        elif model_type == "ROBERTA":
            from transformers import RobertaTokenizer
            self.bert_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
        
        elif model_type == "XLMROBERTA":
            from transformers import XLMRobertaTokenizer
            self.bert_tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
            
        elif model_type == "ELECTRA":
            from transformers import ElectraTokenizer
            self.bert_tokenizer = ElectraTokenizer.from_pretrained("google/electra-base-discriminator")
            
    
    def __call__(self,text):
        inputs = self.bert_tokenizer.encode_plus(
                        text,                       # Sentence to encode.
                        add_special_tokens = True,  # Add '[CLS]' and '[SEP]'
                        max_length = self.max_len,  # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,  # Construct attn. masks.
                        return_tensors = "pt"
                   )
        
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        del text, inputs
        return ids, mask

In [9]:
class BertHead(nn.Module):
    def __init__(self, model_type, out_dim, stage):
        super(BertHead, self).__init__()
        if model_type == "ALBERT":
            from transformers import AlbertTokenizer, AlbertModel
            self.base_model = AlbertModel.from_pretrained("albert-base-v2")
            
        elif model_type == "BERT":
            from transformers import BertTokenizer, BertModel
            self.base_model = BertModel.from_pretrained("bert-base-uncased")
            
        elif model_type == "XLNET":
            from transformers import XLNetTokenizer, XLNetModel
            self.base_model = XLNetModel.from_pretrained("xlnet-base-cased")
            
        elif model_type == "ROBERTA":
            from transformers import RobertaTokenizer, RobertaModel
            self.base_model = RobertaModel.from_pretrained("roberta-base")
        
        elif model_type == "XLMROBERTA":
            from transformers import XLMRobertaTokenizer, XLMRobertaModel
            self.base_model = XLMRobertaModel.from_pretrained("xlm-roberta-base")
        
        elif model_type == "ELECTRA":
            from transformers import ElectraTokenizer, ElectraModel
            self.base_model = ElectraModel.from_pretrained("google/electra-base-discriminator")
            
        
        self.stage = stage
        dropout = 0.2
        self.classifier = nn.Sequential(
            nn.Linear(768, 768), nn.ReLU(), nn.Dropout(p=dropout),
            nn.Linear(768, 768), nn.ReLU(), nn.Dropout(p=dropout),
            nn.Linear(768, out_dim))
         
        """
        for param in self.base_model.parameters():
            param.requires_grad = True
        """
        
        # for pooler function
        self.dense = nn.Linear(768, 768)
        self.activation = nn.Tanh()
        
    def pooler(self, hidden_states):
        first_token_tensor = hidden_states[0][:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output
    
    def forward(self, x):
        ids, mask = x
        x = self.base_model(input_ids=torch.squeeze(ids,dim=1), attention_mask=torch.squeeze(mask, dim=1))
        if self.stage == "train":
            x = self.classifier(x[1])
            preds = torch.sigmoid(x)
        else:
            try:
                preds = torch.tensor(x[1])
            except:
                preds = torch.tensor(self.pooler(x[1]))
            
        del ids, mask
        return preds

In [10]:
def extract_embed_output(model_type="BERT"):
    # config
    csv_file = all_data
    max_length = 512
    bert_tokenizer = BERT_Tokenize(model_type, max_length)
    transform = bert_tokenizer
    out_dim = 1
    batch_size = 16
    num_workers = 4
    stage = "eval"
    
    train_dataset = HtmlDataset(csv_file=csv_file,
                                transform=bert_tokenizer,
                                stage=stage)
    train_dataloader = DataLoader(train_dataset,
                                 batch_size=batch_size,
                                 num_workers=num_workers)
    model = BertHead(model_type=model_type,
                    out_dim=out_dim,
                    stage=stage)
    
    preds = torch.tensor([])
    ids = []
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()
    for x, t in tqdm(train_dataloader):
        x[0], x[1] = x[0].to(device), x[1].to(device)
        x = model(x)
        preds = torch.cat((preds, x.to("cpu")), dim=0)
        ids += list(map(int, t.to("cpu")))
        
    preds = torch.squeeze(preds)
    preds = preds.tolist()
    prediction_df = pd.DataFrame(preds)
    rename_dict = {}
    for n in range(768):
        rename_dict[n] = f"{model_type}_{n}"
    prediction_df = prediction_df.rename(columns=rename_dict)
    prediction_df["id"] = ids
    
    return prediction_df

In [11]:
model_list = ["BERT", "ALBERT", "ROBERTA", "XLMROBERTA", ]　　#　"XLNET"は最後の最後(tqdmで1319/1319)でフリーズするバグがあった "ELECTORA"未検証
for model in model_list:
    embed_out_df = extract_embed_output(model_type=model)
    embed_out_df.to_csv(f'{model}_embeded.csv', index=False)
    


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=798011.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467042463.0, style=ProgressStyle(descri…




  0%|          | 0/1319 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. 