In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()


%matplotlib inline

from matplotlib import pyplot as plt

import seaborn as sns
import re


import warnings
warnings.filterwarnings("ignore")

In [3]:
#ドライブへのアクセス
from google.colab import drive
drive.mount('/content/drive')
%cd "/content/drive/My Drive/00_datascience/19_ufj_bank"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/00_datascience/19_ufj_bank


In [4]:
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
#sub = pd.read_csv('input/sample_submit.csv')

# bertの設定

In [5]:
#htmlタグのクリーニング
def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

In [6]:
#bertによる特徴抽出
import torch
import transformers

from transformers import BertTokenizer


class BertSequenceVectorizer:
    def __init__(self, model_name="bert-base-uncased", max_len=512):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model_name = model_name
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.bert_model = transformers.BertModel.from_pretrained(self.model_name)
        self.bert_model = self.bert_model.to(self.device)
        self.max_len = max_len

    def vectorize(self, sentence: str) -> np.array:
        inp = self.tokenizer.encode(sentence)
        len_inp = len(inp)

        if len_inp >= self.max_len:
            inputs = inp[:self.max_len]
            masks = [1] * self.max_len
        else:
            inputs = inp + [0] * (self.max_len - len_inp)
            masks = [1] * len_inp + [0] * (self.max_len - len_inp)

        inputs_tensor = torch.tensor([inputs], dtype=torch.long).to(self.device)
        masks_tensor = torch.tensor([masks], dtype=torch.long).to(self.device)

        bert_out = self.bert_model(inputs_tensor, masks_tensor)
        seq_out, pooled_out = bert_out['last_hidden_state'], bert_out['pooler_output']

        if torch.cuda.is_available():    
            return seq_out.cpu().detach().numpy() # 0番目は [CLS] token, 768 dim の文章特徴量
        else:
            return seq_out.detach().numpy()

In [7]:
BSV = BertSequenceVectorizer()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# train_dataのcsv出力

In [8]:
#htmlタグのクリーニングの実行
for i in range(len(train)):
  train.loc[i, 'cleaned_text'] = remove_html(train.loc[i, 'html_content'])

In [9]:
#全部一気に乗らないため、前半後半を分けて出力
train1 = train.loc[:5000,]
train2 = train.loc[5001:,]

In [10]:
seq_out_train1 = train1['cleaned_text'].progress_apply(lambda x: BSV.vectorize(x))

  0%|          | 0/5001 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (590 > 512). Running this sequence through the model will result in indexing errors


In [11]:
#ベースのデータフレーム作る
context_mean_f = np.mean(seq_out_train1[0], axis=0)
batch_mean_f = np.mean(context_mean_f, axis=0)
seq_train_df1 = pd.DataFrame(batch_mean_f).T

#最終層のデータを768次元に落とす
for i in range(1, len(train1)):
  context_mean = np.mean(seq_out_train1[i], axis=0)
  batch_mean = np.mean(context_mean, axis=0)
  seq_train_df1 = seq_train_df1.append(pd.DataFrame(batch_mean).T)

seq_train_df1.to_csv('input/05_seq_train_df1.csv')

In [10]:
#後半の実装
seq_out_train2 = train2['cleaned_text'].progress_apply(lambda x: BSV.vectorize(x))

#ベースのデータフレーム作る
context_mean_f = np.mean(seq_out_train2[5001], axis=0)
batch_mean_f = np.mean(context_mean_f, axis=0)
seq_train_df2 = pd.DataFrame(batch_mean_f).T

#最終層のデータを768次元に落とす
for i in range(5002, len(train)):
  context_mean = np.mean(seq_out_train2[i], axis=0)
  batch_mean = np.mean(context_mean, axis=0)
  seq_train_df2 = seq_train_df2.append(pd.DataFrame(batch_mean).T)

seq_train_df2.to_csv('input/05_seq_train_df2.csv')

  0%|          | 0/4790 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (976 > 512). Running this sequence through the model will result in indexing errors


# test dataのcsv出力

In [8]:
for i in range(len(test)):
  test.loc[i, 'cleaned_text'] = remove_html(test.loc[i, 'html_content'])

In [9]:
#全部一気に乗らないため、前半後半を分けて出力
test1 = test.loc[:5000,]
test2 = test.loc[5001:,]

前半の実行

In [10]:
seq_out_test1 = test1['cleaned_text'].progress_apply(lambda x: BSV.vectorize(x))

  0%|          | 0/5001 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (624 > 512). Running this sequence through the model will result in indexing errors


In [11]:
#ベースのデータフレーム作る
context_mean_f = np.mean(seq_out_test1[0], axis=0)
batch_mean_f = np.mean(context_mean_f, axis=0)
seq_test_df1 = pd.DataFrame(batch_mean_f).T

#最終層のデータを768次元に落とす
for i in range(1, len(test1)):
  context_mean = np.mean(seq_out_test1[i], axis=0)
  batch_mean = np.mean(context_mean, axis=0)
  seq_test_df1 = seq_test_df1.append(pd.DataFrame(batch_mean).T)

seq_test_df1.to_csv('input/05_seq_test_df1.csv')

後半の実行

In [10]:
seq_out_test2 = test2['cleaned_text'].progress_apply(lambda x: BSV.vectorize(x))

  0%|          | 0/4799 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1615 > 512). Running this sequence through the model will result in indexing errors


In [11]:
#ベースのデータフレーム作る
context_mean_f = np.mean(seq_out_test2[5001], axis=0)
batch_mean_f = np.mean(context_mean_f, axis=0)
seq_test_df2 = pd.DataFrame(batch_mean_f).T

#最終層のデータを768次元に落とす
for i in range(5002, len(test)):
  context_mean = np.mean(seq_out_test2[i], axis=0)
  batch_mean = np.mean(context_mean, axis=0)
  seq_test_df2 = seq_test_df2.append(pd.DataFrame(batch_mean).T)

seq_test_df2.to_csv('input/05_seq_test_df2.csv')

# train, testをそれぞれひとまとめにして出力

In [12]:
train1 = pd.read_csv('input/05_seq_train_df1.csv')
train2 = pd.read_csv('input/05_seq_train_df2.csv')
test1 = pd.read_csv('input/05_seq_test_df1.csv')
test2 = pd.read_csv('input/05_seq_test_df2.csv')

In [13]:
train_df = pd.concat([train1, train2], ignore_index=True)
train_df.drop(['Unnamed: 0'], axis=1, inplace=True)
test_df = pd.concat([test1, test2], ignore_index=True)
test_df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [14]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9791 entries, 0 to 9790
Columns: 768 entries, 0 to 767
dtypes: float64(768)
memory usage: 57.4 MB


In [15]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9791 entries, 0 to 9790
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            9791 non-null   object
 1   goal          9791 non-null   object
 2   country       9791 non-null   object
 3   duration      9791 non-null   int64 
 4   category1     9791 non-null   object
 5   category2     9791 non-null   object
 6   html_content  9791 non-null   object
 7   state         9791 non-null   int64 
dtypes: int64(2), object(6)
memory usage: 612.1+ KB


In [16]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9800 entries, 0 to 9799
Columns: 768 entries, 0 to 767
dtypes: float64(768)
memory usage: 57.4 MB


In [17]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9800 entries, 0 to 9799
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            9800 non-null   object
 1   goal          9800 non-null   object
 2   country       9800 non-null   object
 3   duration      9800 non-null   int64 
 4   category1     9800 non-null   object
 5   category2     9800 non-null   object
 6   html_content  9800 non-null   object
 7   cleaned_text  9800 non-null   object
dtypes: int64(1), object(7)
memory usage: 612.6+ KB


In [18]:
train_df.to_csv('input/05_seq_train_df.csv')
test_df.to_csv('input/05_seq_test_df.csv')