In [63]:
import torch
# Report whether PyTorch can see a CUDA device; train_model falls back to CPU when this is False.
print(torch.cuda.is_available())

True


In [64]:
# %pip install pandas  # uncomment if pandas is missing from the kernel's environment

Importing Corpus

In [65]:
import numpy as np
import pandas as pd
from transformers import BertJapaneseTokenizer, BertForSequenceClassification

In [66]:
# Load corpus_1.csv .. corpus_<corpus_n>.csv, drop rows with any missing value,
# and give every frame the same three column names.
corpus_n = 3
corpus = []
for file_no in range(1, corpus_n + 1):
    frame = pd.read_csv(f'./corpus_{file_no}.csv', delimiter=',', encoding='utf-8').dropna()
    frame.columns = ['question', 'item_in_question', 'question_type']
    corpus.append(frame)

Tokenizer

In [67]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from transformers import AdamW, get_linear_schedule_with_warmup
import torch.nn.functional as F

In [68]:
class QuestionsDataset(Dataset):
    """Torch dataset of questions labelled with an item and a question type.

    Every CSV in ``file_paths`` is read as UTF-8, rows containing any missing
    value are dropped, and the remaining rows are stacked into ``self.data``
    with a fresh 0..n-1 index.

    Args:
        file_paths: Iterable of CSV paths. ``__getitem__`` reads the columns
            listed in ``COLUMNS`` from the combined frame.
        tokenizer: Object exposing ``encode_plus``; it is called with
            ``return_tensors='pt'`` and its 'input_ids' / 'attention_mask'
            results are flattened.
        max_len: Passed to the tokenizer as ``max_length`` together with
            ``padding='max_length'`` and ``truncation=True``.
    """

    # Columns the dataset reads; also the schema of the frame when no file is given.
    COLUMNS = ['question', 'item_in_question', 'question_type']

    def __init__(self, file_paths, tokenizer, max_len):
        frames = [
            pd.read_csv(path, delimiter=',', encoding='utf-8').dropna()
            for path in file_paths
        ]
        # Concatenate once instead of growing the frame inside the loop, which
        # re-copied all previously loaded rows for every additional file.
        # The empty template keeps the column layout (and the empty-input
        # result) the same as the incremental version.
        template = pd.DataFrame(columns=self.COLUMNS)
        self.data = pd.concat([template, *frames], ignore_index=True, sort=False)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows kept after dropping incomplete ones."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(inputs, targets)`` for the row labelled ``index``.

        ``inputs`` holds the flattened 'input_ids' and 'attention_mask'
        tensors; ``targets`` holds the row's 'item_in_question' and
        'question_type' values exactly as stored in ``self.data``.
        """
        # .at is the scalar accessor; the index is 0..n-1 because of ignore_index=True.
        question = self.data.at[index, 'question']
        item_in_question = self.data.at[index, 'item_in_question']
        question_type = self.data.at[index, 'question_type']

        encoding = self.tokenizer.encode_plus(
            question,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        inputs = {
            # return_tensors='pt' adds a leading batch axis; flatten() removes it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

        targets = {
            'item_in_question': item_in_question,
            'question_type': question_type
        }

        return inputs, targets

In [69]:
# Initialize the tokenizer from the same 'cl-tohoku/bert-base-japanese' checkpoint
# that the model below is built on.
tokenizer = BertJapaneseTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")

In [70]:
# Initialize the dataset from the raw corpora. Labels are still the values stored in the
# CSVs at this point; this instance is used to size the classifier heads and to build the
# label-to-index maps, and the name `dataset` is rebound to the index-encoded data before training.
# NOTE(review): corpus_3.csv is loaded into `corpus` above but is not included here — confirm
# that leaving it out is intentional.
dataset = QuestionsDataset(['./corpus_1.csv','./corpus_2.csv'],tokenizer,128)

In [71]:
# Keyword Extraction
import torch
import torch.nn as nn
from transformers import BertModel

class BertForKeywordExtraction(nn.Module):
    """Pretrained BERT encoder feeding two independent linear classifiers.

    Both classifiers read the same dropout-regularised encoder feature: one
    scores the item named in the question, the other scores the question type.
    """

    def __init__(self, model_name, num_labels_item, num_labels_type):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(p=0.4)
        hidden_size = self.bert.config.hidden_size
        self.classifier_item = nn.Linear(hidden_size, num_labels_item)
        self.classifier_type = nn.Linear(hidden_size, num_labels_type)

    def forward(self, input_ids, attention_mask):
        """Return ``(item_logits, type_logits)`` for the encoded inputs."""
        encoder_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Element 1 of the encoder output is the feature shared by both heads.
        features = self.dropout(encoder_outputs[1])
        return self.classifier_item(features), self.classifier_type(features)

# Size each classification head by the number of distinct labels in the raw
# data, then build the two-headed model on the pretrained Japanese BERT.
data = dataset.data
num_labels_item, num_labels_type = (
    len(set(data[column])) for column in ('item_in_question', 'question_type')
)
model = BertForKeywordExtraction(
    'cl-tohoku/bert-base-japanese',
    num_labels_item,
    num_labels_type,
)

In [72]:
from torch.utils.data import DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
import torch.nn.functional as F

def train_model(model, dataset, tokenizer, num_epochs=3, batch_size=16, learning_rate=3e-5):
    """Fine-tune ``model`` on ``dataset`` using the sum of both heads' cross-entropy losses.

    A new optimizer and a new linear-decay schedule are created on every call,
    so calling this repeatedly restarts the learning rate at ``learning_rate``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns ``(item_logits, type_logits)``. It is moved to CUDA when
            available, otherwise to CPU.
        dataset: Dataset yielding ``(inputs, targets)`` pairs, where ``inputs``
            has 'input_ids' and 'attention_mask' and ``targets`` has
            'item_in_question' and 'question_type'. The targets must collate
            into tensors of class indices.
        tokenizer: Not used by the loop; kept so existing calls keep working.
        num_epochs: Number of passes over ``dataset``.
        batch_size: Batch size of the shuffling DataLoader.
        learning_rate: Initial learning rate; decays linearly to zero over
            all steps with no warmup.

    Returns:
        None. The mean loss of each epoch is printed.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    # transformers.AdamW is deprecated in favour of torch.optim.AdamW. eps and
    # weight_decay are pinned to the deprecated class's defaults (1e-6 / 0.0)
    # because torch's own defaults differ (1e-8 / 0.01).
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=learning_rate,
        eps=1e-6,
        weight_decay=0.0,
    )
    total_steps = len(train_dataloader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            optimizer.zero_grad()
            # batch[0] is the collated inputs dict, batch[1] the collated targets dict.
            input_ids = batch[0]['input_ids'].to(device)
            attention_mask = batch[0]['attention_mask'].to(device)
            item_in_question = batch[1]['item_in_question'].to(device)
            question_type = batch[1]['question_type'].to(device)

            item_logits, type_logits = model(input_ids, attention_mask)

            # Both heads contribute with equal weight.
            loss_item = loss_fn(item_logits, item_in_question)
            loss_type = loss_fn(type_logits, question_type)
            loss = loss_item + loss_type

            loss.backward()
            optimizer.step()
            # The schedule is stepped once per batch, matching total_steps above.
            scheduler.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_train_loss:.4f}")

# Prepare the dataset and labels
# Each distinct label gets a class index in order of first appearance.
item_to_idx = {item: idx for idx, item in enumerate(data['item_in_question'].unique())}
type_to_idx = {type_: idx for idx, type_ in enumerate(data['question_type'].unique())}

# Encode into a new frame instead of overwriting `data` in place. Overwriting made
# this cell non-idempotent: a second run would build the maps from the already
# encoded integers and lose the label strings needed to decode predictions.
data_encoded = data.assign(
    item_in_question=data['item_in_question'].map(item_to_idx),
    question_type=data['question_type'].map(type_to_idx),
)

# Save the processed dataset
data_encoded.to_csv('questions_processed.csv', index=False)

# Initialize the dataset
dataset = QuestionsDataset(['questions_processed.csv'], tokenizer, max_len=128)

# Train the model
train_model(model, dataset, tokenizer)




Epoch 1/3, Loss: 3.7039
Epoch 2/3, Loss: 2.6531
Epoch 3/3, Loss: 2.1228


In [73]:
# Show the item label -> class index mapping (indices follow the order produced by .unique()).
print(item_to_idx)

{'ユニバーサルグリル': 0, 'ガラリ': 1, '吸込口': 2, 'シーリングディフューザー': 3, '吹出口': 4, 'スリットグリル': 5, 'ノズル': 6, 'ラインディフューザー': 7, 'エアフィルター': 8, '防火ダンパー': 9, '一般ダンパー': 10, '排煙口': 11, 'フィルターケーシング': 12, 'ダンパー': 13}


In [74]:
# Reverse lookups (class index -> label) used to decode predicted indices.
idx_to_item = dict(zip(item_to_idx.values(), item_to_idx.keys()))
idx_to_type = dict(zip(type_to_idx.values(), type_to_idx.keys()))

def predict_keywords(question, model, tokenizer, max_len=128):
    """Predict the item label and the question-type label for one question.

    Args:
        question: Text passed to ``tokenizer.encode_plus``.
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns ``(item_logits, type_logits)``. It is switched to eval mode.
        tokenizer: Object exposing ``encode_plus``.
        max_len: ``max_length`` used for padding/truncation. The default of 128
            is the value that used to be hard-coded here.

    Returns:
        ``(item_label, type_label)``, decoded through the module-level
        ``idx_to_item`` and ``idx_to_type`` dictionaries.
    """
    model.eval()
    # Follow the model's own device. Choosing CUDA whenever it is available
    # breaks for a model that still lives on the CPU (e.g. freshly loaded).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    encoding = tokenizer.encode_plus(
        question,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        item_logits, type_logits = model(input_ids, attention_mask)

    # A single question is encoded, so .item() extracts the one predicted index.
    item_idx = torch.argmax(item_logits, dim=1).item()
    type_idx = torch.argmax(type_logits, dim=1).item()

    return idx_to_item[item_idx], idx_to_type[type_idx]

# Example prediction: decode both heads for one price question about
# ラインディフューザー, which is one of the labels in item_to_idx.
question = "ラインディフューザーの料金はいくらですか"
item, qtype = predict_keywords(question, model, tokenizer)
print(f"Item in Question: {item}, Question Type: {qtype}")


Item in Question: ラインディフューザー, Question Type: いくら


In [75]:
# Second round of fine-tuning on the same model (train_model defaults: 3 epochs, batch size 16).
# Each call creates a new optimizer and schedule, so the learning rate restarts at 3e-5.
train_model(model, dataset, tokenizer)

Epoch 1/3, Loss: 1.8160
Epoch 2/3, Loss: 1.2692
Epoch 3/3, Loss: 1.0489


In [76]:
# Switch the model to evaluation mode; as the cell's last expression, the returned module is
# displayed. train_model calls model.train() again at the start of every epoch.
model.eval()

BertForKeywordExtraction(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [77]:
# Third round of fine-tuning with the same defaults; the optimizer and learning-rate
# schedule are rebuilt from scratch inside train_model.
train_model(model, dataset, tokenizer)

Epoch 1/3, Loss: 0.9055
Epoch 2/3, Loss: 0.6370
Epoch 3/3, Loss: 0.5256


In [78]:
# Persist the fine-tuned model. Passing the module itself (rather than a state_dict) pickles
# the whole object, so loading it requires the BertForKeywordExtraction class to be available.
# NOTE(review): newer PyTorch releases default torch.load to weights_only=True, which rejects
# such a file unless weights_only=False is passed; consider saving model.state_dict() instead.
torch.save(model, "model-3.pt")

In [79]:
# Bundle the four label lookup tables so predictions can be decoded outside this notebook.
# np.save stores a dict as a pickled 0-d object array, so it has to be read back with
# np.load('converter.npy', allow_pickle=True).item().
dictionary = {'item_to_idx':item_to_idx,'type_to_idx':type_to_idx,'idx_to_item':idx_to_item,'idx_to_type':idx_to_type}
np.save('converter.npy', dictionary)

In [80]:
# Smoke-test call. Despite the variable names, predict_keywords returns the decoded label
# strings (looked up in idx_to_item / idx_to_type), not indices. Neither variable is read
# again in the cells visible here.
item_id, type_id = predict_keywords("グリルの長さを教えてください", model, tokenizer)

In [81]:
# Repeat the earlier price question now that the model has been through two more train_model calls.
question = "ラインディフューザーの料金はいくらですか"
item, qtype = predict_keywords(question, model, tokenizer)
print(f"Item in Question: {item}, Question Type: {qtype}")

Item in Question: ラインディフューザー, Question Type: いくら


In [82]:
# Bare "グリル" is not a key of item_to_idx as printed above (only ユニバーサルグリル and
# スリットグリル contain it), so the item head can only answer with one of the known labels.
question = "グリルの長さを教えてください"
item, qtype = predict_keywords(question, model, tokenizer)
print(f"Item in Question: {item}, Question Type: {qtype}")

Item in Question: ユニバーサルグリル, Question Type: 長さ


In [83]:
# Delivery (納品) question phrased as "I would like to know about ...", without an explicit question word.
question = "スリットグリルの納品について知りたいです。"
item, qtype = predict_keywords(question, model, tokenizer)
print(f"Item in Question: {item}, Question Type: {qtype}")

Item in Question: スリットグリル, Question Type: いつ


In [84]:
# Length question phrased with どのぐらい ("about how much").
question = "スリットグリルの長さはどのぐらいでしょうか。"
item, qtype = predict_keywords(question, model, tokenizer)
print(f"Item in Question: {item}, Question Type: {qtype}")

Item in Question: スリットグリル, Question Type: 長さ


In [85]:
# Price question using 価格 ("price") instead of the 料金 wording of the first example.
question = "スリットグリルの価格を教えてください"
item, qtype = predict_keywords(question, model, tokenizer)
print(f"Item in Question: {item}, Question Type: {qtype}")

Item in Question: スリットグリル, Question Type: いくら


In [86]:
# Another bare "グリル" query, this time asking for the length in centimetres.
# NOTE(review): the recorded output maps it to ノズル, whereas the earlier bare-"グリル" probe
# gave ユニバーサルグリル — item predictions for names outside item_to_idx are not stable.
question = "グリルは何センチ長いですか"
item, qtype = predict_keywords(question, model, tokenizer)
print(f"Item in Question: {item}, Question Type: {qtype}")

Item in Question: ノズル, Question Type: 長さ


In [87]:
# Two-sentence input asking for the cost (費用) of 一般ダンパー.
question = "一般ダンパーの費用はどのぐらいですか。教えてください。"
item, qtype = predict_keywords(question, model, tokenizer)
print(f"Item in Question: {item}, Question Type: {qtype}")

Item in Question: 一般ダンパー, Question Type: いくら


In [88]:
# Weight question expressed only through the unit グラム ("grams").
question = "一般ダンパーは何グラムですか"
item, qtype = predict_keywords(question, model, tokenizer)
print(f"Item in Question: {item}, Question Type: {qtype}")

Item in Question: 一般ダンパー, Question Type: 重さ
