### 1.　特徴量

In [62]:
import pandas as pd
import numpy as np
import os


In [63]:
col_query = "query"
col_query_id = "query_id"
col_product_id = "product_id" 
col_product_title = "product_title"
col_product_locale = "product_locale"
col_esci_label = "esci_label" 
col_small_version = "small_version"
col_split = "split"
col_gain = 'gain'

dataset_path = '/home/sugiyama/ir/esci-data/shopping_queries_dataset'
df_examples = pd.read_parquet(os.path.join(dataset_path, 'shopping_queries_dataset_examples.parquet'))
df_products = pd.read_parquet(os.path.join(dataset_path, 'shopping_queries_dataset_products.parquet'))
df_examples_products = pd.merge(
    df_examples,
    df_products,
    how='left',
    left_on=[col_product_locale, col_product_id],
    right_on=[col_product_locale, col_product_id]
)

train = df_examples_products[df_examples_products[col_split] == "train"] # 学習データ
test = df_examples_products[df_examples_products[col_split] == "test"] # テストデータ

train = train[train[col_product_locale] == 'us'][:7000]
test = test[test[col_product_locale] == 'us']

### 2. 前処理

tfの計算

In [64]:
# from sklearn.feature_extraction.text import CountVectorizer
# import numpy as np

# train_product_title = train['product_title'].tolist()

# wc = CountVectorizer()
# x = wc.fit_transform(train_product_title)
# wcX = np.array(x)

tf-idfの計算

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer

train_product_title = train['product_title'].tolist()

def tf_idf(docs):
    vectorizer = TfidfVectorizer(smooth_idf=False)
    X = vectorizer.fit_transform(docs)

    values = X.toarray()
    feature_names = vectorizer.get_feature_names()
    idf = vectorizer.idf_
    tfidf_df = pd.DataFrame(values, columns = feature_names)

    return values, feature_names, idf, tfidf_df

In [66]:
values, feature_names, idf, tfidf = tf_idf(train_product_title)

In [67]:
tfidf_sum = np.sum(values,axis=1)
tfidf_max = np.max(values, axis=1)
tfidf_std = np.std(values, axis=1)
tfidf_var = np.var(values, axis=1)


In [68]:
train['tfidf_sum'] = tfidf_sum
train['tfidf_max'] = tfidf_max
train['tfidf_std'] = tfidf_std
train['tfidf_var'] = tfidf_var

In [69]:
train_title_set = set()
for text in train_product_title:
    texts = text.split()
    for i in texts:
        train_title_set.add(i)

In [70]:
### テストデータに適用

from sklearn.feature_extraction.text import TfidfVectorizer

test_product_title = test['product_title'].tolist()

def for_text(title_set, test_titles):

    result = []
    for title in test_titles:
        texts = title.split()
        tem = ''
        for i in texts:
            if i in title_set:
                tem += i + ' '
        result.append(tem)
    
    return result

test['for_test_on_train'] = for_text(train_title_set, test_product_title)

In [71]:
test_product_title = test['for_test_on_train'].tolist()

values, feature_names, idf, tfidf = tf_idf(test_product_title)

tfidf_sum = np.sum(values,axis=1)
tfidf_max = np.max(values, axis=1)
tfidf_std = np.std(values, axis=1)
tfidf_var = np.var(values, axis=1)

test['tfidf_sum'] = tfidf_sum
test['tfidf_max'] = tfidf_max
test['tfidf_std'] = tfidf_std
test['tfidf_var'] = tfidf_var

BM25を計測する

In [72]:
# # BM25の計測
# from rank_bm25 import BM25Okapi

# train_product_title = train['product_title'].tolist()

# def caluculate_idf(title):
#     titles = [ i.split() for i in title]
#     bm25 = BM25Okapi(titles)
#     return bm25

# bm25 = caluculate_idf(train_product_title)

In [73]:
# queries = train['query'].str.split()

# # すべてのBM25スコアを一度に計算する
# all_bm25_scores = [bm25.get_scores(query) for query in queries]

# # 各queryに対するBM25スコアのリストから、対応する位置のスコアを取得する
# result = [scores[i] for scores, i in zip(all_bm25_scores, range(len(queries)))]


In [74]:
# with open("/home/sugiyama/ir/data/bm25socre_for_5/bm25.txt", "w") as f:
#     for i in result:
#         f.write(str(i))
#         f.write("\n")

フィールド長を計測する

In [75]:
# フィールド長
def length_of_field(text):
    texts = str(text).split()

    return len(texts)

train['product_title_len'] = train['product_title'].apply(length_of_field)
train['product_description_len'] = train['product_description'].apply(length_of_field)

test['product_title_len'] = test['product_title'].apply(length_of_field)
test['product_description_len'] = test['product_description'].apply(length_of_field)

### 2.　前処理

In [76]:
def normalize(data):
    max_value = data.max()
    min_value = data.min()
    return (data - min_value) /(max_value - min_value)

In [77]:
train['tfidf_sum'] = normalize(train['tfidf_sum'])
train['tfidf_max'] = normalize(train['tfidf_max'])
train['tfidf_std'] = normalize(train['tfidf_std'])
train['tfidf_var'] = normalize(train['tfidf_var'])

In [78]:
test['tfidf_sum'] = normalize(test['tfidf_sum'])
test['tfidf_max'] = normalize(test['tfidf_max'])
test['tfidf_std'] = normalize(test['tfidf_std'])
test['tfidf_var'] = normalize(test['tfidf_var'])

### 3. 線形モデル

In [79]:
gain = {'E': 4, 'S': 2, 'C': 1, 'I': 0}
train['int_label'] = train['esci_label'].apply(lambda x: gain[x])
test['int_label'] = test['esci_label'].apply(lambda x: gain[x])

In [80]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X = train[['tfidf_sum', 'tfidf_max', 'tfidf_std', 'tfidf_var', 'product_title_len', 'product_description_len']]
y = train['int_label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### 4. ランキングモデルの評価

In [81]:
test_x = test[['tfidf_sum', 'tfidf_max', 'tfidf_std', 'tfidf_var', 'product_title_len', 'product_description_len']]
test_y = test['int_label']

y_pred_test = model.predict(test_x)
test['pred_test'] = y_pred_test
test = test.sort_values(by=['query_id'], ascending=False)


In [82]:
def ndcg(y_true, y_pred, k=None, powered=False):
    def dcg(scores, k=None, powered=False):
        if k is None:
            k = scores.shape[0]
        if not powered:
            ret = scores[0]
            for i in range(1, k):
                ret += scores[i] / np.log2(i + 1)
            return ret
        else:
            ret = 0
            for i in range(k):
                ret += (2 ** scores[i] - 1) / np.log2(i + 2)
            return ret
    
    ideal_sorted_scores = np.sort(y_true)[::-1]
    ideal_dcg_score = dcg(ideal_sorted_scores, k=k, powered=powered)
    
    pred_sorted_ind = np.argsort(y_pred)[::-1]
    pred_sorted_scores = y_true[pred_sorted_ind]
    dcg_score = dcg(pred_sorted_scores, k=k, powered=powered)
    
    return dcg_score / ideal_dcg_score

In [83]:
def ndcg1(y_true, y_pred, k=None):
    return ndcg(y_true, y_pred, k=k, powered=False)

In [84]:
def avg_ndcg(data, id_list):
    sum = 0
    for id in id_list:
        df = data[data["query_id"] == id]
        
        sum += ndcg1(np.array(df['int_label'].tolist()), np.array(df['pred_test'].tolist()))

    return sum / len(id_list)

In [85]:
test_query_id_list = set(test['query_id'])

ndcg_result = avg_ndcg(test, test_query_id_list)
ndcg_result

0.897875351120962

### 5. 決定木

In [86]:
gain = {'E': 3, 'S': 2, 'C': 1, 'I': 0}
train['int_label'] = train['esci_label'].apply(lambda x: gain[x])
test['int_label'] = test['esci_label'].apply(lambda x: gain[x])

In [87]:
train_x = train[['query_id','tfidf_sum', 'tfidf_max', 'tfidf_std', 'tfidf_var', 'product_title_len', 'product_description_len']]
train_y = train['int_label']

test_x = test[['query_id','tfidf_sum', 'tfidf_max', 'tfidf_std', 'tfidf_var', 'product_title_len', 'product_description_len']]
test_y = test['int_label']

In [88]:
import xgboost as xgb
import numpy as np

clf = xgb.XGBClassifier(max_depth=3, learning_rate=0.01, n_estimators=100)

clf.fit(train_x, train_y)

predictions = clf.predict(test_x)

In [89]:
gain = {3: 4, 2: 2, 1: 1, 0: 0}
test['pred_test_xgboost'] = predictions
test['int_label'] = test['int_label'].apply(lambda x: gain[x])
test['pred_test_xgboost'] = test['pred_test_xgboost'].apply(lambda x: gain[x])

In [90]:
def avg_ndcg_for_xgboost(data, id_list):
    sum = 0
    for id in id_list:
        df = data[data["query_id"] == id]
        
        sum += ndcg1(np.array(df['int_label'].tolist()), np.array(df['pred_test_xgboost'].tolist()))

    return sum / len(id_list)

In [91]:
test_query_id_list = set(test['query_id'])

ndcg_result = avg_ndcg_for_xgboost(test, test_query_id_list)

In [92]:
ndcg_result

0.8965599400211411

### 6. バリデーションデータと学習率

In [93]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import DMatrix

gain = {'E': 3, 'S': 2, 'C': 1, 'I': 0}
train['int_label'] = train['esci_label'].apply(lambda x: gain[x])
test['int_label'] = test['esci_label'].apply(lambda x: gain[x])

x = train[['tfidf_sum', 'tfidf_max', 'tfidf_std', 'tfidf_var', 'product_title_len', 'product_description_len']]
y = train['int_label']

train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size=0.2, random_state=42)

test_x = test[['tfidf_sum', 'tfidf_max', 'tfidf_std', 'tfidf_var', 'product_title_len', 'product_description_len']]
test_y = test['int_label']


In [94]:
from xgboost import XGBClassifier as XGB

clf = XGB()
clf.fit(train_x, train_y, eval_set =[(valid_x, valid_y)], early_stopping_rounds=1)

y_pred = clf.predict(test_x)
test['pred_test_xgboost'] = y_pred

[0]	validation_0-mlogloss:1.32020
[1]	validation_0-mlogloss:1.27734
[2]	validation_0-mlogloss:1.24745
[3]	validation_0-mlogloss:1.22782
[4]	validation_0-mlogloss:1.21318
[5]	validation_0-mlogloss:1.20077
[6]	validation_0-mlogloss:1.19222
[7]	validation_0-mlogloss:1.18533
[8]	validation_0-mlogloss:1.17971
[9]	validation_0-mlogloss:1.17606
[10]	validation_0-mlogloss:1.17395
[11]	validation_0-mlogloss:1.17260
[12]	validation_0-mlogloss:1.17056
[13]	validation_0-mlogloss:1.17075




In [None]:
gain = {3: 4, 2: 2, 1: 1, 0: 0}
test['int_label'] = test['int_label'].apply(lambda x: gain[x])
test['pred_test_xgboost'] = test['pred_test_xgboost'].apply(lambda x: gain[x])

test_query_id_list = set(test['query_id'])

ndcg_result = avg_ndcg_for_xgboost(test, test_query_id_list)

In [96]:
ndcg_result

0.8962326784631502

### 7. 特徴選択

In [None]:
from matplotlib import pyplot as plt

_, ax = plt.subplots(figsize=(12, 4))
xgb.plot_importance(clf,
                    ax=ax,
                    importance_type='gain',
                    show_values=False)
plt.show()

### 8. BERT

In [None]:
gain = {'E': 3, 'S': 2, 'C': 1, 'I': 0}
train['int_label'] = train['esci_label'].apply(lambda x: gain[x])
test['int_label'] = test['esci_label'].apply(lambda x: gain[x])

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from datasets import Dataset, load_metric
import pandas as pd
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm
2024-04-30 13:44:11.951173: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
checkpoint = "bert-base-multilingual-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)



In [3]:
def tokenize_function(batch):
    text = batch['product_title'] + batch['query']
    tokenized_batch = tokenizer(text, max_length=384,  truncation=True)
    tokenized_batch['labels'] = batch['int_label']
    return tokenized_batch

In [None]:
x = train[['product_title', 'query']]
y = train['int_label']

train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size=0.2, random_state=42)

train_dataset = Dataset.from_pandas(pd.concat([train_x, train_y], axis=1))
dev_dataset = Dataset.from_pandas(pd.concat([valid_x, valid_y], axis=1))
test_dataset = Dataset.from_pandas(test[['product_title', 'query', 'int_label']])

In [None]:
train_tokenized_dataset = train_dataset.map(tokenize_function, remove_columns=train_dataset.column_names)
dev_tokenized_dataset = dev_dataset.map(tokenize_function, remove_columns=dev_dataset.column_names)
test_tokenized_dataset = test_dataset.map(tokenize_function, remove_columns=test_dataset.column_names)

100%|██████████| 5600/5600 [00:01<00:00, 3450.92ex/s]
100%|██████████| 1400/1400 [00:00<00:00, 3691.72ex/s]
100%|██████████| 425762/425762 [01:47<00:00, 3959.06ex/s]


In [None]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=4,
    hidden_dropout_prob=0.2, 
    attention_probs_dropout_prob=0.2,
)
print(type(model).__name__)

loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/sugiyama/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.2,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.2,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_atten

BertForSequenceClassification


In [None]:
training_args = TrainingArguments(
    output_dir='./output/model',
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
    lr_scheduler_type='constant',
    load_best_model_at_end=True,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    remove_unused_columns=False,
    report_to='none'
)
   

PyTorch: setting up devices


In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    train_dataset=train_tokenized_dataset,
    eval_dataset=dev_tokenized_dataset,
    data_collator=data_collator,
    args=training_args,
)

In [None]:
trainer.train()

In [None]:
test_preds = trainer.predict(test_tokenized_dataset)

In [None]:
predictions = np.argmax(np.array(test_preds[0]),axis=1)

In [None]:
def avg_ndcg_for_bert(data, id_list):
    sum = 0
    for id in id_list:
        df = data[data["query_id"] == id]
        
        sum += ndcg1(np.array(df['int_label'].tolist()), np.array(df['pred_test_bert'].tolist()))

    return sum / len(id_list)

In [None]:
gain = {0: 0, 1: 1, 2: 2, 3: 4}
test['pred_test_bert'] = predictions
test['int_label'] = test['int_label'].apply(lambda x: gain[x])
test['pred_test_bert'] = test['pred_test_bert'].apply(lambda x: gain[x])

test_query_id_list = set(test['query_id'])

ndcg_result = avg_ndcg_for_bert(test, test_query_id_list)

### 9. Siamese BERT

In [2]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertModel
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
import pandas as pd
from datasets import Dataset
import os
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score


  from .autonotebook import tqdm as notebook_tqdm
2024-05-01 10:51:16.338875: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [24]:
""" 0. Init """
dataset_path = '/home/sugiyama/ir/esci-data/shopping_queries_dataset'
df_examples = pd.read_parquet(os.path.join(dataset_path, 'shopping_queries_dataset_examples.parquet'))
df_products = pd.read_parquet(os.path.join(dataset_path, 'shopping_queries_dataset_products.parquet'))
df_examples_products = pd.merge(df_examples, df_products, how='left', on=['product_locale', 'product_id'])

train = df_examples_products[df_examples_products['split'] == "train"]
test = df_examples_products[df_examples_products['split'] == "test"]

train = train[train['product_locale'] == "us"]
test = test[test['product_locale'] == "us"]

gain = {'E': 3, 'S': 2, 'C': 1, 'I': 0}
train['label'] = train['esci_label'].map(gain)
test['label'] = test['esci_label'].map(gain)

x = train[['product_title', 'query']]
y = train['label']

train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size=0.2, random_state=42)

train_dataset = Dataset.from_pandas(pd.concat([train_x, train_y], axis=1).reset_index(drop=True))
dev_dataset = Dataset.from_pandas(pd.concat([valid_x, valid_y], axis=1).reset_index(drop=True))
test_dataset = Dataset.from_pandas(test[['product_title', 'query', 'label']].reset_index(drop=True))


In [4]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')



In [27]:
### trainデータセット ###

all_cols = ['label']

for part in ['product_title', 'query']:
    train_dataset = train_dataset.map(
        lambda x: tokenizer(
            x[part], max_length=128, padding='max_length',
            truncation=True
        ), batched=True
    )
    for col in ['input_ids', 'attention_mask']:
        train_dataset = train_dataset.rename_column(
            col, part+'_'+col
        )
        all_cols.append(part+'_'+col)
print(all_cols)

100%|██████████| 1115/1115 [00:55<00:00, 20.20ba/s]
100%|██████████| 1115/1115 [02:27<00:00,  7.55ba/s]


['label', 'product_title_input_ids', 'product_title_attention_mask', 'query_input_ids', 'query_attention_mask']


In [29]:
### devデータセット ###

all_cols = ['label']

for part in ['product_title', 'query']:
    dev_dataset = dev_dataset.map(
        lambda x: tokenizer(
            x[part], max_length=128, padding='max_length',
            truncation=True
        ), batched=True
    )
    for col in ['input_ids', 'attention_mask']:
        dev_dataset = dev_dataset.rename_column(
            col, part+'_'+col
        )
        all_cols.append(part+'_'+col)
print(all_cols)

100%|██████████| 279/279 [00:14<00:00, 19.43ba/s]
100%|██████████| 279/279 [00:36<00:00,  7.56ba/s]

['label', 'product_title_input_ids', 'product_title_attention_mask', 'query_input_ids', 'query_attention_mask']





In [30]:
### testデータセット ###

all_cols = ['label']

for part in ['product_title', 'query']:
    test_dataset = test_dataset.map(
        lambda x: tokenizer(
            x[part], max_length=128, padding='max_length',
            truncation=True
        ), batched=True
    )
    for col in ['input_ids', 'attention_mask']:
        test_dataset = test_dataset.rename_column(
            col, part+'_'+col
        )
        all_cols.append(part+'_'+col)
print(all_cols)

100%|██████████| 426/426 [00:20<00:00, 20.50ba/s]
100%|██████████| 426/426 [00:55<00:00,  7.65ba/s]

['label', 'product_title_input_ids', 'product_title_attention_mask', 'query_input_ids', 'query_attention_mask']





In [31]:
import torch
train_dataset.set_format(type='torch', columns=all_cols)
dev_dataset.set_format(type="torch", columns=all_cols)
# initialize the dataloader
batch_size = 16
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True
)
dev_loader = torch.utils.data.DataLoader(
    dev_dataset, batch_size=batch_size, shuffle=False
)
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False
)

In [32]:
import torch
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BertModel.from_pretrained('xlm-roberta-base').to(device)

You are using a model of type xlm-roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing BertModel: ['roberta.encoder.layer.1.attention.self.value.weight', 'lm_head.dense.bias', 'roberta.encoder.layer.5.attention.self.value.weight', 'roberta.encoder.layer.11.attention.output.dense.bias', 'roberta.encoder.layer.0.attention.self.query.weight', 'roberta.encoder.layer.2.intermediate.dense.weight', 'roberta.encoder.layer.9.intermediate.dense.weight', 'roberta.encoder.layer.10.attention.self.value.weight', 'roberta.encoder.layer.8.attention.self.key.bias', 'roberta.encoder.layer.6.output.dense.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.8.attention.self.query.bias', 'roberta.encoder.layer.6.attention.output.dense.weight', 'roberta.encoder.layer.9.attention.self.key.bias', 'roberta.encoder.layer.8.atten

In [33]:
import torch.nn as n

class Siamese(nn.Module):
    def __init__(self, bert, embedding_dim=768, num_classes=4):
        super(Siamese, self).__init__()
        self.bert=bert
        self.linear=nn.Linear(embedding_dim*3, num_classes)
    
    # define mean pooling function
    def mean_pool(self, token_embeds, attention_mask):
        # reshape attention_mask to cover 768-dimension embeddings
        in_mask = attention_mask.unsqueeze(-1).expand(
            token_embeds.size()
        ).float()
        # perform mean-pooling but exclude padding tokens (specified by in_mask)
        pool = torch.sum(token_embeds * in_mask, 1) / torch.clamp(
            in_mask.sum(1), min=1e-9
        )
        return pool
    
    def forward(self, batch_data):
        u=self.bert(batch_data['product_title_input_ids'].to(device), batch_data['product_title_attention_mask'].to(device), output_hidden_states=True)
        u=self.mean_pool(u.last_hidden_state, batch_data['product_title_attention_mask'].to(device))
        v=self.bert(batch_data['query_input_ids'].to(device), batch_data['query_attention_mask'].to(device), output_hidden_states=True)
        v=self.mean_pool(v.last_hidden_state, batch_data['query_attention_mask'].to(device))
        x=torch.cat([u, v, torch.abs(u-v)], dim=-1)
        y=self.linear(x)
        return y


In [34]:
loss_fn=torch.nn.CrossEntropyLoss()
model= Siamese(model).to(device)
optim = torch.optim.Adam(model.parameters(), lr=2e-5)


In [35]:
from transformers.optimization import get_linear_schedule_with_warmup
total_steps = int(len(train_dataset) / batch_size)
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(optim, num_warmup_steps=warmup_steps,num_training_steps=total_steps - warmup_steps)


In [40]:
import numpy as np
np.object = object

In [42]:
num_epochs=1
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
for i in range(num_epochs):
    for batch in tqdm(train_loader):
        optim.zero_grad()
        y=model(batch)
        loss=loss_fn(y, batch['label'].to(device))
        loss.backward()
        optim.step()
        scheduler.step()
    correct=0
    total=0
    epoch_loss=0
    with torch.no_grad():
        for batch in tqdm(train_loader):
            y=model(batch)
            loss=loss_fn(y, batch['label'].to(device))
            epoch_loss+=loss.item()
            preds=y.argmax(-1)
            correct+=(preds==batch['label'].to(device)).sum()
            total+=preds.shape[0]
        print(f"Epoch: {i} Train accuracy: {correct/total*100} Loss: {epoch_loss/total}")
    
    correct=0
    total=0
    epoch_loss=0
    with torch.no_grad():
        for batch in tqdm(dev_loader):
            y=model(batch)
            loss=loss_fn(y, batch['label'].to(device))
            epoch_loss+=loss.item()
            preds=y.argmax(-1)
            correct+=(preds==batch['label'].to(device)).sum()
            total+=preds.shape[0]
        print(f"Epoch: {i} Val accuracy: {correct/total*100} Loss: {epoch_loss/total}")


100%|██████████| 69654/69654 [4:49:23<00:00,  4.01it/s]  
100%|██████████| 69654/69654 [1:33:28<00:00, 12.42it/s]


Epoch: 0 Train accuracy: 74.59805297851562 Loss: 0.04182487461064608


100%|██████████| 17414/17414 [23:21<00:00, 12.42it/s]

Epoch: 0 Val accuracy: 73.38961029052734 Loss: 0.043756257107274335



