In [1]:
import os
# Work from inside the module folder so that `config`, the trainer scripts and the
# relative data/model paths used below resolve.
# Guarded: re-running this cell in the same kernel would otherwise raise
# FileNotFoundError, because the relative './sbert_module/' no longer exists once the
# working directory is already inside it.
if os.path.basename(os.getcwd()) != 'sbert_module':
    os.chdir('./sbert_module/')

---

In [2]:
from config import CFG

import pandas as pd

In [3]:
# File names used throughout the notebook (all are read/written under CFG.DATA_PATH)
sample_stem = "labeled_sample_03"

origin_file = "labeled.csv"                    # full labeled dataset
origin_sample_file = f"{sample_stem}.csv"      # the 30% sample written further down
augmented_file = f"{sample_stem}_with_aug.csv" # presumably the sample plus augmented rows (produced outside this notebook)

In [4]:
# Load the full labeled dataset that the sample will be drawn from
origin_path = f"./{CFG.DATA_PATH}/{origin_file}"
ori = pd.read_csv(origin_path)

In [10]:
# Keep the review text plus every integer-typed column (presumably the label columns).
# select_dtypes(include='integer') is used instead of `ori.dtypes == int` because the
# builtin `int` maps to a platform-dependent NumPy dtype, so the equality test can
# silently select nothing on platforms where it is not int64.
ori = pd.concat([ori['review'], ori.select_dtypes(include='integer')], axis=1)

In [13]:
# Draw the 30% of rows that will be augmented.
# A fixed random_state makes the split reproducible, so a Restart & Run All writes the
# same rows to disk every time instead of a different random sample on each run.
aug_sample = ori.sample(frac=0.3, random_state=42)

In [18]:
# Rows that were NOT drawn into aug_sample, in their original order.
# Built from ori's own index rather than range(len(ori)), so it stays correct even if
# the index is not a plain 0..n-1 range.
not_sample_idx = ori.index[~ori.index.isin(aug_sample.index)]
not_sample = ori.loc[not_sample_idx]

In [25]:
# Persist both partitions under CFG.DATA_PATH (row index is not written)
sample_path = f'./{CFG.DATA_PATH}/{origin_sample_file}'
remainder_path = f'./{CFG.DATA_PATH}/labeled_sample_07.csv'

aug_sample.to_csv(sample_path, index=False)
not_sample.to_csv(remainder_path, index=False)

In [6]:
# Read back the sampled file and its augmented counterpart from CFG.DATA_PATH
data_dir = f'./{CFG.DATA_PATH}'
origin = pd.read_csv(f'{data_dir}/{origin_sample_file}')
augmented = pd.read_csv(f'{data_dir}/{augmented_file}')

In [None]:
# change config file to compare two files
def change_cfg(config_file, attribute, value):
    """Rewrite the assignment of ``attribute`` in ``config_file`` in place.

    Every line of the form ``<indent>attribute = ...`` is replaced by
    ``<indent>attribute = '<value>'``. The value is always written as a
    single-quoted string (so passing ``'None'`` writes the string ``'None'``,
    not the ``None`` object).

    Only real assignments to exactly ``attribute`` are touched: comments or
    other lines that merely mention the name, and longer names that contain it
    as a substring, are left as they are. The line's original indentation is
    preserved.

    Args:
        config_file: path of the Python config file to edit.
        attribute: name of the attribute whose assignment is rewritten.
        value: new value; interpolated into a single-quoted string literal.
    """
    import re  # local import so this cell does not depend on another cell's imports

    # <indent>ATTRIBUTE<spaces>=  (but not '==')
    assignment = re.compile(rf"^(\s*){re.escape(attribute)}\s*=(?!=)")

    with open(config_file, 'r') as f:
        cfg = f.readlines()

    # Build the new content fully before reopening the file for writing, so the
    # file is only truncated once the replacement text is ready.
    updated = []
    for line in cfg:
        found = assignment.match(line)
        if found:
            updated.append(f"{found.group(1)}{attribute} = '{value}'\n")
        else:
            updated.append(line)

    with open(config_file, 'w') as f:
        f.writelines(updated)

# Original Sample

In [None]:
# Point training at the original sample and request the pretrained SBERT
# (the folder value is written as the string 'None').
for attribute, value in (('MODEL_CSV_FILE', origin_sample_file),
                         ('SBERT_MODEL_FOLDER', 'None')):
    change_cfg('./config.py', attribute, value)

## SBERT training

In [None]:
# Train sbert by running sbert_trainer.py as a shell command (IPython `!` escape).
# NOTE(review): presumably the script reads the settings just written to config.py — confirm.
!python sbert_trainer.py

In [None]:
# Edit config file to load the sbert model parameters just trained with the original file.
# os.listdir() returns entries in arbitrary order, so the run folders are sorted by
# name before taking the last one (the folder name shown in the trainer log,
# e.g. 23-02-14_0929, looks timestamp-like, so last-by-name is assumed to be the newest).
# Hidden entries such as .ipynb_checkpoints are ignored.
sbert_runs = sorted(d for d in os.listdir('sbert_model') if not d.startswith('.'))
change_cfg('./config.py', 'SBERT_MODEL_FOLDER', sbert_runs[-1])

## Classifier training

In [None]:
# Train classifier by running trainer.py as a shell command (IPython `!` escape).
# The captured log below shows it loading the SBERT model before training.
!python trainer.py

2023-02-14 08:58:58.120770: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-14 08:58:59.076177: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia
2023-02-14 08:58:59.076300: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia
[INFO] (2023.02.14-08:59:02) Load SBERT Model...
[INFO] (2023.02.14-08:59:04) Done! (2.115s)
[

In [None]:
# Edit config file, classifier model parameter.
# os.listdir() order is arbitrary, so sort the run folders by name and take the last
# one (assumed to be the newest run, given timestamp-like folder names); hidden
# entries are ignored.
classifier_runs = sorted(d for d in os.listdir('model') if not d.startswith('.'))
change_cfg('./config.py', 'CLASSIFIER_MODEL_DATE', classifier_runs[-1])

In [None]:
from config import CFG
from data_setup import get_data
from data_setup import ClassifierDataset
from model_builder import SimpleMLC
from model_builder import load_sbert
from engine import test_step

import torch
from torch.utils.data import DataLoader

# set device
# torch.cuda.is_available must be *called*: the bare function object is always truthy,
# which would select 'cuda' even on a CPU-only machine.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# load the most recent sbert model (in this section: the one trained on the original sample).
# os.listdir() needs the path itself (not a set containing it) and returns entries in
# arbitrary order, so the run folders are sorted by name before taking the last one.
sbert_runs = sorted(d for d in os.listdir(CFG.SBERT_MODEL_PATH) if not d.startswith('.'))
s_bert = load_sbert(f'{CFG.SBERT_MODEL_PATH}', sbert_runs[-1])

# Get test data
df = get_data(CFG.DATA_PATH, CFG.EVALUATE_CSV_FILE)

# Vectorize data using sbert
X_train_vectorized = s_bert.encode(df['review'].tolist(), device=device)

# Create Dataset and DataLoader for classifier input.
# The last 15 columns are used as targets, matching n_classes=15 below.
# shuffle=False: this is an evaluation pass, so a fixed order keeps the two
# models' results comparable run to run.
dataset = ClassifierDataset(X_train_vectorized, df.iloc[:, -15:].to_numpy())
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

# Call classifier: load the weights of the most recent classifier run.
# The run folder is looked up in the same directory the checkpoint path is built from.
classifier_runs = sorted(d for d in os.listdir(CFG.CLASSIFIER_MODEL_PATH) if not d.startswith('.'))
classifier = SimpleMLC(n_classes=15).to(device)
classifier.load_state_dict(
    torch.load(f"{CFG.CLASSIFIER_MODEL_PATH}/{classifier_runs[-1]}/{CFG.CLASSIFIER_MODEL_FILE}.pth",
               map_location=torch.device(device)))

# evaluate test datas
loss, results = test_step(classifier,
                          dataloader,
                          torch.nn.BCEWithLogitsLoss(),
                          device=device)

In [None]:
# Average each metric returned by test_step for the model trained on the original sample
# (presumably one row per class — confirm in engine.test_step).
results.mean()

---

# Augmented Sample

In [None]:
# Switch the training data to the augmented file and request the pretrained SBERT again
# (the folder value is written as the string 'None').
augmented_settings = {
    'MODEL_CSV_FILE': augmented_file,
    'SBERT_MODEL_FOLDER': 'None',
}
for attribute, value in augmented_settings.items():
    change_cfg('./config.py', attribute, value)

## SBERT training

In [None]:
# Train sbert with augmented data by running sbert_trainer.py as a shell command (IPython `!` escape).
# The captured progress output below shows 5 epochs of 552 iterations each.
!python sbert_trainer.py

Epoch:   0% 0/5 [00:00<?, ?it/s]
Iteration:   0% 0/552 [00:00<?, ?it/s][A
Iteration:   0% 1/552 [00:01<11:09,  1.21s/it][A
Iteration:   0% 2/552 [00:01<08:17,  1.11it/s][A
Iteration:   1% 3/552 [00:02<08:04,  1.13it/s][A
Iteration:   1% 4/552 [00:03<07:55,  1.15it/s][A
Iteration:   1% 5/552 [00:04<07:38,  1.19it/s][A
Iteration:   1% 6/552 [00:05<07:38,  1.19it/s][A
Iteration:   1% 7/552 [00:05<07:07,  1.27it/s][A
Iteration:   1% 8/552 [00:06<07:03,  1.29it/s][A
Iteration:   2% 9/552 [00:07<06:59,  1.29it/s][A
Iteration:   2% 10/552 [00:07<06:34,  1.37it/s][A
Iteration:   2% 11/552 [00:08<06:40,  1.35it/s][A
Iteration:   2% 12/552 [00:09<06:39,  1.35it/s][A
Iteration:   2% 13/552 [00:09<06:31,  1.38it/s][A
Iteration:   3% 14/552 [00:10<06:22,  1.41it/s][A
Iteration:   3% 15/552 [00:11<06:28,  1.38it/s][A
Iteration:   3% 16/552 [00:11<06:25,  1.39it/s][A
Iteration:   3% 17/552 [00:12<06:30,  1.37it/s][A
Iteration:   3% 18/552 [00:13<06:35,  1.35it/s][A
Iteration:   3% 

In [None]:
# Edit config file to load the sbert model parameters just trained with the augmented file.
# os.listdir() returns entries in arbitrary order, so the run folders are sorted by
# name before taking the last one (assumed to be the newest run, given timestamp-like
# folder names such as 23-02-14_0929); hidden entries are ignored.
sbert_runs = sorted(d for d in os.listdir('sbert_model') if not d.startswith('.'))
change_cfg('./config.py', 'SBERT_MODEL_FOLDER', sbert_runs[-1])

## Classifier training

In [None]:
# Train classifier by running trainer.py as a shell command (IPython `!` escape).
# The captured log below shows it loading the custom-trained SBERT model first.
!python trainer.py

[INFO] (2023.02.14-10:03:58) Load SBERT Model... 23-02-14_0929
[INFO] Load custom trained sbert model.
[INFO] (2023.02.14-10:04:00) Done! (1.764s)
[INFO] (2023.02.14-10:04:00) Preparing Dataset...
[INFO] Vectorize Data...
[INFO] (2023.02.14-10:04:08) Done! (7.525s)
[INFO] (2023.02.14-10:04:08) Training...
100% 42/42 [00:00<00:00, 357.79it/s]
100% 5/5 [00:00<00:00, 1015.91it/s]

Epochs: 1 | Loss: 3.9956 | Valid Loss: 2.3528
Train Results:
+----------+------------+-----------+-----------+
|          |   F1 score |   ROC AUC |    PR AUC |
|----------+------------+-----------+-----------|
| 가성비   |  0.255556  |  0.692227 | 0.260702  |
| 귀여운   |  0.311404  |  0.641317 | 0.311342  |
| 넓은     |  0.030303  |  0.59091  | 0.0485963 |
| 단체     |  0.0239521 |  0.506502 | 0.059863  |
| 만족     |  0.015748  |  0.590969 | 0.127546  |
| 모던     |  0.117647  |  0.539237 | 0.264096  |
| 분위기   |  0.123324  |  0.569879 | 0.358362  |
| 비주얼   |  0.151316  |  0.578525 | 0.122629  |
| 아늑     |  0.0729167 |  0.5

In [None]:
# Edit config file, classifier model parameter.
# os.listdir() order is arbitrary, so sort the run folders by name and take the last
# one (assumed to be the newest run, given timestamp-like folder names); hidden
# entries are ignored.
classifier_runs = sorted(d for d in os.listdir('model') if not d.startswith('.'))
change_cfg('./config.py', 'CLASSIFIER_MODEL_DATE', classifier_runs[-1])

In [None]:
# load sbert learned augmented data: the most recent run folder.
# os.listdir() returns entries in arbitrary order, so sort by name before taking the last.
sbert_runs = sorted(d for d in os.listdir(CFG.SBERT_MODEL_PATH) if not d.startswith('.'))
s_bert = load_sbert(f'{CFG.SBERT_MODEL_PATH}', sbert_runs[-1])

# Vectorize data using sbert (`df` and `device` come from the earlier evaluation cell)
X_train_vectorized = s_bert.encode(df['review'].tolist(), device=device)

# Create Dataset and DataLoader for classifier input.
# The last 15 columns are used as targets, matching n_classes=15 below.
# shuffle=False: this is an evaluation pass, so a fixed order keeps the two
# models' results comparable run to run.
dataset = ClassifierDataset(X_train_vectorized, df.iloc[:, -15:].to_numpy())
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

# Call classifier: load the weights of the most recent classifier run.
# The run folder is looked up in the same directory the checkpoint path is built from.
classifier_runs = sorted(d for d in os.listdir(CFG.CLASSIFIER_MODEL_PATH) if not d.startswith('.'))
classifier = SimpleMLC(n_classes=15).to(device)
classifier.load_state_dict(
    torch.load(f"{CFG.CLASSIFIER_MODEL_PATH}/{classifier_runs[-1]}/{CFG.CLASSIFIER_MODEL_FILE}.pth",
               map_location=torch.device(device)))

# evaluate test datas
aug_loss, aug_results = test_step(classifier,
                                  dataloader,
                                  torch.nn.BCEWithLogitsLoss(),
                                  device=device)

[INFO] Load custom trained sbert model.


100%|██████████| 57/57 [00:00<00:00, 1015.39it/s]


In [None]:
# Average each metric (F1 score, ROC AUC, PR AUC) returned by test_step for the model
# trained on the augmented sample; compare against results.mean() from the original run.
aug_results.mean()

F1 score    0.559121
ROC AUC     0.886213
PR AUC      0.569219
dtype: float64

# Undo config

In [None]:
# Restore config.py: point both model settings back at the first run folder by name
# and the training data back at the full labeled file.
# os.listdir() returns entries in arbitrary order, so the folders are sorted before
# picking the first one; hidden entries (e.g. .ipynb_checkpoints) are skipped so they
# cannot be chosen by accident.
sbert_runs = sorted(d for d in os.listdir('sbert_model') if not d.startswith('.'))
classifier_runs = sorted(d for d in os.listdir('model') if not d.startswith('.'))

change_cfg('./config.py', 'SBERT_MODEL_FOLDER', sbert_runs[0])
change_cfg('./config.py', 'CLASSIFIER_MODEL_DATE', classifier_runs[0])
change_cfg('./config.py', 'MODEL_CSV_FILE', origin_file)

# Remove trained model

In [3]:
import shutil

# Delete the two run folders this notebook created (original-sample run and
# augmented-sample run) from both the SBERT and the classifier model directories.
# os.listdir() returns entries in arbitrary order, so the folders are sorted by name
# and the last two are removed (assumed newest, given timestamp-like folder names).
# The first folder by name is never deleted: it is the one the "Undo config" step
# points config.py back at. Hidden entries are ignored.
for model_root in (CFG.SBERT_MODEL_PATH, CFG.CLASSIFIER_MODEL_PATH):
    run_dirs = sorted(d for d in os.listdir(model_root) if not d.startswith('.'))
    for run_dir in run_dirs[1:][-2:]:
        shutil.rmtree(f"{model_root}/{run_dir}")