# Quick view

In [1]:
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the labelled reviews and keep the review text plus the integer label columns.
df = pd.read_csv('./data/labeled_sample_03.csv')
# select_dtypes('integer') matches every integer width. The previous `df.dtypes == int`
# compared against the platform's default integer type, so it could silently select
# no label column on platforms where that default is not int64.
df = pd.concat([df['review'], df.select_dtypes(include='integer')], axis=1)

In [3]:
# Preview the first rows of the review/label frame.
df.head()

Unnamed: 0,review,가성비,귀여운,넓은,단체,만족,모던,분위기,비주얼,아늑,위생,응대,이색음식,이색테마,클래식,혼자
0,연어국수 너무 맛있었어요 이 가격에 초밥무한이라니 행복해요 신선한재료로 맛나게 먹었어요,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,즈인짜 너무 맛있네요 카레 리필도 해주시고 직분들 다 친절하셔서 기분 좋게 먹고 왔...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,일하시는 분들이 정말 친절하셔서 좋았어요 음식도 맛있었습니당,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,맛있어요 저는 치킨이 안느끼하고맛있었어요 밥이랑 카레 리필도 되서 넘 맛있게 먹었습...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,직원들이 너무 친절하고 맛있어용 특히 카레가.,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [4]:
# Percentage of rows carrying each label (every column after 'review'), sorted ascending.
df.iloc[:, 1:].sum().sort_values() / len(df) * 100

단체       1.591187
비주얼      3.059976
귀여운      3.427173
혼자       3.549572
넓은       3.916769
클래식      4.039168
모던       6.119951
이색테마     6.487148
이색음식     9.179927
위생       9.547124
만족       9.669523
아늑      10.648715
가성비     12.239902
분위기     25.948592
응대      39.412485
dtype: float64

---

# Data Augmentation

## Manually

### Gensim - Word2Vec

In [5]:
import re
import random
from typing import List

import numpy as np
import pandas as pd

from gensim import models

In [None]:
# Load a fastText model saved in Facebook's binary format from the local wordvec directory.
ko_model = models.fasttext.load_facebook_model('./wordvec/cc.ko.300.bin')

In [None]:
# Manually inspect the 100 nearest neighbours of the combined query words
# to pick synonym candidates by hand.
ko_model.wv.most_similar(positive=['클래식', '분위기'], topn=100)

### Random swap

In [6]:
def random_swap(words, n):
    """Return a copy of ``words`` after applying ``swap_word`` to it ``n`` times.

    The input list itself is left untouched; all swaps happen on the copy.
    """
    shuffled = words.copy()
    for _round in range(n):
        shuffled = swap_word(shuffled)
    return shuffled

def swap_word(new_words):
    """Swap two randomly chosen positions of ``new_words`` in place.

    Args:
        new_words: Mutable sequence (a list of words/tokens). It is modified
            in place and also returned.

    Returns:
        The same list object. It is returned unchanged when it holds fewer
        than two items, or when no second, different index was drawn within
        the retry budget.
    """
    # Nothing can be swapped with fewer than two items. This also keeps
    # random.randint(0, -1) from raising ValueError on an empty list.
    if len(new_words) < 2:
        return new_words

    last = len(new_words) - 1
    random_idx_1 = random.randint(0, last)
    random_idx_2 = random_idx_1
    counter = 0

    # Redraw the second index until it differs from the first; give up after
    # a few attempts and leave the list as it is.
    while random_idx_2 == random_idx_1:
        random_idx_2 = random.randint(0, last)
        counter += 1
        if counter > 3:
            return new_words

    new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
    return new_words

In [7]:
# Fraction of rows carrying each label, sorted ascending; labels present in
# fewer than 10% of the rows are treated as under-represented.
label_share = df.iloc[:, 1:].sum().sort_values() / len(df)
criteria = label_share < 0.1
lack_keywords = criteria[criteria].index.to_list()
lack_keywords

['단체', '비주얼', '귀여운', '혼자', '넓은', '클래식', '모던', '이색테마', '이색음식', '위생', '만족']

### Synonym replacement

In [8]:
def select_vibe(df, vibe, synonyms):
    """Select rows labelled with ``vibe`` whose review mentions any synonym.

    Args:
        df: Frame with a 'review' text column and a ``vibe`` label column.
        vibe: Name of the label column; only rows where it equals 1 are kept.
        synonyms: Words joined with '|' into one pattern, so each entry is
            interpreted as a regular expression.

    Returns:
        The matching rows of ``df`` in their original order.
    """
    pattern = '|'.join(synonyms)
    # na=False: a missing review must count as "no match". Without it the
    # mask holds NaN for such rows, which is truthy and selected them.
    mentions_synonym = df['review'].str.contains(pattern, na=False)
    return df[mentions_synonym & (df[vibe] == 1)]

def text_augmentation(dataframe: pd.DataFrame,
                      synonyms: List,
                      num_repeated: int = 2,
                      swap_index: int = 2):
    """Create augmented rows by synonym replacement followed by random word swaps.

    For every row, every ``word`` in ``synonyms`` and every regex match of
    that word in the review, up to ``num_repeated`` other synonyms are drawn
    without replacement. Each draw replaces all occurrences of ``word`` in the
    review, then ``random_swap`` is applied to the whitespace-split words with
    ``swap_index`` as the number of swaps.

    Returns:
        A list of ``[new_review, *remaining_row_values]`` lists, where the
        remaining values are the row's entries from position 1 onwards.
    """
    augmented = []
    for _, row in dataframe.iterrows():
        review = row['review']
        for word in synonyms:
            # One augmentation round per occurrence of the word in the review.
            for _match in re.finditer(word, review):
                pool = synonyms.copy()
                produced = 0
                while len(pool) > 0 and produced < num_repeated:
                    picked_at = random.choice(range(len(pool)))
                    candidate = pool.pop(picked_at)
                    if candidate != word:
                        replaced = re.sub(word, candidate, review)
                        swapped = random_swap(replaced.split(), swap_index)
                        augmented.append([' '.join(swapped), *row.values[1:]])
                        produced += 1
    return augmented

In [9]:
# Hand-picked trigger words per label, and the rows that carry the label and
# mention at least one of its trigger words.
# '단체' (group)
corps_synonym = ['여러명', '모임', '대형', '회식', '동아리', '동호회']
corps = select_vibe(df, '단체', corps_synonym)
# '비주얼' (visual)
visual_synonym = ['플레이팅', '아름', '이쁘', '예쁘', '색감', '인스타', '비쥬얼', '감동']
visual = select_vibe(df, '비주얼', visual_synonym)
# '귀여운' (cute)
cute_synonym = ['귀여', '사랑', '예쁘', '이쁘', '깜찍', '아기자기', '아름']
cute = select_vibe(df, '귀여운', cute_synonym)
# '혼자' (alone)
alone_synonym = ['혼밥', '혼술', '1인', '홀로']
alone = select_vibe(df, '혼자', alone_synonym)
# '클래식' (classic)
classic_synonym = ['엔틱', '비싸', '우아', '감성', '유럽', '무드', '여유', '기념일', '조용', '고급']
classic = select_vibe(df, '클래식', classic_synonym)
# '모던' (modern)
modern_synonym = ['깔끔', '조용', '데이트', '고급', '예쁘', '이쁘', '세련', '화이트', '인스타']
modern = select_vibe(df, '모던', modern_synonym)

aug_list = [corps, visual, cute, alone, classic, modern]
aug_synonyms = [corps_synonym, visual_synonym, cute_synonym,
                alone_synonym, classic_synonym, modern_synonym]

# Build every augmented frame first and concatenate once. Growing a frame with
# pd.concat inside the loop re-copies all earlier rows on each pass, and seeding
# it with an empty frame lets the empty frame's dtypes leak into the result.
aug_frames = [
    pd.DataFrame(text_augmentation(aug_df, aug_syn, 10, 2), columns=df.columns)
    for aug_df, aug_syn in zip(aug_list, aug_synonyms)
]
total_aug_df = pd.concat(aug_frames, axis=0)

In [10]:
# Number of augmented rows produced.
len(total_aug_df)

644

In [11]:
# Append the augmented rows to the original data, then shuffle the rows and
# renumber the index.
review_with_aug = (
    pd.concat([df, total_aug_df], axis=0, ignore_index=True)
    .sample(frac=1)
    .reset_index(drop=True)
)

In [12]:
# Preview the shuffled mix of original and augmented rows.
review_with_aug.head()

Unnamed: 0,review,가성비,귀여운,넓은,단체,만족,모던,분위기,비주얼,아늑,위생,응대,이색음식,이색테마,클래식,혼자
0,동아리굳 굳굳,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,전시회 같아서 너무 여유스러운 인테리어였어요 남친과 너무 분위기 크으으 마들렌도 갔...,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0
2,"뼈삼겹이란 메뉴가 생소했는데 고기도 신선하고, 구워주시니 편리해서 좋네요.숯불최고",0,0,0,0,0,0,0,0,0,0,1,1,0,0,0
3,음식이 짱입니다.다녀본 먹어본 음식중에 젤 맛있어요.와인추천도 해주시고 설명도 잘 ...,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0
4,브라우니 약간 비싼 감이 있지만 맛있었어요.인테리어는 딱 요즘 데이트 인테리어고 감...,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0


In [13]:
# Save the combined frame as CSV without writing the index column.
review_with_aug.to_csv('./data/labeled_sample_03_with_aug.csv', index=False)

---

## BERT

In [None]:
import re
import random
from random import shuffle
from io import StringIO

import pandas as pd
from tqdm.auto import tqdm

import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

import warnings
warnings.filterwarnings('ignore')

### Helper functions

In [None]:
def random_swap(words, n):
    """Apply ``swap_word`` ``n`` times to a copy of ``words`` and return the copy.

    The caller's list is not modified.
    """
    result = words.copy()
    for _pass in range(n):
        result = swap_word(result)
    return result

def swap_word(new_words):
    """Exchange two randomly picked entries of ``new_words`` in place.

    Args:
        new_words: Mutable sequence (a list of words or token ids). It is
            modified in place and also returned.

    Returns:
        The same list object. It comes back unchanged when it has fewer than
        two entries, or when the retry budget ran out before a second,
        different index was drawn.
    """
    # With fewer than two entries there is nothing to exchange; the guard also
    # prevents random.randint(0, -1) from raising ValueError on an empty list.
    if len(new_words) < 2:
        return new_words

    upper = len(new_words) - 1
    random_idx_1 = random.randint(0, upper)
    random_idx_2 = random_idx_1
    counter = 0

    # Keep drawing until the two indices differ, but stop after a few tries.
    while random_idx_2 == random_idx_1:
        random_idx_2 = random.randint(0, upper)
        counter += 1
        if counter > 3:
            return new_words

    new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
    return new_words

def only_kor(string):
    """Reduce a review to Hangul syllables, digits, commas, periods and spaces.

    Args:
        string: Raw review text (must be a ``str``).

    Returns:
        The cleaned text, after the substitutions below are applied in order.
    """
    # Newlines become spaces.
    string = re.sub('\n', ' ', string)
    # Standalone jamo (consonants and vowels on their own) become spaces.
    string = re.sub(r'[ㄱ-ㅎㅏ-ㅣ ]', ' ', string)
    # Drop everything except digits, Hangul syllables, ",.!?" and spaces.
    string = re.sub(r'[^0-9가-힣,.!? ]', '', string)
    # A run of ".", "!" or "?" plus the spaces around it becomes a single ".".
    string = re.sub(r' *[.!?]+ *', '.', string)
    # Remove a digit run enclosed by periods, together with both periods.
    # Raw string: the former '\.' literal was an invalid escape sequence.
    string = re.sub(r'\.([0-9]+)\.', '', string)
    # Strip leading periods and spaces.
    string = re.sub(r'^[. ]+', '', string)
    # Collapse repeated spaces.
    string = re.sub(' +', ' ', string)
    return string

### Prepare

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Read the labelled reviews, drop rows without review text, and clean the
# remaining reviews with only_kor.
raw = (
    pd.read_csv('./data/labeled_sample_03.csv')
    .dropna(subset=['review'])
    .assign(review=lambda frame: frame['review'].map(only_kor))
)

In [None]:
# Load the pretrained tokenizer and model from the same checkpoint.
checkpoint = 'snunlp/KR-SBERT-V40K-klueNLI-augSTS'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint).to(device)

# Turn off gradient tracking on every model parameter.
for weight in model.parameters():
    weight.requires_grad = False

In [None]:
# Create Dataset
class ReviewDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes one review per item.

    Each item is a dict with a single key, 'input_ids', holding the flattened
    token-id tensor of the review, truncated and padded to ``max_length``.
    """

    def __init__(self, tokenizer, texts, max_length=512):
        super().__init__()
        self.tokenizer = tokenizer
        self.texts = texts
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer.encode_plus(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        # The tokenizer output carries a leading batch axis; flatten it away.
        return {'input_ids': encoded['input_ids'].reshape(-1)}

In [None]:
# Wrap the review texts in a ReviewDataset and iterate over them in batches of 64.
review_dataset = ReviewDataset(tokenizer, raw['review'].tolist())
review_dataloader = DataLoader(review_dataset,
                               batch_size=64)

### Extract similar words from review

In [None]:
def target_vibe(model, tokenizer, text, threshold=0.4):
    """Collect review tokens whose contextual embedding resembles the target keyword.

    The target keyword is the first token of ``text``. Its embedding is
    compared by cosine similarity with every token embedding of every review
    batch, and tokens scoring above ``threshold`` are collected.

    Relies on the module-level ``review_dataloader`` (batches with an
    'input_ids' entry) and ``device``.

    Args:
        model: Encoder whose output exposes ``last_hidden_state``.
        tokenizer: Tokenizer matching ``model``.
        text: Phrase whose first token is the target keyword.
        threshold: Minimum cosine similarity for a token to be kept.

    Returns:
        De-duplicated list of the decoded tokens that passed the threshold.
    """
    target = tokenizer.tokenize(text)[0]
    print(f"Target keyword is: {target}")

    print(f"[INFO] Searching similar words with '{target}'...")
    similar_words = []
    model.eval()
    with torch.inference_mode():
        # Encode the target inside eval/inference mode too, so it is embedded
        # under the same conditions as the reviews.
        vibe_ids = tokenizer.encode(text, return_tensors='pt').to(device)
        # Index 1 skips the special token that encode() prepends (presumably
        # [CLS] for this tokenizer), so it lines up with `target` above.
        vibe_embedding = model(vibe_ids).last_hidden_state[:, 1, :]

        for batch in tqdm(review_dataloader):
            input_ids = batch['input_ids'].to(device)

            # Reviews are padded to a fixed length; mask the padding so it
            # neither influences the real tokens nor shows up as a match.
            if tokenizer.pad_token_id is None:
                attention_mask = torch.ones_like(input_ids)
            else:
                attention_mask = (input_ids != tokenizer.pad_token_id).long()

            output = model(input_ids, attention_mask=attention_mask)
            similarity = F.cosine_similarity(output.last_hidden_state, vibe_embedding, dim=-1)

            # nonzero() yields one (row, column) pair per hit. It is kept 2-D
            # on purpose: squeezing collapsed a single hit to shape (2,) and
            # that hit was then lost.
            hits = ((similarity > threshold) & attention_mask.bool()).nonzero()
            for row, col in hits.tolist():
                sim_word = tokenizer.decode(int(input_ids[row, col]))
                similar_words.append(sim_word)
                print(f"- {sim_word:>5s}, {similarity[row, col]:.3f}")

    return list(set(similar_words))

### Encode similar tokens(words) to int

In [None]:
def words_to_tokens(similar_words):
    """Map each word to the token id at position 1 of its encoding.

    Uses the module-level ``tokenizer``. Position 1 is the entry right after
    whatever ``tokenizer.encode`` puts first.
    """
    # NOTE(review): only the id at position 1 is kept, so a word that encodes
    # to several pieces is represented by its first piece alone - confirm
    # this is intended.
    similar_tokens = []
    for word in similar_words:
        encoded = tokenizer.encode(word)
        similar_tokens.append(encoded[1])
    return similar_tokens

### Using similar words, create new sentence

In [None]:
def generate_sim_sentence(df, similar_tokens):
    """Create new sentences by replacing similar tokens and swapping two tokens.

    For every review, each token found in ``similar_tokens`` is replaced by a
    randomly chosen entry of ``similar_tokens``. One random swap is applied to
    the token sequence, and the result is decoded back to text.

    Uses the module-level ``tokenizer``.

    Args:
        df: Frame with a 'review' column; columns from position 4 onwards are
            appended to each generated sentence as its labels.
        similar_tokens: Token ids to look for and to draw replacements from.

    Returns:
        A list of strings of the form ``"<sentence>;<label>;<label>;..."``.
    """
    new_sentences = []
    # Set for O(1) membership tests; the list is kept for random.choice.
    similar_token_set = set(similar_tokens)

    for j, text in enumerate(df['review'].tolist()):
        text_tokenized = tokenizer.encode(text)
        labels = ';'.join(map(str, df.iloc[j, 4:].values))

        for i, token in enumerate(text_tokenized):
            if token not in similar_token_set:
                continue

            syn_token = random.choice(similar_tokens)
            new_tokenized = text_tokenized[:i] + [syn_token] + text_tokenized[i+1:]
            # Drop the first and last ids before shuffling; assumes encode()
            # adds exactly one special token at each end.
            new_tokenized = random_swap(new_tokenized[1:-1], 1)
            # Decode the whole shuffled sequence. It was sliced with [1:-1] a
            # second time here, which removed the first and last real tokens.
            new_sentence = tokenizer.decode(new_tokenized)
            new_sentence = re.sub('#', '', new_sentence)
            new_sentence = re.sub(r'\[UNK\]', '', new_sentence)
            new_sentences.append(new_sentence + ';' + labels)

    return new_sentences

### Put all in one

In [None]:
def sentence_generator(df, model, tokenizer, text, threshold=0.46):
    """Run the whole augmentation pipeline for one target phrase.

    Finds tokens similar to the target keyword of ``text``, converts them to
    token ids, generates new sentences from ``df`` and parses the resulting
    ';'-separated lines into a frame.

    Args:
        df: Source frame; 'review' plus the columns from position 4 onwards
            define the columns of the result.
        model: Encoder passed on to ``target_vibe``.
        tokenizer: Tokenizer passed on to ``target_vibe``.
        text: Target phrase.
        threshold: Similarity threshold passed on to ``target_vibe``.

    Returns:
        A frame with a 'review' column followed by the label columns; it is
        empty when no sentence could be generated.
    """
    similar_words = target_vibe(model, tokenizer, text, threshold=threshold)
    similar_tokens = words_to_tokens(similar_words)
    new_sentences = generate_sim_sentence(df, similar_tokens)

    col_names = ['review']
    col_names.extend(df.columns[4:])

    # pd.read_csv raises EmptyDataError on empty input, so answer the
    # "nothing generated" case with an empty frame of the same columns.
    if not new_sentences:
        return pd.DataFrame(columns=col_names)

    csv_strings = StringIO('\n'.join(new_sentences))
    raw_augmented = pd.read_csv(csv_strings, header=None, sep=';', names=col_names)

    return raw_augmented

### Generate new sentences

In [None]:
# Target phrases for augmentation; target_vibe takes the first token of each
# phrase as the keyword to search for.
target_texts = ['귀여운 분위기가 있는', '단체로 가기 좋은', 
                '모임 장소로 적절한', '비주얼이 좋은 음식이 있는', 
                '클래식한 분위기인', '혼자 가기 좋은']

In [None]:
# Generate one frame per target phrase and concatenate them once, instead of
# growing a frame with pd.concat on every loop pass.
augmented_frames = [
    sentence_generator(raw, model, tokenizer, target_text)
    for target_text in tqdm(target_texts)
]
augmented_df = pd.concat(augmented_frames, axis=0)

In [None]:
# Preview the first ten generated rows.
augmented_df.head(10)