# Data Processing

## Global Settings

In [1]:
# jupyter notebook에서 import 해서 쓰는 모듈의 코드가 변경될 시, 변동 사항을 자동으로 반영해주는 기능 켜기
%load_ext autoreload
%autoreload 2

In [2]:
import os
from preprocess import prep_behavior_combined
from preprocess import prep_news_combined
from preprocess.prep_behavior_combined import PrepBehaviorArgs
from preprocess.prep_news_combined import PrepNewsArgs

In [3]:
"""
경로가 폴더를 나타낼 경우 Dir, 파일일 경우 Path로 명명
"""

size = "manual"

datasetPath = f"MIND/{size}"
datasetTrainPath = f"{datasetPath}/train"
datasetTestPath = f"{datasetPath}/test"
trainBehaviorsPath = f"{datasetTrainPath}/behaviors.tsv"
testBehaviorsPath = f"{datasetTestPath}/behaviors.tsv"
trainNewsPath = f"{datasetTrainPath}/news.tsv"
testNewsPath = f"{datasetTestPath}/news.tsv"

processedDataPath = "preprocessed_data"
preTrainDir = f"{processedDataPath}/{size}/train"
preTestDir = f"{processedDataPath}/{size}/test"

user2intPath = f"{preTrainDir}/user2int.tsv"
word2intPath = f"{preTrainDir}/word2int.tsv"
category2intPath = f"{preTrainDir}/category2int.tsv"

wordEmbeddingDir = "word_embeddings"
wordEmbeddingPath = f"{wordEmbeddingDir}/glove.840B.300d.txt"

## 1. Train/Test 데이터셋 behaviors.tsv 전처리

### 생성되는 파일
##### Train
1. train_behavior.tsv
2. user2int.tsv
3. val_behavior.tsv
##### Test
1. test_behavior.tsv

In [4]:
args = PrepBehaviorArgs(
    train_behavior_path = trainBehaviorsPath,
    test_behavior_path = testBehaviorsPath,
    train_out_dir = preTrainDir,
    test_out_dir = preTestDir,
    user2int_path = f"{preTrainDir}/user2int.tsv",
    split_test_size = 0.1,
    n_negative = 4
)

In [5]:
os.makedirs(args.train_out_dir, exist_ok=True)
os.makedirs(args.test_out_dir, exist_ok=True)

prep_behavior_combined.prep_behavior_combined(args)

preparing training data


100%|██████████| 1/1 [00:00<?, ?it/s]


preparing eval data


100%|██████████| 1/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:00<?, ?it/s]


preparing eval data


100%|██████████| 1/1 [00:00<?, ?it/s]


## 2. Train/Test 데이터셋 news.tsv 전처리

### 생성되는 파일
##### Train
1. parsed_news.tsv
2. category2int.tsv
3. embedding_weights.csv
4. word2int.tsv
##### Test
1. parsed_news.tsv
2. embedding_weights.csv
3. word2int.tsv

In [6]:
#parse_news_combined.py용 args
args = PrepNewsArgs(
    train_news_path = trainNewsPath,
    test_news_path = testNewsPath,
    train_out_dir = preTrainDir,
    test_out_dir = preTestDir,
    word_embedding_path = wordEmbeddingPath,
    max_title = 100,
    max_abstract = 100
)

In [7]:
# prep embedings/vocab
embeddings = prep_news_combined.process_word_embeddings(args.word_embedding_path)

preparing/processing word-embeddings


100%|██████████| 2196018/2196018 [00:29<00:00, 75087.10it/s]


In [8]:
prep_news_combined.prep_news_combined(args, embeddings)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


preparing/processing train news content


Device set to use cuda:0
  0%|          | 1/235 [00:00<00:31,  7.53it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 235/235 [00:01<00:00, 162.11it/s]
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


preparing/processing test news content


Device set to use cuda:0
100%|██████████| 24/24 [00:00<00:00, 179.84it/s]


## 기타 테스트 코드

In [8]:
# embeddings 테스트
print(type(embeddings))
print(len(embeddings))
print(type(embeddings["hello"]))
print(embeddings["hello"])

<class 'dict'>
2196017
<class 'str'>
0.25233 0.10176 -0.67485 0.21117 0.43492 0.16542 0.48261 -0.81222 0.041321 0.78502 -0.077857 -0.66324 0.1464 -0.29289 -0.25488 0.019293 -0.20265 0.98232 0.028312 -0.081276 -0.1214 0.13126 -0.17648 0.13556 -0.16361 -0.22574 0.055006 -0.20308 0.20718 0.095785 0.22481 0.21537 -0.32982 -0.12241 -0.40031 -0.079381 -0.19958 -0.015083 -0.079139 -0.18132 0.20681 -0.36196 -0.30744 -0.24422 -0.23113 0.09798 0.1463 -0.062738 0.42934 -0.078038 -0.19627 0.65093 -0.22807 -0.30308 -0.12483 -0.17568 -0.14651 0.15361 -0.29518 0.15099 -0.51726 -0.033564 -0.23109 -0.7833 0.018029 -0.15719 0.02293 0.49639 0.029225 0.05669 0.14616 -0.19195 0.16244 0.23898 0.36431 0.45263 0.2456 0.23803 0.31399 0.3487 -0.035791 0.56108 -0.25345 0.051964 -0.10618 -0.30962 1.0585 -0.42025 0.18216 -0.11256 0.40576 0.11784 -0.19705 -0.075292 0.080723 -0.02782 -0.15617 -0.44681 -0.15165 0.1692 0.098255 -0.031894 0.087143 0.26082 0.002706 0.1319 0.34439 -0.37894 -0.4114 0.081571 -0.11674 -0.43