# Data Processing

## Global Settings

In [1]:
# Enable IPython's autoreload extension so that edits to the source of imported
# modules are picked up automatically, without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all imported modules before each cell execution.
%autoreload 2

In [None]:
import argparse
import os
from preprocess import prep_behavior_combined
from preprocess import prep_news_combined

In [None]:
"""
Naming convention: variables holding a folder path end in Dir; variables holding a file path end in Path.
"""

# Dataset variant; it selects the sub-folder under both MIND/ and preprocessed_data/.
size = "demo"

# Raw dataset locations. Folder variables end in "Dir" and file variables end
# in "Path", following the naming rule stated at the top of this cell.
datasetDir = f"MIND/{size}"
datasetTrainDir = f"{datasetDir}/train"
datasetTestDir = f"{datasetDir}/test"
trainBehaviorsPath = f"{datasetTrainDir}/behaviors.tsv"
testBehaviorsPath = f"{datasetTestDir}/behaviors.tsv"
trainNewsPath = f"{datasetTrainDir}/news.tsv"
testNewsPath = f"{datasetTestDir}/news.tsv"

# Output folders for the preprocessing steps, one per split.
processedDataDir = "preprocessed_data"
preTrainDir = f"{processedDataDir}/{size}/train"
preTestDir = f"{processedDataDir}/{size}/test"

# Mapping files; this notebook lists them among the generated Train outputs.
user2intPath = f"{preTrainDir}/user2int.tsv"
word2intPath = f"{preTrainDir}/word2int.tsv"
category2intPath = f"{preTrainDir}/category2int.tsv"

# Word-embedding file consumed by the news preprocessing step.
wordEmbeddingDir = "word_embeddings"
wordEmbeddingPath = f"{wordEmbeddingDir}/glove.840B.300d.txt"

# Backward-compatible aliases: these four folders previously carried a "Path"
# suffix, which contradicted the naming rule. Prefer the "*Dir" names above.
datasetPath = datasetDir
datasetTrainPath = datasetTrainDir
datasetTestPath = datasetTestDir
processedDataPath = processedDataDir

## 1. Train/Test 데이터셋 behaviors.tsv 전처리

### 생성되는 파일
##### Train
1. train_behavior.tsv
2. user2int.tsv
3. val_behavior.tsv
##### Test
1. test_behavior.tsv

In [4]:
# Arguments handed to prep_behavior_combined, packed into an argparse.Namespace
# (the same container type argparse returns for parsed command-line arguments).
args = argparse.Namespace(
    train_behavior_path=trainBehaviorsPath,
    test_behavior_path=testBehaviorsPath,
    train_out_dir=preTrainDir,
    test_out_dir=preTestDir,
    # Reuse the path defined in the settings cell instead of rebuilding the
    # same string here, so the two definitions cannot drift apart.
    user2int_path=user2intPath,
    # Presumably the fraction of train behaviors held out for validation
    # (val_behavior.tsv) — TODO confirm in prep_behavior_combined.
    split_test_size=0.1,
    # Presumably the number of negative samples per positive — TODO confirm.
    n_negative=4,
)

In [5]:
# Make sure both output folders exist (no error if they are already there),
# then run the behaviors preprocessing with the arguments built above.
for _out_dir in (args.train_out_dir, args.test_out_dir):
    os.makedirs(_out_dir, exist_ok=True)

prep_behavior_combined.prep_behavior_combined(args)

preparing training data


100%|██████████| 19830/19830 [00:00<00:00, 43072.48it/s]


preparing eval data


100%|██████████| 2204/2204 [00:00<00:00, 50197.09it/s]
100%|██████████| 4880/4880 [00:00<00:00, 2439304.44it/s]


preparing eval data


100%|██████████| 7538/7538 [00:00<00:00, 58717.92it/s]


## 2. Train/Test 데이터셋 news.tsv 전처리

### 생성되는 파일
##### Train
1. parsed_news.tsv
2. category2int.tsv
3. embedding_weights.csv
4. word2int.tsv
##### Test
1. parsed_news.tsv
2. embedding_weights.csv
3. word2int.tsv

In [None]:
# Arguments for the news preprocessing step.
# NOTE(review): the original comment named "parse_news_combined.py", but the
# module imported by this notebook is prep_news_combined — confirm the name.
# This rebinds `args`, replacing the Namespace built for the behaviors step, so
# re-run the behaviors args cell before re-running the behaviors preprocessing.
args = argparse.Namespace(
    train_news_path=trainNewsPath,
    test_news_path=testNewsPath,
    train_out_dir=preTrainDir,
    test_out_dir=preTestDir,
    word_embedding_path=wordEmbeddingPath,
    # Presumably length limits applied to each title / abstract — TODO confirm
    # the unit (tokens vs. characters) in prep_news_combined.
    max_title=20,
    max_abstract=50,
)

In [None]:
# Prepare the word embeddings / vocabulary from the file at
# args.word_embedding_path and keep the result for the news preprocessing cell
# below. The recorded run iterated over 2,196,018 entries in roughly 28 s.
embeddings = prep_news_combined.process_word_embeddings(args.word_embedding_path)

preparing/processing word-embeddings


100%|██████████| 2196018/2196018 [00:28<00:00, 77164.71it/s]


In [8]:
# Run the news preprocessing (the recorded output shows a train pass followed by
# a test pass), reusing the embeddings prepared in the previous cell.
# NOTE(review): the recorded output warns that a Hugging Face pipeline is built
# without an explicit model and falls back to
# distilbert/distilbert-base-uncased-finetuned-sst-2-english; pinning the model
# would presumably have to happen inside prep_news_combined — confirm.
prep_news_combined.prep_news_combined(args, embeddings)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


preparing/processing train news content


Device set to use cuda:0
  0%|          | 1/26740 [00:00<1:02:04,  7.18it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 26740/26740 [02:21<00:00, 189.04it/s]
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


preparing/processing test news content


Device set to use cuda:0
100%|██████████| 18723/18723 [01:53<00:00, 165.18it/s]


## 기타 테스트 코드

In [None]:
# Sanity check of the prepared embeddings: container type, number of entries,
# then the type and value stored under one sample key.
print(type(embeddings))
print(len(embeddings))
sample_vector = embeddings["hello"]
print(type(sample_vector))
print(sample_vector)