# Data Processing

## 1. Global Settings

실행 순서 파악하기

python parse_behavior.py --in-file "$datasetTrainPath/behaviors.tsv" --out-dir "$preTrainPath" --mode train

python parse_behavior.py --in-file "$datasetTestPath/behaviors.tsv" --out-dir "$preTestPath" --mode test --user2int "$preTrainPath/user2int.tsv"

python parse_news.py --in-file "$datasetTrainPath/news.tsv" --out-dir "$preTrainPath" --mode train --word-embeddings "$wordEmbeddingPath/glove.840B.300d.txt"

python parse_news.py --in-file "$datasetTestPath/news.tsv" --out-dir "$preTestPath" --mode test --word-embeddings "$wordEmbeddingPath/glove.840B.300d.txt" --embedding-weights "$preTrainPath/embedding_weights.csv" --word2int "$preTrainPath/word2int.tsv" --category2int "$preTrainPath/category2int.tsv"

parse_behavior.py 먼저 진행

parse_news는 GloVe 임베딩 파일(glove.840B.300d.txt) 필요

In [1]:
# Enable autoreload so that edits to modules imported by this notebook are
# picked up automatically, without having to restart the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import argparse
import os
from sklearn.model_selection import train_test_split
import prep_behavior
import prep_news

In [3]:
# Dataset variant to process; every path below is derived from it.
size = "demo"

# Raw dataset inputs, split into train/test sub-directories.
datasetPath = "MIND/" + size
datasetTrainPath = datasetPath + "/train"
datasetTestPath = datasetPath + "/test"
trainBehaviorsPath = datasetTrainPath + "/behaviors.tsv"
testBehaviorsPath = datasetTestPath + "/behaviors.tsv"
trainNewsPath = datasetTrainPath + "/news.tsv"
testNewsPath = datasetTestPath + "/news.tsv"
wordEmbeddingPath = "word_embeddings"

# Output locations for the preprocessed artifacts.
processedDataPath = "processed"
preTrainPath = "/".join((processedDataPath, size, "train"))
preTestPath = "/".join((processedDataPath, size, "test"))

# Index-map files under the train output directory, plus the GloVe vectors file.
user2int = preTrainPath + "/user2int.tsv"
word2int = preTrainPath + "/word2int.tsv"
category2int = preTrainPath + "/category2int.tsv"
word_embeddings = wordEmbeddingPath + "/glove.840B.300d.txt"

#### 1. python parse_behavior.py --in-file "$datasetTrainPath/behaviors.tsv" --out-dir "$preTrainPath" --mode train

In [4]:
# Preprocess the training behaviors.tsv.
# With a non-zero split, part of the impressions is held out and written as
# "val_behavior.tsv"; otherwise every line is used for training data.

args = argparse.Namespace(
    in_file=trainBehaviorsPath,
    split=0.1,      # passed to train_test_split as test_size (validation fraction)
    out_dir=preTrainPath,
    n_negative=4,   # presumably negative samples per positive click; confirm in prep_behavior
)

os.makedirs(args.out_dir, exist_ok=True)

# Read all lines up front so the file handle is closed before processing starts.
# The encoding is explicit because open() otherwise uses the platform default.
with open(args.in_file, 'r', encoding='utf-8') as behaviors_file:
    behavior = behaviors_file.readlines()

if args.split == 0:
    prep_behavior.generate_training_data(args, behavior, args.out_dir)
else:
    # Fixed random_state keeps the train/validation split reproducible.
    train_behavior, val_behavior = train_test_split(
        behavior, test_size=args.split, random_state=1234
    )
    # Bound to its own name so the `user2int` path string from the settings
    # cell is not overwritten by the returned mapping.
    train_user2int = prep_behavior.generate_training_data(args, train_behavior, args.out_dir)
    prep_behavior.generate_eval_data(val_behavior, args.out_dir, "val_behavior.tsv", train_user2int)

preparing training data


100%|██████████| 19830/19830 [00:00<00:00, 42982.95it/s]


preparing eval data


100%|██████████| 2204/2204 [00:00<00:00, 56004.01it/s]


#### 2. python parse_behavior.py --in-file "$datasetTestPath/behaviors.tsv" --out-dir "$preTestPath" --mode test --user2int "$preTrainPath/user2int.tsv"

In [5]:
# Preprocess the test behaviors.tsv, reusing the user index map written by
# the training step.

# args for the behavior preprocessing step (test split)
args = argparse.Namespace(
    in_file=testBehaviorsPath,
    user2int=f"{preTrainPath}/user2int.tsv",
    out_dir=preTestPath,
)

os.makedirs(args.out_dir, exist_ok=True)

# Bound to its own name so the `user2int` path string from the settings cell
# is not overwritten by the loaded mapping.
user2int_map = prep_behavior.load_idx_map_as_dict(args.user2int)

# Read all lines up front so the file handle is closed before processing starts.
# The encoding is explicit because open() otherwise uses the platform default.
with open(args.in_file, 'r', encoding='utf-8') as in_file:
    behavior = in_file.readlines()

prep_behavior.generate_eval_data(behavior, args.out_dir, "test_behavior.tsv", user2int_map)

100%|██████████| 4880/4880 [00:00<00:00, 2146188.90it/s]


preparing eval data


100%|██████████| 7538/7538 [00:00<00:00, 60235.75it/s]


#### 3. python parse_news.py --in-file "$datasetTrainPath/news.tsv" --out-dir "$preTrainPath" --mode train --word-embeddings "$wordEmbeddingPath/glove.840B.300d.txt"

In [6]:
# Arguments for the news preprocessing step (training split).
train_news_options = {
    "in_file": trainNewsPath,
    "out_dir": preTrainPath,
    "mode": "train",
    "word_embeddings": word_embeddings,
    # presumably length caps for title/abstract; confirm in prep_news
    "max_title": 20,
    "max_abstract": 50,
}
args = argparse.Namespace(**train_news_options)

In [None]:
# Prepare the word embeddings / vocabulary from the file at args.word_embeddings.
# The result is kept in `embeddings` and reused by the later prep_news calls.
embeddings = prep_news.process_word_embeddings(args.word_embeddings)

preparing/processing word-embeddings


In [None]:
# Sanity-check the loaded embeddings: container type, entry count, and one sample entry.
print(type(embeddings))
print(len(embeddings))
hello_vector = embeddings["hello"]
print(type(hello_vector))
print(hello_vector)


<class 'dict'>
2196017
<class 'str'>
0.25233 0.10176 -0.67485 0.21117 0.43492 0.16542 0.48261 -0.81222 0.041321 0.78502 -0.077857 -0.66324 0.1464 -0.29289 -0.25488 0.019293 -0.20265 0.98232 0.028312 -0.081276 -0.1214 0.13126 -0.17648 0.13556 -0.16361 -0.22574 0.055006 -0.20308 0.20718 0.095785 0.22481 0.21537 -0.32982 -0.12241 -0.40031 -0.079381 -0.19958 -0.015083 -0.079139 -0.18132 0.20681 -0.36196 -0.30744 -0.24422 -0.23113 0.09798 0.1463 -0.062738 0.42934 -0.078038 -0.19627 0.65093 -0.22807 -0.30308 -0.12483 -0.17568 -0.14651 0.15361 -0.29518 0.15099 -0.51726 -0.033564 -0.23109 -0.7833 0.018029 -0.15719 0.02293 0.49639 0.029225 0.05669 0.14616 -0.19195 0.16244 0.23898 0.36431 0.45263 0.2456 0.23803 0.31399 0.3487 -0.035791 0.56108 -0.25345 0.051964 -0.10618 -0.30962 1.0585 -0.42025 0.18216 -0.11256 0.40576 0.11784 -0.19705 -0.075292 0.080723 -0.02782 -0.15617 -0.44681 -0.15165 0.1692 0.098255 -0.031894 0.087143 0.26082 0.002706 0.1319 0.34439 -0.37894 -0.4114 0.081571 -0.11674 -0.43

In [None]:
# Preprocess the training-split news file with the current `args` and the loaded embeddings.
# NOTE(review): the recorded output shows a Hugging Face pipeline being created with its
# default model; confirm in prep_news whether the model should be pinned explicitly.
prep_news.prep_news(args, embeddings)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


preparing/processing news content


Device set to use cuda:0
  0%|          | 1/26740 [00:00<55:55,  7.97it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 26740/26740 [02:16<00:00, 196.62it/s]


#### 4. python parse_news.py --in-file "$datasetTestPath/news.tsv" --out-dir "$preTestPath" --mode test --word-embeddings "$wordEmbeddingPath/glove.840B.300d.txt" --embedding-weights "$preTrainPath/embedding_weights.csv" --word2int "$preTrainPath/word2int.tsv" --category2int "$preTrainPath/category2int.tsv"

In [10]:
# Arguments for the news preprocessing step (test split).
# The test split reuses the artifacts written under the train output directory
# (embedding weights, word index map, category index map).
args = argparse.Namespace(
    in_file=testNewsPath,
    out_dir=preTestPath,
    mode="test",
    # Points at the GloVe vectors file, matching the train-split args and the
    # documented command line; previously this was the containing directory.
    word_embeddings=word_embeddings,
    embedding_weights=f"{preTrainPath}/embedding_weights.csv",
    word2int=word2int,
    category2int=category2int,
    # presumably length caps for title/abstract; confirm in prep_news
    max_title=20,
    max_abstract=50,
)

In [11]:
# Preprocess the test-split news file with the current `args`.
# Relies on `embeddings` still being in kernel memory from the earlier loading cell.
prep_news.prep_news(args, embeddings)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


preparing/processing news content


Device set to use cuda:0
100%|██████████| 247/247 [00:00<?, ?it/s]
100%|██████████| 41366/41366 [00:00<00:00, 2425035.35it/s]
100%|██████████| 41367/41367 [00:00<00:00, 3935531.42it/s]
100%|██████████| 18723/18723 [01:39<00:00, 188.66it/s]
