# Data Processing

## 1. Global Settings

In [1]:
# Enable the autoreload extension so that edits to the source of imported modules
# are picked up automatically in this notebook without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import argparse
import os
from sklearn.model_selection import train_test_split
from preprocess import prep_behavior
from preprocess import prep_news

In [None]:
# Dataset size variant; every path below is derived from it.
size = "demo"

# Raw dataset layout: MIND/<size>/{train,test}/{behaviors,news}.tsv
datasetPath = f"MIND/{size}"
datasetTrainDir, datasetTestDir = f"{datasetPath}/train", f"{datasetPath}/test"
trainBehaviorsPath, testBehaviorsPath = (
    f"{datasetTrainDir}/behaviors.tsv",
    f"{datasetTestDir}/behaviors.tsv",
)
trainNewsPath, testNewsPath = (
    f"{datasetTrainDir}/news.tsv",
    f"{datasetTestDir}/news.tsv",
)

# Output root for preprocessed artifacts, split the same way as the raw data.
processedDataPath = "preprocessed_data"
preTrainDir, preTestDir = (
    f"{processedDataPath}/{size}/train",
    f"{processedDataPath}/{size}/test",
)

# Index-mapping files located under the train output directory.
user2intPath, word2intPath, category2intPath = (
    f"{preTrainDir}/user2int.tsv",
    f"{preTrainDir}/word2int.tsv",
    f"{preTrainDir}/category2int.tsv",
)

# Pre-trained word-vector file and the directory that holds it.
wordEmbeddingDir = "word_embeddings"
wordEmbeddingPath = f"{wordEmbeddingDir}/glove.840B.300d.txt"

#### 1. python parse_behavior.py --in-file "$datasetTrainPath/behaviors.tsv" --out-dir "$preTrainPath" --mode train

In [None]:
# Preprocess the train behaviors.tsv: optionally hold out a validation split,
# then write the training data (and the validation data) into the train output dir.

args = argparse.Namespace(
    in_file = trainBehaviorsPath,
    split = 0.1,       # fraction of lines held out for validation; 0 disables the split
    out_dir = preTrainDir,
    n_negative = 4     # forwarded to prep_behavior via `args` — presumably negatives per positive; TODO confirm
)

os.makedirs(args.out_dir, exist_ok=True)

# Read everything up front with an explicit encoding (the default is locale-dependent)
# and release the file handle before the processing starts.
with open(args.in_file, 'r', encoding='utf-8') as trainBehaviors:
    behavior = trainBehaviors.readlines()

if args.split == 0:
    # No validation split requested: all lines become training data.
    prep_behavior.generate_training_data(args, behavior, args.out_dir)
else:
    # Fixed random_state keeps the train/validation split reproducible across runs.
    train_behavior, val_behavior = train_test_split(behavior, test_size=args.split, random_state=1234)
    # Keep the returned user index map under its own name so the notebook-level
    # `user2intPath` setting (a file path string) is not overwritten.
    user2int = prep_behavior.generate_training_data(args, train_behavior, args.out_dir)
    prep_behavior.generate_eval_data(val_behavior, args.out_dir, "val_behavior.tsv", user2int)

preparing training data


100%|██████████| 19830/19830 [00:00<00:00, 39633.16it/s]


preparing eval data


100%|██████████| 2204/2204 [00:00<00:00, 47973.96it/s]


#### 2. python parse_behavior.py --in-file "$datasetTestPath/behaviors.tsv" --out-dir "$preTestPath" --mode test --user2int "$preTrainPath/user2int.tsv"

In [None]:
# Preprocess the test behaviors.tsv using the user index map written to the train output dir.

# args mirroring the parse_behavior.py command line
args = argparse.Namespace(
    in_file = testBehaviorsPath,
    user2int = f"{preTrainDir}/user2int.tsv",
    out_dir = preTestDir
)

os.makedirs(args.out_dir, exist_ok=True)

# The loaded value is an index map, not a path, so it gets its own name instead of
# rebinding the notebook-level `user2intPath` setting.
user2int = prep_behavior.load_idx_map_as_dict(args.user2int)

# Explicit encoding (the default is locale-dependent); the handle is closed before processing.
with open(args.in_file, 'r', encoding='utf-8') as in_file:
    behavior = in_file.readlines()

prep_behavior.generate_eval_data(behavior, args.out_dir, "test_behavior.tsv", user2int)

100%|██████████| 4880/4880 [00:00<00:00, 2441632.29it/s]


preparing eval data


100%|██████████| 7538/7538 [00:00<00:00, 55018.03it/s]


#### 3. python parse_news.py --in-file "$datasetTrainPath/news.tsv" --out-dir "$preTrainPath" --mode train --word-embeddings "$wordEmbeddingPath/glove.840B.300d.txt"

In [None]:
# Options mirroring the parse_news.py command line, train mode.
args = argparse.Namespace(**{
    "in_file": trainNewsPath,
    "out_dir": preTrainDir,
    "mode": "train",
    "word_embeddings": wordEmbeddingPath,
    "max_title": 20,
    "max_abstract": 50,
})

In [None]:
# Prepare the word embeddings / vocabulary from the file named in args.word_embeddings.
# The result is kept in `embeddings` and reused by the news preprocessing cells.
embeddings = prep_news.process_word_embeddings(args.word_embeddings)

preparing/processing word-embeddings


100%|██████████| 2196018/2196018 [00:27<00:00, 78857.79it/s]


In [None]:
# Sanity check of the loaded embeddings: container type and size first,
# then the type and value of one sample entry.
print(type(embeddings), len(embeddings), sep="\n")
hello_vector = embeddings["hello"]
print(type(hello_vector), hello_vector, sep="\n")

In [None]:
# Preprocess the news file described by `args`, reusing the already loaded `embeddings`.
# NOTE(review): `args` is rebound by several cells; this call expects the train-mode
# news Namespace, so re-run that cell first if `args` was changed since.
prep_news.prep_news(args, embeddings)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


preparing/processing train news content


Device set to use cuda:0
100%|██████████| 26740/26740 [02:18<00:00, 193.36it/s]
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


preparing/processing test news content


Device set to use cuda:0
100%|██████████| 18723/18723 [01:36<00:00, 194.76it/s]


#### 4. python parse_news.py --in-file "$datasetTestPath/news.tsv" --out-dir "$preTestPath" --mode test --word-embeddings "$wordEmbeddingPath/glove.840B.300d.txt" --embedding-weights "$preTrainPath/embedding_weights.csv" --word2int "$preTrainPath/word2int.tsv" --category2int "$preTrainPath/category2int.tsv"

In [None]:
# Options mirroring the parse_news.py command line, test mode.
# The index maps and embedding weights come from the train output directory.
args = argparse.Namespace(
    in_file = testNewsPath,
    out_dir = preTestDir,
    mode = "test",
    # Point at the GloVe file itself (as the train-mode args and the CLI form do),
    # not at the directory that contains it.
    word_embeddings = wordEmbeddingPath,
    embedding_weights = f"{preTrainDir}/embedding_weights.csv",
    word2int = word2intPath,
    category2int = category2intPath,
    max_title = 20,
    max_abstract = 50
)

In [11]:
# Preprocess the news file described by `args`, reusing the already loaded `embeddings`.
# NOTE(review): `args` is rebound by several cells; this call expects the test-mode
# news Namespace, so re-run that cell first if `args` was changed since.
prep_news.prep_news(args, embeddings)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


preparing/processing news content


Device set to use cuda:0
100%|██████████| 247/247 [00:00<?, ?it/s]
100%|██████████| 41366/41366 [00:00<00:00, 2425035.35it/s]
100%|██████████| 41367/41367 [00:00<00:00, 3935531.42it/s]
100%|██████████| 18723/18723 [01:39<00:00, 188.66it/s]
