# Data Processing

## Global Settings and Imports

In [7]:
# Turn on autoreload so that when the code of a module imported in this notebook
# changes, the change is picked up automatically.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
"""
module import를 진행할 시, sys.path에 등록된 경로에서 해당 모듈 파일을 찾습니다.
그런데 프로젝트 폴더가 기본적으로 등록되어있지 않아서
project/data 같은 경로의 모듈을 사용하기 위해 둘 중 한가지 방법을 써야 합니다.
직접 경로를 추가 => sys.path.append(...)
import할 때 상대 경로 사용 => from ...data.preprocess import
그런데 상대 경로가 이래저래 요상한 점이 많아서
절대 경로를 등록해서 사용하기로 했습니다.
"""
import os
import sys
from os import path

PROJECT_DIR = path.abspath(path.join(os.getcwd(), "../.."))
sys.path.append(PROJECT_DIR)
print(PROJECT_DIR)

c:\Users\nclud\Desktop\projects\python\NewsRecommendation\SentiRecTest\project


In [9]:
from data.preprocess import prep_behavior
from data.preprocess import prep_news
from data.preprocess.prep_behavior import PrepBehaviorArgs
from data.preprocess.prep_news import PrepNewsArgs

In [16]:
"""
경로가 폴더를 나타낼 경우 Dir, 파일일 경우 Path로 명명

size: 전처리를 진행할 데이터셋의 크기 (demo, small, large 등)
"""

size = "tiny"

DATA_DIR = path.join(PROJECT_DIR, "data")
datasetDir = path.join(DATA_DIR, "MIND", size)
trainBehaviorsPath = path.join(datasetDir, "train", "behaviors.tsv")
testBehaviorsPath = path.join(datasetDir, "test", "behaviors.tsv")
trainNewsPath = path.join(datasetDir, "train", "news.tsv")
testNewsPath = path.join(datasetDir, "test", "news.tsv")

processedDataDir = path.join(DATA_DIR, "preprocessed_data")
preTrainDir = path.join(processedDataDir, size, "train")
preTestDir = path.join(processedDataDir, size, "test")

wordEmbeddingDir = path.join(DATA_DIR, "word_embeddings")
wordEmbeddingPath = path.join(wordEmbeddingDir, "glove.840B.300d.txt")
wordEmbeddingNpyPath = path.join(wordEmbeddingDir, "glove.840B.300d.npy")
wordEmbeddingTokensPath = path.join(wordEmbeddingDir, "glove.840B.300d.tokens.tsv")

os.makedirs(preTrainDir, exist_ok=True)
os.makedirs(preTestDir, exist_ok=True)

## 1. Train/Test 데이터셋 behaviors.tsv 전처리

### 생성되는 파일
##### Train
1. train_behavior.tsv
2. user2int.tsv
3. val_behavior.tsv
##### Test
1. test_behavior.tsv

In [11]:
# Arguments for the behaviors.tsv preprocessing step.
args = PrepBehaviorArgs(
    train_behavior_path=trainBehaviorsPath,
    test_behavior_path=testBehaviorsPath,
    train_out_dir=preTrainDir,
    test_out_dir=preTestDir,
    # Built with path.join (instead of an f-string with a hard-coded "/") so the
    # separator matches the other paths in this notebook on every platform.
    user2int_path=path.join(preTrainDir, "user2int.tsv"),
    # NOTE(review): presumably the fraction of the training behaviors held out
    # for validation — confirm against PrepBehaviorArgs.
    split_test_size=0.3,
    # NOTE(review): presumably the number of negative samples per positive
    # impression — confirm against PrepBehaviorArgs.
    n_negative=4,
)

In [12]:
# Run the behaviors.tsv preprocessing with the PrepBehaviorArgs currently bound to `args`.
prep_behavior.prep_behavior(args)

preparing training data


100%|██████████| 7/7 [00:00<00:00, 13649.52it/s]


preparing eval data


100%|██████████| 3/3 [00:00<00:00, 2989.53it/s]
100%|██████████| 7/7 [00:00<?, ?it/s]


preparing eval data


100%|██████████| 5/5 [00:00<00:00, 9967.45it/s]


## 2. Train/Test 데이터셋 news.tsv 전처리

### 생성되는 파일
##### Train
1. parsed_news.tsv
2. category2int.tsv
3. embedding_weights.csv
4. word2int.tsv
##### Test
1. parsed_news.tsv
2. embedding_weights.csv
3. word2int.tsv

In [17]:
# args for parse_news_combined.py
args = PrepNewsArgs(
    train_news_path=trainNewsPath,
    test_news_path=testNewsPath,
    train_out_dir=preTrainDir,
    test_out_dir=preTestDir,
    word_embedding_path=wordEmbeddingPath,
    word_embedding_npy_path=wordEmbeddingNpyPath,
    word_embedding_tokens_path=wordEmbeddingTokensPath,
    max_title=20,
    max_abstract=50,
)

In [18]:
# Prepare embeddings/vocab: load the word embeddings from the .npy file and its
# companion tokens file.
embeddings = prep_news.load_word_embeddings_by_npy(args.word_embedding_npy_path, args.word_embedding_tokens_path)

load word-embeddings


100%|██████████| 2196017/2196017 [00:03<00:00, 582444.32it/s]


In [19]:
# Run the news.tsv preprocessing with the PrepNewsArgs bound to `args` and the
# word embeddings loaded earlier.
prep_news.prep_news(args, embeddings)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


preparing/processing train news content


Device set to use cuda:0
  1%|          | 10/938 [00:00<01:01, 15.03it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 938/938 [00:07<00:00, 122.32it/s]
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


preparing/processing test news content


Device set to use cuda:0
100%|██████████| 443/443 [00:03<00:00, 134.07it/s]


## 기타 테스트 코드

In [20]:
# Sanity check of the loaded embeddings: container type, number of entries,
# and the type and value of the entry stored under "hello".
print(
    type(embeddings),
    len(embeddings),
    type(embeddings["hello"]),
    embeddings["hello"],
    sep="\n",
)

<class 'dict'>
2196017
<class 'numpy.ndarray'>
[ 0.25233    0.10176   -0.67485    0.21117    0.43492    0.16542
  0.48261   -0.81222    0.041321   0.78502   -0.077857  -0.66324
  0.1464    -0.29289   -0.25488    0.019293  -0.20265    0.98232
  0.028312  -0.081276  -0.1214     0.13126   -0.17648    0.13556
 -0.16361   -0.22574    0.055006  -0.20308    0.20718    0.095785
  0.22481    0.21537   -0.32982   -0.12241   -0.40031   -0.079381
 -0.19958   -0.015083  -0.079139  -0.18132    0.20681   -0.36196
 -0.30744   -0.24422   -0.23113    0.09798    0.1463    -0.062738
  0.42934   -0.078038  -0.19627    0.65093   -0.22807   -0.30308
 -0.12483   -0.17568   -0.14651    0.15361   -0.29518    0.15099
 -0.51726   -0.033564  -0.23109   -0.7833     0.018029  -0.15719
  0.02293    0.49639    0.029225   0.05669    0.14616   -0.19195
  0.16244    0.23898    0.36431    0.45263    0.2456     0.23803
  0.31399    0.3487    -0.035791   0.56108   -0.25345    0.051964
 -0.10618   -0.30962    1.0585    -0.42