# Data Processing

## 1. Global Settings

순서 파악하기 (아래 명령을 위에서부터 순서대로 실행)

python parse_behavior.py --in-file "$datasetTrainPath/behaviors.tsv" --out-dir "$preTrainPath" --mode train

python parse_behavior.py --in-file "$datasetTestPath/behaviors.tsv" --out-dir "$preTestPath" --mode test --user2int "$preTrainPath/user2int.tsv"

python parse_news.py --in-file "$datasetTrainPath/news.tsv" --out-dir "$preTrainPath" --mode train --word-embeddings "$wordEmbeddingPath/glove.840B.300d.txt"

python parse_news.py --in-file "$datasetTestPath/news.tsv" --out-dir "$preTestPath" --mode test --word-embeddings "$wordEmbeddingPath/glove.840B.300d.txt" --embedding-weights "$preTrainPath/embedding_weights.csv" --word2int "$preTrainPath/word2int.tsv" --category2int "$preTrainPath/category2int.tsv"

parse_behavior.py 먼저 진행

parse_news.py는 GloVe 임베딩 파일(glove.840B.300d.txt)이 필요

In [None]:
from sklearn.model_selection import train_test_split
import argparse
import parse_behavior

In [None]:
# Dataset variant to process; it names the sub-directory used under both
# the raw MIND/ tree and the processed/ output tree.
size = "demo"

# Raw MIND dataset locations.
datasetPath = "MIND/" + size
datasetTrainPath = datasetPath + "/train"
datasetTestPath = datasetPath + "/test"
trainBehaviorsPath = datasetTrainPath + "/behaviors.tsv"
testBehaviorsPath = datasetTestPath + "/behaviors.tsv"
wordEmbeddingPath = "word_embeddings"

# Output locations for the preprocessed train/test data.
processedDataPath = "processed"
preTrainPath = "/".join((processedDataPath, size, "train"))
preTestPath = "/".join((processedDataPath, size, "test"))

# Path of the user -> index map inside the train output directory.
user2int = preTrainPath + "/user2int.tsv"

# Pre-trained GloVe vectors and the word -> index map path.
word_embeddings = wordEmbeddingPath + "/glove.840B.300d.txt"
word2int = preTrainPath + "/word2int.tsv"

#### 1. python parse_behavior.py --in-file "$datasetTrainPath/behaviors.tsv" --out-dir "$preTrainPath" --mode train

In [None]:
# Preprocess the training behaviors.tsv.

# Stand-in for the command-line arguments parse_behavior.py would receive.
args = argparse.Namespace(
    in_file=trainBehaviorsPath,
    split=0.1,
    out_dir=preTrainPath,
)

# Read with an explicit encoding so decoding does not depend on the platform
# locale, and close the file before the (potentially long) processing starts.
with open(args.in_file, 'r', encoding='utf-8') as trainBehaviors:
    behavior = trainBehaviors.readlines()

if args.split == 0:
    # No validation hold-out: every line is used as training data.
    parse_behavior.generate_training_data(behavior, args.out_dir)
else:
    # Hold out a fraction `split` of the lines for validation; the fixed
    # random_state keeps the split reproducible between runs.
    train_behavior, val_behavior = train_test_split(
        behavior, test_size=args.split, random_state=1234
    )
    # The mapping returned for the training part is passed on to the
    # validation pass so both use the same user indices.
    user2int = parse_behavior.generate_training_data(train_behavior, args.out_dir)
    parse_behavior.generate_eval_data(val_behavior, args.out_dir, "val_behavior.tsv", user2int)

#### 2. python parse_behavior.py --in-file "$datasetTestPath/behaviors.tsv" --out-dir "$preTestPath" --mode test --user2int "$preTrainPath/user2int.tsv"

In [None]:
# Preprocess the test behaviors.tsv.
import os

# Stand-in for the command-line arguments parse_behavior.py would receive.
args = argparse.Namespace(
    in_file=testBehaviorsPath,
    user2int=f"{preTrainPath}/user2int.tsv",
    # The test output belongs in the test directory, matching the documented
    # command (--out-dir "$preTestPath"); it was previously written into the
    # train directory.
    out_dir=preTestPath,
)

# No earlier cell writes to the test output directory, so make sure it exists.
os.makedirs(args.out_dir, exist_ok=True)

# The user -> index map is loaded from the train output directory.
user2int = parse_behavior.load_idx_map_as_dict(args.user2int)

# Read with an explicit encoding so decoding does not depend on the platform
# locale, and close the file before processing.
with open(args.in_file, 'r', encoding='utf-8') as in_file:
    behavior = in_file.readlines()

parse_behavior.generate_eval_data(behavior, args.out_dir, "test_behavior.tsv", user2int)

#### 3. python parse_news.py --in-file "$datasetTrainPath/news.tsv" --out-dir "$preTrainPath" --mode train --word-embeddings "$wordEmbeddingPath/glove.840B.300d.txt"

In [None]:
import csv
from os import path

from nltk.tokenize import word_tokenize
from tqdm import tqdm
from transformers import pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import parse_news

In [None]:
# Preprocess the training news.tsv (notebook version of the parse_news.py
# command shown in the heading of this section).

# Stand-in for the command-line arguments parse_news.py would receive.
args = argparse.Namespace(
    in_file=f"{datasetTrainPath}/news.tsv",
    out_dir=preTrainPath,
    mode="train",
    word_embeddings=f"{wordEmbeddingPath}/glove.840B.300d.txt",
    # NOTE(review): 20 / 50 are assumed to match the parse_news.py defaults
    # for --max-title / --max-abstract -- confirm before relying on them.
    max_title=20,
    max_abstract=50,
    # Only read when mode != "train".
    category2int=f"{preTrainPath}/category2int.tsv",
    word2int=f"{preTrainPath}/word2int.tsv",
    embedding_weights=f"{preTrainPath}/embedding_weights.csv",
)


def _category_to_id(name, category2int, is_train):
    """Return the id of `name`; unseen names are registered in train mode and map to 0 otherwise."""
    if name in category2int:
        return category2int[name]
    if not is_train:
        return 0
    category2int[name] = len(category2int) + 1
    return category2int[name]


def _encode_text(text, max_length, embeddings, word2int, embedding_weights):
    """Tokenize `text` and return exactly `max_length` space-separated word ids.

    Tokens without a pre-trained embedding are skipped. Tokens seen for the
    first time are appended to `word2int` / `embedding_weights` in place.
    The id list is cropped to `max_length` or right-padded with "0".
    """
    word_idxs = []
    for token in word_tokenize(text.strip().lower()):
        if token not in embeddings:
            continue
        if token not in word2int:
            word2int[token] = str(len(word2int) + 1)
            embedding_weights.append(embeddings[token])
        word_idxs.append(word2int[token])
    word_idxs = word_idxs[:max_length]
    word_idxs += ["0"] * (max_length - len(word_idxs))
    return " ".join(word_idxs)


# NOTE(review): process_word_embeddings, load_idx_map_as_dict and
# load_embedding_weights are assumed to be defined in parse_news (this cell
# mirrors that script) -- confirm the names.
# prep embeddings/vocab
embeddings = parse_news.process_word_embeddings(args.word_embeddings)

is_train = args.mode == "train"
# max title/abstract length
max_title_length = int(args.max_title)
max_abstract_length = int(args.max_abstract)

if is_train:
    category2int = {}
    word2int = {}
    embedding_weights = []
else:
    category2int = parse_news.load_idx_map_as_dict(args.category2int)
    word2int = parse_news.load_idx_map_as_dict(args.word2int)
    embedding_weights = parse_news.load_embedding_weights(args.embedding_weights)

# sentiment analyzers
dsb_sentiment_classifier = pipeline('sentiment-analysis')
vader_sentiment_classifier = SentimentIntensityAnalyzer()

# parse news
with open(args.in_file, 'r', encoding='utf-8') as in_file:
    news_collection = in_file.readlines()

news2int = {}
print("preparing/processing news content")
with open(path.join(args.out_dir, 'parsed_news.tsv'), 'w', encoding='utf-8', newline='') as news_file:
    news_writer = csv.writer(news_file, delimiter='\t')
    for news in tqdm(news_collection):
        # Strip only the newline: a bare strip() would also drop trailing
        # empty tab-separated fields and break the 8-way unpacking.
        newsid, category, subcategory, title, abstract, _, _, _ = news.rstrip("\n").split("\t")
        # Each news id is processed once; repeated rows are skipped.
        if newsid in news2int:
            continue
        news2int[newsid] = len(news2int) + 1

        # Categories and subcategories share one id space (category2int).
        category_id = _category_to_id(category, category2int, is_train)
        subcategory_id = _category_to_id(subcategory, category2int, is_train)

        # The title is encoded before the abstract so word ids are assigned
        # in the same order as before.
        title_word_idxs_str = _encode_text(
            title, max_title_length, embeddings, word2int, embedding_weights
        )
        abstract_word_idxs_str = _encode_text(
            abstract, max_abstract_length, embeddings, word2int, embedding_weights
        )

        # calc sentiment scores
        # vader
        vs = vader_sentiment_classifier.polarity_scores(title.strip())
        vader_sentiment = vs['compound']
        # bert: read the result by key instead of relying on dict value order
        dsbs_result = dsb_sentiment_classifier(title.strip())[0]
        dsbs_label = dsbs_result['label']
        dsbs_score = dsbs_result['score']
        # Turn (label, confidence) into one signed score: positive labels
        # give 2*score - 1, all other labels give 1 - 2*score.
        if dsbs_label == "POSITIVE":
            bert_sentiment = (1-dsbs_score)*(-1) + dsbs_score
        else:
            bert_sentiment = (dsbs_score)*(-1) + (1-dsbs_score)

        # prepare output
        news_writer.writerow([
            newsid,
            category_id,
            subcategory_id,
            title_word_idxs_str,
            abstract_word_idxs_str,
            vader_sentiment,
            bert_sentiment
        ])

# The category map is only written in train mode; the word map and the
# embedding weights are written in every mode.
if is_train:
    with open(path.join(args.out_dir, 'category2int.tsv'), 'w', encoding='utf-8', newline='') as file:
        cat_writer = csv.writer(file, delimiter='\t')
        for key, value in category2int.items():
            cat_writer.writerow([key, value])
with open(path.join(args.out_dir, 'word2int.tsv'), 'w', encoding='utf-8', newline='') as file:
    word_writer = csv.writer(file, delimiter='\t')
    for key, value in word2int.items():
        word_writer.writerow([key, value])
with open(path.join(args.out_dir, 'embedding_weights.csv'), 'w', encoding='utf-8', newline='') as file:
    # First line: an all-zero 300-value row for index 0 (the padding id).
    padding_weights = " ".join(["0"] * 300) + "\n"
    file.write(padding_weights)
    for weights in embedding_weights:
        file.write(weights)