In [1]:
import pandas as pd
import pickle
import numpy as np

p_train = r'../data/ag_train.csv'
p_test = r'../data/ag_test.csv'
df_train = pd.read_csv(p_train, header=None)
df_test = pd.read_csv(p_test, header=None)
df_train.columns = ['labels', 'title', 'data']
df_test.columns = ['labels', 'title', 'data']
print(df_train.shape, df_test.shape)
df_train.head()

(120000, 3) (7600, 3)


Unnamed: 0,labels,title,data
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [2]:
df_train['raw'] = df_train['title'] + ' '+df_train['data']
df_test['raw'] = df_test['title'] + ' '+df_test['data']

# label이 1~4까지라서 => 0~3 까지로 변경해주어야 함
df_train['labels'] = df_train['labels'] - 1
df_test['labels'] = df_test['labels'] - 1

df_train

Unnamed: 0,labels,title,data,raw
0,2,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...",Wall St. Bears Claw Back Into the Black (Reute...
1,2,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,Carlyle Looks Toward Commercial Aerospace (Reu...
2,2,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,Oil and Economy Cloud Stocks' Outlook (Reuters...
3,2,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,Iraq Halts Oil Exports from Main Southern Pipe...
4,2,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...","Oil prices soar to all-time record, posing new..."
...,...,...,...,...
119995,0,Pakistan's Musharraf Says Won't Quit as Army C...,KARACHI (Reuters) - Pakistani President Perve...,Pakistan's Musharraf Says Won't Quit as Army C...
119996,1,Renteria signing a top-shelf deal,Red Sox general manager Theo Epstein acknowled...,Renteria signing a top-shelf deal Red Sox gene...
119997,1,Saban not going to Dolphins yet,The Miami Dolphins will put their courtship of...,Saban not going to Dolphins yet The Miami Dolp...
119998,1,Today's NFL games,PITTSBURGH at NY GIANTS Time: 1:30 p.m. Line: ...,Today's NFL games PITTSBURGH at NY GIANTS Time...


# Tokenizing 미리 해서 저장해두기

- train dataset에서 vocab 추출하고
- 추출한 vocab, token2idx로 train_data 바꿔치기 하고
- train_data의 max 길이로 패딩 (BasicCollator 참조)
- train_data랑, train_labels 묶어서 ag_train.pkl 로 저장
- 마찬가지로
- 추출한 vocab, token2idx로 test_data 바꿔치기 하고
- train_data의 max 길이로 패딩 (BasicCollator 참조)
- test_data, test_labels 묶어서 ag_test.pkl 로 저장

In [3]:
import collections
from typing import List, Tuple, Dict
from transformers import BertTokenizer


def build_tok_vocab(tokenize_target: List,
                    tokenizer,
                    min_freq: int = 1,
                    max_vocab=19998) -> Tuple[List[str], Dict]:
    vocab = []
    print('start tokenizing')
    for i, target in enumerate(tokenize_target):
        if i % 10000 == 0:
            print(i)
        try:
            temp = tokenizer.tokenize(target)
            vocab.extend(temp)
        except Exception as e_msg:
            error_target = f'idx: {i} \t target:{target}'

    print('start counting')
    vocab = collections.Counter(vocab)
    temp = {}
    # min_freq보다 적은 단어 거르기
    for key in vocab.keys():
        if vocab[key] >= min_freq:
            temp[key] = vocab[key]
    vocab = temp

    print('start sorting')
    # 가장 많이 등장하는 순으로 정렬한 후, 적게 나온것 위주로 vocab set에서 빼기
    vocab = sorted(vocab, key=lambda x: -vocab[x])
    if len(vocab) > max_vocab:
        vocab = vocab[:max_vocab]

    tok2idx = {'<pad>': 0, '<unk>': 1}
    for tok in vocab:
        tok2idx[tok] = len(tok2idx)
    vocab.extend(['<pad>', '<unk>'])
    print('tokenizing done')

    return vocab, tok2idx

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
data = [row['raw'] for i, row in df_train.iterrows()]
vocab_set, tok2idx = build_tok_vocab(data, tokenizer, min_freq=1, max_vocab=19998)
print(f'Vocab set size: {len(tok2idx)}')
vocab_set[0:5]

start tokenizing
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
start counting
start sorting
tokenizing done
Vocab set size: 20000


['.', 'the', ',', '-', 'to']

In [4]:
max_len = 0
for i in data:
    if len(i) > max_len:
        max_len = len(i)
print(f'max length: {max_len}')

tokenized_idx_data = []

for sentence in data:
    tokened_sentence = tokenizer.tokenize(sentence)
    token_list = []
    for word in tokened_sentence:
        if word not in tok2idx.keys():
            token_list.append(tok2idx['<unk>'])
        else:
            token_list.append(tok2idx[word])

    padding_list = [0] * (max_len - len(token_list))
    token_list = padding_list + token_list
    tokenized_idx_data.append(token_list)

print(len(tokenized_idx_data), len(tokenized_idx_data[0]))

max length: 1012
120000 1012


In [5]:
train_tokenized_idx = np.array(tokenized_idx_data)
train_labels_np = np.array(df_train['labels'])
train_data = (train_tokenized_idx, train_labels_np)
print(train_tokenized_idx.shape)
print(train_labels_np.shape)

print('now dumping pickle')
with open(file='ag_train.pkl', mode='wb') as f:
    pickle.dump(train_data, f)

(120000, 1012)
(120000,)
now dumping pickle


In [6]:
# - 마찬가지로 test 에 대해서도

test_data = [row['raw'] for i, row in df_test.iterrows()]

test_max_len = 0
for i in data:
    if len(i) > test_max_len:
        test_max_len = len(i)
print(f'test max length: {test_max_len}')
if test_max_len > max_len:
    print('test max length is bigger than train_max_len')

tokenized_idx_test_data = []

for sentence in test_data:
    tokened_sentence = tokenizer.tokenize(sentence)
    token_list = []
    for word in tokened_sentence:
        if word not in tok2idx.keys():
            token_list.append(tok2idx['<unk>'])
        else:
            token_list.append(tok2idx[word])

    padding_list = [0] * (max_len - len(token_list))
    token_list = padding_list + token_list
    tokenized_idx_test_data.append(token_list)

print(len(tokenized_idx_test_data), len(tokenized_idx_test_data[0]))

test max length: 1012
7600 1012


In [7]:
test_tokenized_idx = np.array(tokenized_idx_test_data)
test_labels_np = np.array(df_test['labels'])
test_data = (test_tokenized_idx, test_labels_np)
print(test_tokenized_idx.shape)
print(test_labels_np.shape)

print('now dumping test pickle')
with open(file='ag_test.pkl', mode='wb') as f:
    pickle.dump(test_data, f)

(7600, 1012)
(7600,)
now dumping test pickle
