## 2. Preparing learning data

In [1]:
import os
# regular expression library import
import re

from sklearn import datasets, model_selection
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# morpho-analyzer library
from konlpy.tag import Hannanum
from konlpy.tag import Kkma

import pandas as pd

import numpy as np

In [4]:
dir_prefix = './data/'
target_dir = 'HKIB-20000'
cat_dirs = ['health', 'economy', 'science', 'education', 'culture', 'society', 'industry', 'leisure', 'politics']
cat_prefixes = ['건강', '경제', '과학', '교육', '문화', '사회', '산업', '여가', '정치']

In [5]:
# data mashup
files = os.listdir(dir_prefix + target_dir)

# Process 5 separate text files
for file in files:
    # Process only files with data
    if not file.endswith('.txt'):
        continue
    
    # Process each text file
    with open(dir_prefix + target_dir + '/' + file) as currfile:
        doc_cnt = 0
        docs = []
        curr_doc = None
        
        # Generate a list by dividing it into articles
        for curr_line in currfile:
            if curr_line.startswith('@DOCUMENT'):
                if curr_doc is not None:
                    docs.append(curr_doc)
                curr_doc = curr_line
                doc_cnt = doc_cnt + 1
                continue
            curr_doc = curr_doc + curr_line
        
        # Categorize each article by subject and organize it into a file by article
        for doc in docs:
            doc_lines = doc.split('\n')
            doc_no = doc_lines[1][9:]
            
            # Extract Topic
            doc_cat03 = ''
            for line in doc_lines[:10]:
                if line.startswith("#CAT'03:"):
                    doc_cat03 = line[10:]
                    break
            
            # Organize directories by extracted subject
            for cat_prefix in cat_prefixes:
                if doc_cat03.startswith(cat_prefix):
                    dir_index = cat_prefixes.index(cat_prefix)
                    break
                    
            # Remove document information and leave only the body of the article
            filtered_lines = []
            for line in doc_lines:
                if not (line.startswith('#') or line.startswith('@')):
                    filtered_lines.append(line)
                    
            # Write articles to a file in a topic-specific directory
            filename = 'hkib-' + doc_no + '.txt'
            filepath = dir_prefix + target_dir + '/' + cat_dirs[dir_index]
            
            if not os.path.exists(filepath):
                os.makedirs(filepath)
            f = open(filepath + '/' + filename, 'w')
            f.write('\n'.join(filtered_lines))
            f.close()

In [6]:
# Select subject folder to be covered (education, health)
#dirs = ['economy', 'society']
dirs = ['education', 'health']

# Create a list to store the words and labels that appear in the article
# Description variable
x_ls = [] 
# Objective variable
y_ls = []

tmp1 = []
tmp2 = ''

# Create a morpho analyzer Object
#tokenizer_han = Hannanum()
tokenizer = Kkma()

# Reads the files in each folder one by one and saves them in the list after preprocessing
for i, d in enumerate(dirs):
    # Retrieve file list
    files = os.listdir(dir_prefix + target_dir + '/' + d)
    
    for file in files:
        # Read each file
        f = open(dir_prefix + target_dir + '/' + d + '/' + file, 'r', encoding='utf-8')
        raw = f.read()  
        
        # Use regular expressions to remove unnecessary strings and then output the contents of the file
        reg_raw = re.sub(r'[-\'@#:/◆▲0-9a-zA-Z<>!-"*\(\)]', '', raw)
        reg_raw = re.sub(r'[ ]+', ' ', reg_raw)
        reg_raw = reg_raw.replace('\n', ' ')
        
        # Generate a list of nouns after morpho analysis
        tokens = tokenizer.nouns(reg_raw)
        
        for token in tokens:
            tmp1.append(token)
            
        tmp2 = ' '.join(tmp1)
        x_ls.append(tmp2)
        tmp1 = []
        
        # Save article topic labels to list
        y_ls.append(i)
        
        # Close File
        f.close()   

In [7]:
# Convert to a data frame and print description variables on the screen
pd.DataFrame(x_ls).tail(10)

Unnamed: 0,0
991,내년 월 호골 원료 신경통 치료제 판매 전면 금지 보사 보사부 부 일 멸종 멸종위기...
992,월일 술 담배 세미 판매 판매금지표시 금지 표시 모든 미만 시행 국무총리 소속 청소...
993,한방 국민 관심 병 의원 백 백여개 여 개 이상 일 의료 의료보험관리공단 보험 관리...
994,후의 한국 한국골프 골프 생각 그때 골프장 캐디 캐디백 백 전동 전동카골프 카 주류...
995,비아그라 한국인 겐 부작용 발기 발기부전 부전 치료제 임상 임상시험 시험 결과 평가...
996,원진 원진레이온 레이온 비상 비상대책위원회위원장 대책 위원회 위원장 박 박인도 인도...
997,보사부 일 콜레라 유입 유입주의보 주의보 국내 국내유입 주민 주민홍보 홍보 만전 일...
998,국내산 코카콜라 라이트 방부제 검출 국내 시판 시판중인 중인 일 식품 식품의약품안전...
999,밤 남편 성폭력 여성 성기 유죄 무죄 미국 절단 로 로리 리 보 보비트 비트 법정 ...
1000,우리 우리나라 나라 형 간염 보균자 수 백만 백만명정도 명 정도 성인 약 소아 항원...


In [8]:
# Print words extracted from the first article
print(x_ls[0])

학년도 대입 대입원서접수 원서 접수 마감 마감결과 결과 일부 중위 대학 수험생 동점 동점자가 자가 다수 발생 예상 가운데 동 동점자 점자 처리 처리기준 기준 이 이입시방법 입시 방법 만큼 전망 일 본고사 경우 수능 수능성적 성적 내신 시하 일부 점수 세분화 계열 특정 특정영역 영역 득점자 우선적 선발 반면 본고사실시 실시 우선 수능내신연소자 연소자 순 처리키 키 대의 평균 평균경쟁률 경쟁률 기록 속출 세종 세종대 대 내신성적 뒤 졸업 졸업년도 년도 생년월일 천 천백명 백 명 정원 만 만천백명 지원 동국대 수능시험 시험 총점 교과 교과성적 월 졸업예정자 예정자 출석 우선내신 출석성적 내신행동 행동 발달 특별 봉사 봉사활동나이연소자 활동 나이 등 한편 지원자 건국 건국대 홍익 홍익대 국민 국민대등 대등 자 합격 비 실시대학 본고사성적 합격자 고려대 제지 제지망 망 본고사성적예 예 체능 실기 실기고사 고사 우수자 강대 수능성적등 중시 단계 마련 편 연세대 석차 석차백분율 백분율 대외비 서울대 다음 미 미실시 내신행동발달 수능내신수능 점수인문 인문 외국어 외국어영역 자연 자연수리탐구영역 수리 탐구 사범 계인 적성 적성검사 적성검사졸업년도 검사 국민대 저 지망 지망수능수능 점수인문언어 언어 외국어영역내신연소자 수능수능 점수인문언어영역 영역내신연소자 단국 단국대 인 인문계열수능 문 외국어언어 탐구내신 자연계열수능수리 내신연소자 예체능 계열실기 탐구내신연소자 본고사수능 본고사중 중 국어 국어과목본고사중 과목 영어 영어인문사회계열 사회 수학 수학자연계열 내신총점 본고사내신석차 한성 한성대 본고사수능내신연소자


In [9]:
# print objective variables
print(y_ls)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [10]:
# Converts into NumPy arrays
x_array = np.array(x_ls)
y_array = np.array(y_ls)

# Count the number of times a word appears
cntvec = CountVectorizer()
x_cntvecs = cntvec.fit_transform(x_array)
x_cntarray = x_cntvecs.toarray()

# print the number of times a word appears in a data frame
pd.DataFrame(x_cntarray)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33562,33563,33564,33565,33566,33567,33568,33569,33570,33571
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Show words and their indexes
for k, v in sorted(cntvec.vocabulary_.items(), key=lambda x:x[1]):
    k, v
print(k, v)

힐탑호텔 33571


In [14]:
# Calculate TF-IDF for words
tfidf_vec = TfidfVectorizer(use_idf=True)
x_tfidf_vecs = tfidf_vec.fit_transform(x_array)
x_tfidf_array = x_tfidf_vecs.toarray()

# print the number of times a word appears after converting to a dataframe
pd.DataFrame(x_tfidf_array)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33562,33563,33564,33565,33566,33567,33568,33569,33570,33571
0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.000000,0.000000,0.054548,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# data partitioning
train_X, test_X, train_Y, test_Y = model_selection.train_test_split(x_tfidf_array, y_array, test_size=0.2)

print(len(train_X))
print(len(test_X))

800
201


## 3. Make Tensor

In [16]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [17]:
# tensor of train data
train_X = torch.from_numpy(train_X).float()
train_Y = torch.from_numpy(train_Y).long()

# tensor of test data
test_X = torch.from_numpy(test_X).float()
test_Y = torch.from_numpy(test_Y).long()

print(train_X.shape)
print(train_Y.shape)

torch.Size([800, 33572])
torch.Size([800])


In [18]:
# Merge into one variable
train = TensorDataset(train_X, train_Y)

# Check the contents of the first data in the Tensor
print(train[0])

# Split into MiniBatch
train_loader = DataLoader(train, batch_size=100, shuffle=True)

(tensor([0., 0., 0.,  ..., 0., 0., 0.]), tensor(1))


## 4. Build NN

In [19]:
# build NN
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(33572, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 256)
        self.fc4 = nn.Linear(256, 128)
        self.fc5 = nn.Linear(128, 128)
        self.fc6 = nn.Linear(128, 2)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))
        x = self.fc5(x)
        return F.log_softmax(x, dim=1)
    
# make an instance
model = Net()

## 5. Model fitting

In [21]:
# loss func
criterion = nn.CrossEntropyLoss()

# optimization
optimizer = optim.Adam(model.parameters(), lr=0.005)

# fitting
for epoch in range(20):
    total_loss = 0
    
    # data call
    for train_x, train_y in train_loader:
        # Calc Graph Configuration
        train_x, train_y = Variable(train_x), Variable(train_y)
        # Slope Initialization
        optimizer.zero_grad()
        # calc FP
        output = model(train_x)
        # calc loss
        loss = criterion(output, train_y)
        # calc BP
        loss.backward()
        # update weights
        optimizer.step()
        # Calc cumulative losses
        total_loss += loss.item()
        
    # print cum_loss every epochs
    if (epoch+1) % 1 == 0:
        print(epoch+1, total_loss)

1 20.582783699035645
2 8.259650707244873
3 5.738945186138153
4 5.271221160888672
5 4.705459117889404
6 3.0320937037467957
7 0.6753186099231243
8 0.4252451930951793
9 0.2344477631850168
10 0.16901431512087584
11 0.11699942505219951
12 0.07776993128936738
13 0.0619757954555098
14 0.05174226237431867
15 0.05041250800786656
16 0.05083385190118861
17 0.05049786812878665
18 0.05071822925003744
19 0.04335879907443996
20 0.046638489860143295


In [22]:
# Calc Graph Configuration
test_x, test_y = Variable(test_X), Variable(test_Y)

# Have output to be 0 or 1
result = torch.max(model(test_x).data, 1)[1]

# measure accuracy of model
accuracy = sum(test_y.data.numpy() == result.numpy()) / len(test_y.data.numpy())

accuracy

0.9800995024875622