In [18]:
import tensorflow as tf
import keras
import numpy as np
import pandas as pd
import re
from datetime import datetime
# import konlpy
from eunjeon import Mecab
from matplotlib import pyplot as plt

In [2]:
mecab = Mecab()

In [3]:
data = pd.read_csv("data/train.csv", encoding="euc-kr")

In [4]:
data.head()

Unnamed: 0,id,year_month,text,smishing
0,0,2017-01,XXX은행성산XXX팀장입니다.행복한주말되세요,0
1,1,2017-01,오늘도많이웃으시는하루시작하세요XXX은행 진월동VIP라운지 XXX올림,0
2,2,2017-01,안녕하십니까 고객님. XXX은행입니다.금일 납부하셔야 할 금액은 153600원 입니...,0
3,4,2017-01,XXX 고객님안녕하세요XXX은행 XXX지점입니다지난 한 해 동안 저희 XXX지점에 ...,0
4,5,2017-01,1월은 새로움이 가득XXX입니다.올 한해 더 많이행복한 한해되시길바랍니다,0


In [9]:
print("Data length: {}".format(len(data)))
print("Data shape {}".format(data.shape))
df = data

Data length: 295945
Data shape (295945, 4)


In [6]:
def after_effect(arr):
    words_in_brackets = ['광고']
    ret = []
    idx = 0
    while idx < len(arr):
        if arr[idx] == '(' and idx + 2 < len(arr) and arr[idx + 2] == ')':
            if arr[idx + 1] in '일월화수목금토':
                if len(ret) > 0 and ret[-1] == 'X':
                    ret.pop()
                    ret.append('X일')
                ret.append('(요일)')
                idx += 2
            elif arr[idx + 1] in words_in_brackets:
                ret.append('(' + arr[idx + 1] + ')')
                idx += 2
            else:
                buff = re.sub('[0-9.X]+', 'X', arr[idx])
                buff = re.sub('[-]+', '-', buff)
                buff = re.sub('[X]+', 'X', buff)
                ret.append(buff)
        else:
            buff = re.sub('[0-9.X]+', 'X', arr[idx])
            buff = re.sub('[-]+', '-', buff)
            buff = re.sub('[X]+', 'X', buff)
            ret.append(buff)
        idx += 1
    return ret

In [7]:
def parse_sentence(sentence):
    prev_word, prev_pos = '', ''
    nouns = []
    words = []
    adverbs = []
    etc = []
    buff = ''
    ret = []
    condition = False
    idx, condidx = 0, 0
    
    for word, pos in mecab.pos(sentence):
        # S인 경우 '() 각각 있는 경우 나누어서 여러개의 S로 처리'
        if pos in ('SF') or pos[:1] in ('J'):
            # 은/는/이/가/. 등
            prev_word, prev_pos = '', ''
            continue

        if pos[:1] == 'N' and prev_pos[:1] == 'S':
            # 1억, 1천만원 등
            word = re.sub('[억천만]', '', word)
            if len(word) == 0:
                continue

#         if pos[:1] == 'S':
#             word = re.sub('[(:)]', '', word)
#             word = re.sub('[0-9.X]+', 'X', word)

        # 조건절 판단
        if prev_pos[-3:] == 'ETM' and word in ['분','고객','개인','사업자','이','대상','당신','직원','VIP', '자','분도','전문직','신용자','외국인','본인','임직원','부모','회계사','귀하','투자자','분과']:
            condition = True  
        elif pos[:1] == 'N':
            if word[:2] in ('경우', '필요', '라면', '다면'): # 경우, 필요 단어 등장 시 조건절로 판단
                condition = True
        elif pos[-2:] == 'EC':
            # 여도, 라도, 라면, 면
            if word[-1:] in ('면', '도') or word[-2:] in ('도록'):
                condition = True

        if pos[:1] == 'E':
            buff += word
        #elif pos[:1] == 'N' and prev_pos[:1] == 'N' and len(word) == 1:
        #    buff += word
        elif pos[:1] == 'S' and prev_pos[:1] == 'S' and word[:1] in 'X0123456789%.-': #('%', '.', '-'):
            pos_x = re.search('[0-9X]', buff)
            pos_x = pos_x.span()[0] if pos_x is not None else len(buff)
            if pos_x > 0:
                words.append(buff[:pos_x])
                buff = buff[pos_x:]
            
            # 중간에 %가 있는 경우
            if len(buff) > 0:
                pos_pct = re.search('%', buff)
                pos_pct = (pos_pct.span()[0] + 1) if pos_pct is not None else 0
                if pos_pct > 0:
                    words.append(buff[:pos_pct])
                    buff = buff[pos_pct:]
                
            buff += word

        elif pos[:1] in ('N', 'S') and prev_pos[:1] == 'S' and len(buff) > 0:
            # 숫자 앞에 문자가 있는 경우
            pos_x = re.search('[0-9X]', buff)
            pos_x = pos_x.span()[0] if pos_x is not None else len(buff)
            if pos_x > 0:
                words.append(buff[:pos_x])
                buff = buff[pos_x:]

            # 중간에 %가 있는 경우
            if len(buff) > 0:
                pos_pct = re.search('%', buff)
                pos_pct = (pos_pct.span()[0] + 1) if pos_pct is not None else 0
                if pos_pct > 0:
                    words.append(buff[:pos_pct])
                    buff = buff[pos_pct:]

            if word in ('시', '분', '초', '시간'):
                buff = re.sub('시', '', buff)
                buff += '시'
            elif word in ('년', '월', '일', '개월'):
                buff = re.sub('일', '', buff)
                buff += '일'
            elif word in ('원'):
                buff = re.sub('원', '', buff)
                buff += '원'
                prev_word, prev_pos = word, pos
            elif word in ('배'):
                buff += '%'
                prev_word, prev_pos = word, pos
            elif word in ('kg', 'pt', '형'):
                buff += '단위'
                prev_word, prev_pos = word, 'N' #pos
            elif word in ('건', '종', '대'):
                buff += '개'
                prev_word, prev_pos = word, pos
            else:
                if len(buff) > 0:
                    words.append(buff)
                buff = word
        else:
            if len(buff) > 0:
                # 중간에 %가 있는 경우
                if len(buff) > 0:
                    pos_pct = re.search('%', buff)
                    pos_pct = (pos_pct.span()[0] + 1) if pos_pct is not None else 0
                    if pos_pct > 0:
                        words.append(buff[:pos_pct])
                        buff = buff[pos_pct:]
                if True: #buff != 'X':
                    words.append(buff)
                    # M, N 경우만 저장해보자
                    if prev_pos[:1] in ('N'):  # 'M', 
                        nouns.append(buff)
                    elif prev_pos[:1] in ('M'):
                        adverbs.append(buff)
            buff = word
            prev_word, prev_pos = word, pos

        if pos[-2:] in ('EF'): # and word[0][-1:] in ['요', '다']:
            # words가 꼭 명사를 뜻하는 게 아니고 특색있는 키워드 모두를 포함
            if len(buff) > 0:
                words.append(buff)
            buff = ''
            # 문장의 끝을 구분
            #words.append('(절취선)')
            
            words = after_effect(words)
            ret.append([word, condition, words, nouns, adverbs])
            idx += 1
            if condition: condidx += 1
            nouns = []
            words = []
            adverbs = []
            condition = False
    if len(buff) > 0:
        words.append(buff)
    if len(words) > 0:
        words = after_effect(words)
        ret.append([word, condition, words, nouns, adverbs])
        idx += 1
        if condition: condidx += 1
    etc.append(condidx / idx)
    return ret, etc

In [None]:
smishings = []
normals = []
idx = 0
for idx, item in df.iterrows():
    #print(item.text)
    splited, etc = parse_sentence(item.text)
    etc.append(np.log(len(item.text))/8)  # 문장 전체의 길이
    etc.append(np.log(len(splited))/4)  # 문장의 개수
    splited.append(etc)
    splited.append(item.id)
    #splited = m.parse(item.text).split()
    if item.smishing == 1:
        smishings.append(splited)
    else:
        normals.append(splited)
    if idx % 100000 == 0:
        print(datetime.now(), idx)
    idx += 1
#     if idx > 20000:
#         break

2020-03-10 17:52:04.614397 0


In [None]:
print(f'namals shape : {normals.shape}')
print(f'smishings shape : {smishings.shape}')

In [22]:
# 스미싱에만 등장하는 noun들 : smish_target_words
NORM_THRESHOLD, SMISH_THRESHOLD = 500, 2
POS_IDX_IN_ARR = 3
IDX_TO_CUT = -2
norm_dict = {}
for item in [x for normal in normals for row in normal[:IDX_TO_CUT] for x in row[POS_IDX_IN_ARR]]:
    if item in norm_dict:
        norm_dict[item] += 1
    else:
        norm_dict[item] = 1
norm_filter_words = {x:norm_dict.get(x) for x in norm_dict if norm_dict.get(x) > NORM_THRESHOLD}

smish_dict = {}
for item in [x for smishing in smishings for row in smishing[:IDX_TO_CUT] for x in row[POS_IDX_IN_ARR]]:
    if item in smish_dict:
        smish_dict[item] += 1
    else:
        smish_dict[item] = 1
smish_filter_words = {x:smish_dict.get(x) for x in smish_dict if smish_dict.get(x) > SMISH_THRESHOLD}
        
word_only_in_smishings = [word for word in smish_dict if word not in norm_filter_words]
word_only_in_normals = [word for word in norm_dict if word not in smish_filter_words]

norm_target_words = {x:norm_dict.get(x) for x in word_only_in_normals if norm_dict.get(x) > NORM_THRESHOLD}
smish_target_words = {x:smish_dict.get(x) for x in word_only_in_smishings if smish_dict.get(x) > SMISH_THRESHOLD}
print('selected normal-like words count:', len(norm_target_words))
print('selected smishing-like words count:', len(smish_target_words))

smishing_word_cnt, smishing_noword_cnt = [], []
normal_word_cnt, normal_noword_cnt = [], []
for smishing in smishings:
    notarget_word_list = [x for row in smishing[:IDX_TO_CUT] for x in row[POS_IDX_IN_ARR] if x in norm_target_words]
    target_word_list = [x for row in smishing[:IDX_TO_CUT] for x in row[POS_IDX_IN_ARR] if x in smish_target_words]
    smishing.insert(-2, notarget_word_list)
    smishing.insert(-2, target_word_list)
    smishing_noword_cnt.append(len(notarget_word_list))
    smishing_word_cnt.append(len(target_word_list))
for normal in normals:
    notarget_word_list = [x for row in normal[:IDX_TO_CUT] for x in row[POS_IDX_IN_ARR] if x in norm_target_words]
    target_word_list = [x for row in normal[:IDX_TO_CUT] for x in row[POS_IDX_IN_ARR] if x in smish_target_words]
    normal.insert(-2, notarget_word_list)
    normal.insert(-2, target_word_list)
    normal_noword_cnt.append(len(notarget_word_list))
    normal_word_cnt.append(len(target_word_list))

plt.figure(figsize=(20,5))
plt.subplot(1,2,1)
plt.plot(normal_noword_cnt)
plt.plot(smishing_noword_cnt)

plt.subplot(1,2,2)
plt.plot(smishing_word_cnt)
plt.plot(normal_word_cnt)

IndexError: list index out of range