In [10]:
import urllib.request
import re
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import glob
from tqdm import tqdm
import os
import time # 코드 실행시간 알림 용
from  tkinter import *
from tkinter import filedialog



In [1]:
def tokenizer(train_df,stopwords):
    """Turn the product names in ``train_df['goods_nm']`` into token lists.

    Every name is split with ``Okt.morphs(..., stem=True)``. Tokens found in
    ``stopwords`` are discarded (the membership check uses the token exactly
    as Okt produced it, i.e. before lower-casing) and the survivors are
    lower-cased.

    Args:
        train_df: Object whose ``'goods_nm'`` entry is an iterable of
            product-name strings.
        stopwords: Container of tokens to drop.

    Returns:
        A list holding one list of lower-cased tokens per product name.
    """
    analyzer = Okt()

    def _clean(name):
        # Morpheme split with stemming, followed by stopword removal.
        morphemes = analyzer.morphs(name, stem=True)
        return [m.lower() for m in morphemes if m not in stopwords]

    return [_clean(name) for name in tqdm(train_df['goods_nm'])]

In [1]:
def vocab_indexing(X_train):
    """Replace every token in ``X_train`` with its index from a vocab CSV.

    The vocab file is chosen through ``file_select("Vocab")`` and must have
    ``word`` and ``index`` columns. Tokens missing from the vocab are given
    the next free index (``len(vocab) + 2``, counted before the word is
    added), announced on stdout, and appended to the vocab, which is written
    back to the same file before returning.

    Fixes over the previous version:
      * ``DataFrame.append`` (removed in pandas 2.0) is replaced by a single
        ``pd.concat`` after the loop.
      * A new word now gets the same index in the saved vocab, in
        ``X_train`` and in the returned dict (previously the latter two were
        one higher than the value written to the file).
      * The vocab is turned into a dict once instead of being scanned for
        every token.
      * The per-word ``print(vocab_df.tail())`` debug output is gone.

    Args:
        X_train: List of token lists; it is modified in place.

    Returns:
        ``(X_train, X_dict)`` where ``X_dict`` maps each token that occurred
        in ``X_train`` to the index it was replaced with.
    """
    vocab_file = file_select("Vocab")
    vocab_df = pd.read_csv(vocab_file, sep=',')

    # First occurrence wins, matching the previous ``.values[0]`` lookup.
    word_to_index = {}
    for known_word, known_index in zip(vocab_df['word'], vocab_df['index']):
        word_to_index.setdefault(known_word, known_index)

    next_index = len(vocab_df) + 2
    new_rows = []
    X_dict = {}
    for tokens in tqdm(X_train):
        for position, word in enumerate(tokens):
            if word not in word_to_index:
                print("Vocab에 없는 단어 발견 : {} ".format(word))
                word_to_index[word] = next_index
                new_rows.append({'word': word, 'index': next_index})
                next_index += 1
            X_dict[word] = word_to_index[word]
            tokens[position] = word_to_index[word]

    if new_rows:
        vocab_df = pd.concat([vocab_df, pd.DataFrame(new_rows)], ignore_index=True)
    vocab_df.to_csv(vocab_file, sep=',', index=False)
    return(X_train, X_dict)

In [2]:
def text_preprocessing(train_df,):
    """Strip symbols and digits from product names and drop duplicate names.

    Applied to ``train_df['goods_nm']``:
      1. Each run of listed symbols, tabs, newlines, backslashes and literal
         backslash-``n`` sequences becomes a single space.
      2. Every digit becomes a space (one space per digit).
      3. Rows whose cleaned name duplicates an earlier row are dropped.

    The previous patterns ended their character classes with ``\\\\n``,
    which inside ``[...]`` means "a backslash or the letter n", so every
    ``n`` in a product name was erased. The letter is no longer touched;
    real newlines and the two-character text ``\\n`` are handled explicitly.

    Args:
        train_df: DataFrame with a ``goods_nm`` string column. It is modified
            in place (column rewritten, duplicate rows removed).

    Returns:
        The same ``train_df`` object.
    """
    names = train_df['goods_nm']
    # The ``\\n`` alternative comes first so a literal backslash-n pair is
    # consumed as a unit before the class can match the backslash alone.
    names = names.str.replace(
        r'(?:\\n|[-=+★,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》\n\t])+',
        " ",
        regex=True,
    )
    names = names.str.replace(r'[0-9]', " ", regex=True)  # remove digits
    train_df['goods_nm'] = names
    train_df.drop_duplicates(subset=['goods_nm'], inplace=True)
    return(train_df)

In [2]:
def Labeling(y_train):
    """Map product-category names to the integer labels stored in ``label.csv``.

    ``label.csv`` is read from the current working directory and must have
    ``prices_prdlst_nm`` and ``Label`` columns. When a name occurs more than
    once in the file, its first row wins.

    Fixes over the previous version:
      * ``y_train`` is walked by position. The old ``y_train[i]`` was a label
        lookup on a Series, so after ``drop_duplicates`` (gaps in the index)
        rows were lost through the caught ``KeyError``.
      * Every row of the label file is considered, not only the first
        ``n_unique`` rows.
      * The lookup is a dict instead of a nested scan.
      * Names without a label are still skipped (so the result can be shorter
        than ``y_train``), but a warning now reports how many.

    Args:
        y_train: Iterable of category names (list, array or Series).

    Returns:
        ``numpy.ndarray`` with the labels of the names that were found, in
        input order.
    """
    import warnings

    Label = "label.csv"
    Label_df = pd.read_csv(Label, sep=',')

    label_of = {}
    for name, label in zip(Label_df['prices_prdlst_nm'], Label_df['Label']):
        label_of.setdefault(name, label)

    yy_train = []
    missing = 0
    for name in tqdm(list(y_train)):
        if name in label_of:
            yy_train.append(label_of[name])
        else:
            missing += 1

    if missing:
        warnings.warn(
            "{} value(s) of y_train have no entry in {} and were skipped".format(missing, Label)
        )
    return(np.array(yy_train))

In [3]:

def Labeling2(y_train):
    """Map category names to labels using the third column of ``label.csv``.

    ``label.csv`` is read from the current working directory. Names are
    matched against its ``prices_prdlst_nm`` column and the label is taken
    from the column at position 2 (as before). When a name occurs more than
    once in the file, its first row wins. Names without a match are reported
    on stdout and skipped, so the result can be shorter than ``y_train``.

    Fixes over the previous version:
      * No bare ``except``: a missing name is detected explicitly and other
        errors (e.g. a label file with fewer than three columns) surface
        instead of being printed once per row.
      * The label column is read with ``.iloc[:, 2]`` rather than the
        deprecated positional ``row[2]`` lookup on a Series.
      * ``y_train`` is walked by position, so a Series whose index has gaps
        no longer fails.
      * The dump of the whole label frame and the unused ``prdlist`` are gone.

    Args:
        y_train: Iterable of category names (list, array or Series).

    Returns:
        ``numpy.ndarray`` with the labels of the names that were found, in
        input order.
    """
    Label = "label.csv"
    Label_df = pd.read_csv(Label, sep=',')

    label_of = {}
    for name, label in zip(Label_df['prices_prdlst_nm'], Label_df.iloc[:, 2]):
        label_of.setdefault(name, label)

    yy_train = []
    for position, name in enumerate(tqdm(list(y_train))):
        if name in label_of:
            yy_train.append(label_of[name])
        else:
            print("값은 {}이고 Label은 {}이다.".format(position, name))
    return(np.array(yy_train))

In [20]:
def folder_select_to_csv() :
    """Ask for a folder, merge all of its CSV files and return two columns.

    Every ``*.csv`` in the chosen folder is read and concatenated vertically.
    The merged table is saved in that folder as ``<time.time()>result.csv``
    and read back, and only the ``goods_nm`` and ``prices_prdlst_nm`` columns
    are returned.

    Fixes over the previous version:
      * Merged outputs from earlier runs (``<number>result.csv``) are left
        out of the merge; they sit in the folder being globbed, so each rerun
        used to duplicate every row.
      * Cancelling the dialog raises ``ValueError`` instead of globbing the
        working directory and writing to ``/``.
      * A folder with no CSV input raises ``ValueError`` with a clear message.
      * Files are merged in sorted order so the row order is reproducible.
      * The Tk root is destroyed even if the dialog raises.

    Returns:
        DataFrame with the ``goods_nm`` and ``prices_prdlst_nm`` columns.

    Raises:
        ValueError: No folder was chosen, or it holds no CSV files to merge.
    """
    root = Tk()
    try:
        input_file = filedialog.askdirectory()
        print(input_file)
    finally:
        root.destroy()

    if not input_file:
        raise ValueError("No folder selected")

    # Name pattern of the merged files this function writes itself.
    generated = re.compile(r'\d+(?:\.\d+)?result\.csv')
    allFile_list = sorted(
        path
        for path in glob.glob(os.path.join(input_file, '*.csv'))
        if not generated.fullmatch(os.path.basename(path))
    )
    print(allFile_list)
    if not allFile_list:
        raise ValueError("No CSV files to merge in {}".format(input_file))

    output_file = os.path.join(input_file, str(time.time()) + 'result.csv')

    allData = [pd.read_csv(file, sep=',') for file in allFile_list]
    # axis=0 stacks vertically; ignore_index=True renumbers the rows.
    dataCombine = pd.concat(allData, axis=0, ignore_index=True)

    dataCombine.to_csv(output_file, index=False)
    # Read the saved file back, as before, so dtypes match what is on disk.
    train_df = pd.read_csv(output_file, sep=',')
    train_df = train_df.loc[:, ['goods_nm', 'prices_prdlst_nm']]
    return(train_df)


In [5]:
def file_select_to_csv(): 
    """Ask for a training CSV and return its product-name and category columns.

    Malformed lines are skipped with a warning. ``error_bad_lines=False`` was
    removed in pandas 2.0, so ``on_bad_lines='warn'`` (same behaviour) is
    tried first and the old keyword is used only on pandas versions that do
    not know the new one. The Tk root is destroyed even if the dialog raises.

    Returns:
        DataFrame with the ``goods_nm`` and ``prices_prdlst_nm`` columns.
    """
    root = Tk()
    try:
        train_filename = filedialog.askopenfilename(initialdir = "/", title = "Train 파일 선택")
        print(train_filename)
    finally:
        root.destroy()

    try:
        train_df = pd.read_csv(train_filename, sep=',', on_bad_lines='warn')
    except TypeError:
        # pandas < 1.3 has no on_bad_lines keyword.
        train_df = pd.read_csv(train_filename, sep=',', error_bad_lines=False)
    train_df = train_df.loc[:, ['goods_nm', 'prices_prdlst_nm']]
    return(train_df)

In [4]:
def file_select(purpose):
    """Open a file-picker dialog and return the chosen path.

    The previous version wrote ``root.destroy`` without parentheses, which
    only references the method, so the Tk root window was never closed. It is
    now destroyed, also when the dialog raises.

    Args:
        purpose: Text placed in front of " 파일 선택" in the dialog title.

    Returns:
        Whatever ``filedialog.askopenfilename`` returns for the user's choice.
    """
    root = Tk()
    try:
        file_name = filedialog.askopenfilename(initialdir = "/", title = purpose+" 파일 선택")
        print(file_name)
    finally:
        root.destroy()
    return file_name

In [6]:
def Make_Label(Label_var,File_name):
    """Build a label table from one column of a user-chosen CSV and save it.

    The distinct values of column ``Label_var`` are numbered 0, 1, 2, ... in
    order of first appearance and written to ``File_name`` with columns
    ``Label_var`` and ``Label``. The row index is written as the first CSV
    column, missing values are written as ``Nan`` and the encoding is
    ``utf-8-sig``, all as before.

    Fixes over the previous version:
      * The Tk root window is now destroyed after the dialog.
      * The table is built in one step instead of cell by cell through
        chained assignment (``df[col][i] = ...``), which raises
        ``SettingWithCopyWarning`` and has no effect under copy-on-write.

    Args:
        Label_var: Name of the column whose distinct values become labels.
        File_name: Path of the CSV file to write.
    """
    root = Tk()
    try:
        source_path = filedialog.askopenfilename(initialdir = "/", title = "라벨 만들 파일 선택")
        print(source_path)
    finally:
        root.destroy()

    Label_file = pd.read_csv(source_path, sep = ',')
    Label_list = Label_file[Label_var].unique()
    Label_df = pd.DataFrame({Label_var: Label_list, 'Label': list(range(len(Label_list)))})
    Label_df.to_csv(File_name, sep=",", na_rep='Nan', encoding ='utf-8-sig')
    

In [3]:
def file_select_to_csv_var(Product_name,Label): 
    """Ask for a training CSV and return two caller-named columns of it.

    Malformed lines are skipped with a warning. ``error_bad_lines=False`` was
    removed in pandas 2.0, so ``on_bad_lines='warn'`` (same behaviour) is
    tried first and the old keyword is used only on pandas versions that do
    not know the new one. The Tk root is destroyed even if the dialog raises.

    Args:
        Product_name: Name of the column holding the product names.
        Label: Name of the column holding the category / label values.

    Returns:
        DataFrame with exactly the columns ``[Product_name, Label]``.
    """
    root = Tk()
    try:
        train_filename = filedialog.askopenfilename(initialdir = "/", title = "Train 파일 선택")
        print(train_filename)
    finally:
        root.destroy()

    try:
        train_df = pd.read_csv(train_filename, sep=',', on_bad_lines='warn')
    except TypeError:
        # pandas < 1.3 has no on_bad_lines keyword.
        train_df = pd.read_csv(train_filename, sep=',', error_bad_lines=False)
    train_df = train_df.loc[:, [Product_name, Label]]
    return(train_df)

In [10]:
def Labeling_var(y_train,Label_var):
    """Map category names to integer labels using a user-chosen label CSV.

    The label file is chosen through ``file_select("Label")`` and must have a
    ``Label_var`` column (names) and a ``Label`` column (codes). When a name
    occurs more than once in the file, its first row wins.

    Fixes over the previous version:
      * ``y_train`` is walked by position. The old ``y_train[i]`` was a label
        lookup on a Series, so after ``drop_duplicates`` (gaps in the index)
        rows were lost through the caught ``KeyError``.
      * Every row of the label file is considered, not only the first
        ``n_unique`` rows.
      * The lookup is a dict instead of a nested scan.
      * Names without a label are still skipped (so the result can be shorter
        than ``y_train``), but a warning now reports how many.

    Args:
        y_train: Iterable of category names (list, array or Series).
        Label_var: Name of the label-file column holding the category names.

    Returns:
        ``(yy_train, den)``: the ``numpy.ndarray`` of labels for the names
        that were found, in input order, and the number of distinct names in
        the label file.
    """
    import warnings

    Label_file = file_select("Label")
    Label_df = pd.read_csv(Label_file, sep=',')
    den = len(Label_df[Label_var].unique())

    label_of = {}
    for name, label in zip(Label_df[Label_var], Label_df['Label']):
        label_of.setdefault(name, label)

    yy_train = []
    missing = 0
    for name in tqdm(list(y_train)):
        if name in label_of:
            yy_train.append(label_of[name])
        else:
            missing += 1

    if missing:
        warnings.warn(
            "{} value(s) of y_train have no entry in {} and were skipped".format(missing, Label_file)
        )
    return(np.array(yy_train), den)