In [94]:
%%writefile ../starter/main.py
# -*- coding: utf-8 -*-
import argparse
import os

import numpy as np
import tensorflow as tf

### custom
import pandas as pd
import numpy as np
import codecs
import re
import string
import os

import pickle as pkl

### NSML
import nsml
from nsml import DATASET_PATH, HAS_DATASET, IS_ON_NSML
from dataset import MovieReviewDataset, preprocess

# DONOTCHANGE: They are reserved for nsml
# This is for nsml leaderboard
def bind_model(sess, config):
    """
    Register the save / load / infer callbacks of this model with nsml.

    :param sess: TensorFlow session whose variables are saved and restored
    :param config: parsed command-line options; ``config.strmaxlen`` is
        forwarded to ``preprocess`` during inference
    :return: None (the callbacks are handed to ``nsml.bind``)
    """

    def save(dir_name, *args):
        """Write the session's variables to ``<dir_name>/model``."""
        # Create the target directory first; an existing one is fine.
        os.makedirs(dir_name, exist_ok=True)
        saver = tf.train.Saver()
        saver.save(sess, os.path.join(dir_name, 'model'))

    def load(dir_name, *args):
        """
        Restore the session's variables from the checkpoint in ``dir_name``.

        :raises NotImplementedError: if no checkpoint is found in ``dir_name``
        """
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(dir_name)
        if not (ckpt and ckpt.model_checkpoint_path):
            # `NotImplemented` is a constant, not an exception class; calling
            # it raised an unrelated TypeError. Raise the real exception.
            raise NotImplementedError('No checkpoint!')
        # Only the file name of the recorded path is used, so the checkpoint
        # is always resolved relative to the directory that was passed in.
        checkpoint = os.path.basename(ckpt.model_checkpoint_path)
        saver.restore(sess, os.path.join(dir_name, checkpoint))
        print('Model loaded')

    def infer(raw_data, **kwargs):
        """
        Run the model on raw input and return leaderboard-formatted results.

        :param raw_data: raw input (strings in this task)
        :param kwargs: unused
        :return: list of ``(0.0, prediction)`` tuples, one per input row
        """
        embedded_data = preprocess(raw_data, config.strmaxlen)

        # `output` and `x` are not parameters: they are looked up as module
        # globals at call time (the __main__ section of this file builds them).
        pred = sess.run(output, feed_dict={x: embedded_data})
        point = pred.squeeze(axis=1).tolist()
        print(np.shape(point))
        # DONOTCHANGE: They are reserved for nsml
        # The result must be returned as [(probability, 0 or 1)] to be posted
        # on the leaderboard. The probability value does not affect the
        # leaderboard result, so it is filled with zeros.
        return list(zip(np.zeros(len(point)), point))

    # DONOTCHANGE: They are reserved for nsml
    # Gives nsml access to the callbacks defined above.
    nsml.bind(save=save, load=load, infer=infer)

def _batch_loader(iterable, n=1):
    """
    데이터를 배치 사이즈만큼 잘라서 보내주는 함수입니다. PyTorch의 DataLoader와 같은 역할을 합니다

    :param iterable: 데이터 list, 혹은 다른 포맷
    :param n: 배치 사이즈
    :return:
    """
    length = len(iterable)
    for n_idx in range(0, length, n):
        yield iterable[n_idx:min(n_idx + n, length)]

def weight_variable(shape):
    """
    Build a ``tf.Variable`` of the given shape for use as a weight matrix.

    :param shape: shape of the variable
    :return: variable initialised from a truncated normal with stddev 0.1
    """
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def bias_variable(shape):
    """
    Build a ``tf.Variable`` of the given shape for use as a bias term.

    :param shape: shape of the variable
    :return: variable with every element initialised to the constant 0.1
    """
    return tf.Variable(tf.constant(0.1, shape=shape))
        
    
if __name__ == '__main__':
    # Script entry point: parse the options, build the TensorFlow graph, bind
    # the model to nsml, then either train or run a local inference test.

    args = argparse.ArgumentParser()
    # DONOTCHANGE: They are reserved for nsml
    args.add_argument('--mode', type=str, default='train')
    args.add_argument('--pause', type=int, default=0)
    args.add_argument('--iteration', type=str, default='0')

    # User options
    args.add_argument('--output', type=int, default=1)
    args.add_argument('--epochs', type=int, default=5)
    args.add_argument('--strmaxlen', type=int, default=400)
    
    # NOTE(review): --maxlen sizes the input placeholder below, while
    # --strmaxlen is what gets passed to the dataset and to preprocess().
    # Both default to 400; they presumably have to stay equal - confirm.
    args.add_argument('--maxlen', type=int, default=400)
    args.add_argument('--cell_size', type=int, default=40)
    args.add_argument('--embed_size', type=int, default=300)
    args.add_argument('--prob_dropout', type=float, default=0.4)
    args.add_argument('--max_features', type=int, default=431)
    args.add_argument('--batch_size', type=int, default=256)
    
    config = args.parse_args()
 

    if not HAS_DATASET and not IS_ON_NSML:  # It is not running on nsml
        DATASET_PATH = '../sample_data/movie_review/'

    # Model specification
    # The embedded sequence is flattened, so the first dense layer sees
    # embed_size * maxlen features per example.
    input_size = config.embed_size*config.maxlen
    output_size = 1
    hidden_layer_size = 200
    learning_rate = 0.001
    # NOTE(review): character_size and emb_train are not referenced again in
    # the visible code; they may only matter through the locals() scopes
    # handed to nsml below - confirm before removing.
    character_size = 251
    emb_train = True

    # x holds integer ids (one row of config.maxlen ids per example),
    # y_ holds one float target per example.
    x = tf.placeholder(tf.int32, [None, config.maxlen])
    y_ = tf.placeholder(tf.float32, [None, output_size])
    
    # Embedding
    # The table has config.max_features rows, so the ids fed into x are
    # presumably expected to be smaller than that - verify against preprocess().
    word_embedding = tf.get_variable('word_embedding', [config.max_features, config.embed_size])
    embedded = tf.nn.embedding_lookup(word_embedding, x)

    # First layer
    # NOTE(review): no activation is applied to hidden_layer, so the two
    # layers together form a purely affine map of the flattened embeddings.
    first_layer_weight = weight_variable([input_size, hidden_layer_size])
    first_layer_bias = bias_variable([hidden_layer_size])
    hidden_layer = tf.matmul(tf.reshape(embedded, (-1, input_size)), first_layer_weight) + first_layer_bias

    # Second (output) layer
    second_layer_weight = weight_variable([hidden_layer_size, output_size])
    second_layer_bias = bias_variable([output_size])
    output = tf.matmul(hidden_layer, second_layer_weight) + second_layer_bias
#     output_sigmoid = tf.sigmoid(output)

    # Loss and optimizer
    loss_mse = tf.losses.mean_squared_error(y_, output)
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss_mse)

    sess = tf.InteractiveSession()
    tf.global_variables_initializer().run()
        
    # DONOTCHANGE: Reserved for nsml
    dataset = MovieReviewDataset(DATASET_PATH, config.strmaxlen)
#     dataset.dset_regex_morph(morph=True)
#     dataset.dset_embedding()
    # infer() inside bind_model reads the module-level `x` and `output`
    # defined above, so those names must keep existing under these names.
    bind_model(sess=sess, config=config)

   
    # DONOTCHANGE: Reserved for nsml
    if config.pause:
        nsml.paused(scope=locals())
    
    if config.mode == 'train':
#         dataset = MovieReviewDataset(DATASET_PATH, config.strmaxlen)
#         dataset.dset_regex_morph(morph=True)
#         dataset.dset_embedding()
        

        
        ### custom
#         dataset.dset_regex_morph(morph=True)
#         emb_path="D:/onedrive/code/ipython/wordvec/pre_trained/fasttext/ko/wiki.ko.vec"
#         dataset.load_emb_model(embedding_path=emb_path)
#         dataset.dset_embedding(dataset.emb_model)
#         dataset.dset_embedding()
        
        # one_batch_size is the number of batches per epoch, i.e. the
        # dataset length divided by the batch size, rounded up.
        dataset_len = len(dataset)
        one_batch_size = dataset_len//config.batch_size
        if dataset_len % config.batch_size != 0:
            one_batch_size += 1
        # Run training for each epoch.
        for epoch in range(config.epochs):
            avg_loss = 0.0
            # Each batch is whatever slicing `dataset` returns, unpacked as a
            # (data, labels) pair.
            for i, (data, labels) in enumerate(_batch_loader(dataset, config.batch_size)):
                _, loss = sess.run([train_step, loss_mse],
                                   feed_dict={x: data, y_: labels})
#                 print('Batch : ', i + 1, '/', one_batch_size,
#                       ', BCE in this minibatch: ', float(loss))
                avg_loss += float(loss)
            # NOTE(review): with an empty dataset one_batch_size is 0 and the
            # divisions below raise ZeroDivisionError.
            print('epoch:', epoch, ' train_loss:', float(avg_loss/one_batch_size))
            nsml.report(summary=True, scope=locals(), epoch=epoch, epoch_total=config.epochs,
                        train__loss=float(avg_loss/one_batch_size), step=epoch)
            # DONOTCHANGE (You can decide how often you want to save the model)
            nsml.save(epoch)

    # Used in local test mode.
    # If the result looks like the example below, it can be submitted
    # through nsml submit.
    # [(0.3, 0), (0.7, 1), ... ]
    elif config.mode == 'test_local':
        # NOTE(review): nothing in this branch restores a checkpoint, so unless
        # nsml.infer does that internally the predictions come from freshly
        # initialised variables - confirm.
        with open(os.path.join(DATASET_PATH, 'test/test_data'), 'rt', encoding='utf-8') as f:
            reviews = f.readlines()
        res = []
        # Infer batch by batch and concatenate the per-batch result lists.
        for batch in _batch_loader(reviews, config.batch_size):
            temp_res = nsml.infer(batch)
            res += temp_res
#         res = nsml.infer(reviews)
        print(res)

Overwriting ../starter/main.py


In [95]:
!python ../starter/main.py --mode train

epoch: 0  train_loss: 118.17220306396484
epoch: 1  train_loss: 4420.47900390625
epoch: 2  train_loss: 319.73101806640625
epoch: 3  train_loss: 829.7860107421875
epoch: 4  train_loss: 2231.032958984375


2018-04-03 21:16:15.717569: I C:\tf_jenkins\workspace\rel-win\M\windows-gpu\PY\36\tensorflow\core\platform\cpu_feature_guard.cc:140] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2
2018-04-03 21:16:16.031441: I C:\tf_jenkins\workspace\rel-win\M\windows-gpu\PY\36\tensorflow\core\common_runtime\gpu\gpu_device.cc:1212] Found device 0 with properties: 
name: GeForce GTX 1080 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.683
pciBusID: 0000:01:00.0
totalMemory: 11.00GiB freeMemory: 9.09GiB
2018-04-03 21:16:16.031763: I C:\tf_jenkins\workspace\rel-win\M\windows-gpu\PY\36\tensorflow\core\common_runtime\gpu\gpu_device.cc:1312] Adding visible gpu devices: 0
2018-04-03 21:16:16.648086: I C:\tf_jenkins\workspace\rel-win\M\windows-gpu\PY\36\tensorflow\core\common_runtime\gpu\gpu_device.cc:993] Creating TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 8806 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:01

In [96]:
!python ../starter/main.py --mode test_local

(87,)
[(0.0, -1.0029360055923462), (0.0, -0.5922660827636719), (0.0, -0.5258321762084961), (0.0, -0.46770694851875305), (0.0, -0.9856932163238525), (0.0, 0.17409417033195496), (0.0, -1.0555821657180786), (0.0, -0.9003530740737915), (0.0, -1.25115966796875), (0.0, -0.8233543634414673), (0.0, -0.454477459192276), (0.0, -1.193847417831421), (0.0, -0.004324875771999359), (0.0, -0.9096697568893433), (0.0, -0.23301494121551514), (0.0, -0.6798992156982422), (0.0, -0.9385336637496948), (0.0, 0.20161789655685425), (0.0, -0.6412708759307861), (0.0, -1.9880775213241577), (0.0, -0.6272163391113281), (0.0, -0.5542000532150269), (0.0, -0.5005652904510498), (0.0, -0.42831066250801086), (0.0, 0.4964554011821747), (0.0, -0.6084188222885132), (0.0, -1.38687264919281), (0.0, -0.5787463188171387), (0.0, -2.1129531860351562), (0.0, -0.3341926038265228), (0.0, -1.610254168510437), (0.0, -1.316604733467102), (0.0, 0.0038783326745033264), (0.0, 0.6582373380661011), (0.0, -1.1776357889175415), (0.0, -0.2155701

2018-04-03 21:16:20.067405: I C:\tf_jenkins\workspace\rel-win\M\windows-gpu\PY\36\tensorflow\core\platform\cpu_feature_guard.cc:140] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2
2018-04-03 21:16:20.373957: I C:\tf_jenkins\workspace\rel-win\M\windows-gpu\PY\36\tensorflow\core\common_runtime\gpu\gpu_device.cc:1212] Found device 0 with properties: 
name: GeForce GTX 1080 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.683
pciBusID: 0000:01:00.0
totalMemory: 11.00GiB freeMemory: 9.09GiB
2018-04-03 21:16:20.374311: I C:\tf_jenkins\workspace\rel-win\M\windows-gpu\PY\36\tensorflow\core\common_runtime\gpu\gpu_device.cc:1312] Adding visible gpu devices: 0
2018-04-03 21:16:20.953064: I C:\tf_jenkins\workspace\rel-win\M\windows-gpu\PY\36\tensorflow\core\common_runtime\gpu\gpu_device.cc:993] Creating TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 8806 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:01

 ===

In [44]:
!python ../example_back/main.py --mode train

Batch :  1 / 1 , MSE in this minibatch:  19.29180335998535
epoch: 0  train_loss: 19.29180335998535
Batch :  1 / 1 , MSE in this minibatch:  7.607476711273193
epoch: 1  train_loss: 7.607476711273193
Batch :  1 / 1 , MSE in this minibatch:  7.607476711273193
epoch: 2  train_loss: 7.607476711273193
Batch :  1 / 1 , MSE in this minibatch:  7.607476711273193
epoch: 3  train_loss: 7.607476711273193
Batch :  1 / 1 , MSE in this minibatch:  7.607476711273193
epoch: 4  train_loss: 7.607476711273193
Batch :  1 / 1 , MSE in this minibatch:  7.607476711273193
epoch: 5  train_loss: 7.607476711273193
Batch :  1 / 1 , MSE in this minibatch:  7.607476711273193
epoch: 6  train_loss: 7.607476711273193
Batch :  1 / 1 , MSE in this minibatch:  7.607476711273193
epoch: 7  train_loss: 7.607476711273193
Batch :  1 / 1 , MSE in this minibatch:  7.607476711273193
epoch: 8  train_loss: 7.607476711273193
Batch :  1 / 1 , MSE in this minibatch:  7.607476711273193
epoch: 9  train_loss: 7.607476711273193


In [90]:
tmp = [(0.0, 5.507122039794922), (0.0, 5.477208137512207), (0.0, 5.081559658050537), (0.0, 5.745182037353516), (0.0, 6.0997090339660645), (0.0, 5.659692287445068), (0.0, 5.920278072357178), (0.0, 6.350973606109619), (0.0, 5.890348434448242), (0.0, 5.882244110107422), (0.0, 5.47745418548584), (0.0, 5.883562088012695), (0.0, 5.641011714935303), (0.0, 4.899208068847656), (0.0, 5.4042487144470215), (0.0, 5.44482421875), (0.0, 5.365687847137451), (0.0, 5.508814334869385), (0.0, 5.610958576202393), (0.0, 6.146960735321045), (0.0, 5.443153381347656), (0.0, 5.225772857666016), (0.0, 5.609511852264404), (0.0, 5.245474338531494), (0.0, 5.728208065032959), (0.0, 5.431302547454834), (0.0, 5.909133434295654), (0.0, 5.599045753479004), (0.0, 5.0513811111450195), (0.0, 5.178066253662109), (0.0, 5.482705593109131), (0.0, 5.959336280822754), (0.0, 5.443247318267822), (0.0, 5.457961559295654), (0.0, 5.3761372566223145), (0.0, 5.327464580535889), (0.0, 4.982173919677734), (0.0, 4.972344398498535), (0.0, 5.525853157043457), (0.0, 5.402675628662109), (0.0, 5.452371597290039), (0.0, 5.296295166015625), (0.0, 5.418962001800537), (0.0, 5.954452991485596), (0.0, 5.593425273895264), (0.0, 5.549355983734131), (0.0, 5.962398529052734), (0.0, 5.473674297332764), (0.0, 5.136119365692139), (0.0, 5.7232985496521), (0.0, 5.20017671585083), (0.0, 4.964993476867676), (0.0, 5.232919692993164), (0.0, 5.040822505950928), (0.0, 5.335686206817627), (0.0, 5.714105606079102), (0.0, 5.286571502685547), (0.0, 5.32389497756958), (0.0, 6.3306097984313965), (0.0, 5.624547958374023), (0.0, 5.531543254852295), (0.0, 5.307179927825928), (0.0, 5.559380531311035), (0.0, 5.755814075469971), (0.0, 5.190467357635498), (0.0, 5.5357513427734375), (0.0, 5.294408798217773), (0.0, 5.54160213470459), (0.0, 5.4538254737854), (0.0, 5.685484409332275), (0.0, 5.525065898895264), (0.0, 5.39377498626709), (0.0, 5.509948253631592), (0.0, 5.330820083618164), (0.0, 5.601436138153076), (0.0, 5.052368640899658), (0.0, 
5.631039142608643), (0.0, 6.053228378295898), (0.0, 5.571783542633057), (0.0, 5.3911824226379395), (0.0, 5.865889549255371), (0.0, 5.761862754821777), (0.0, 5.490329742431641), (0.0, 5.48750638961792), (0.0, 4.960508346557617), (0.0, 5.481670379638672), (0.0, 5.939573764801025), (0.0, 5.222892761230469), (0.0, 5.459935665130615), (0.0, 5.423129081726074), (0.0, 5.0208635330200195), (0.0, 5.444896221160889), (0.0, 5.56053352355957), (0.0, 5.544216156005859), (0.0, 5.691243648529053), (0.0, 5.1534810066223145), (0.0, 5.399294376373291), (0.0, 5.101796627044678), (0.0, 5.719296455383301), (0.0, 5.345041751861572), (0.0, 5.30886697769165), (0.0, 5.01315450668335), (0.0, 5.454311847686768), (0.0, 5.488786220550537), (0.0, 5.514221668243408), (0.0, 5.336738109588623), (0.0, 5.635279655456543)]
tmp2 = [(0.0, 0.17151004076004028), (0.0, 0.3102037310600281), (0.0, 0.8511660695075989), (0.0, 0.11610407382249832), (0.0, 0.37960124015808105), (0.0, 0.49174389243125916), (0.0, 1.0346283912658691), (0.0, 0.9915686845779419), (0.0, 0.08578527718782425), (0.0, 1.8068877458572388), (0.0, 0.8327751755714417), (0.0, 0.650511622428894), (0.0, 0.41550737619400024), (0.0, 0.6187836527824402), (0.0, -0.13442781567573547), (0.0, 1.1096271276474), (0.0, 0.1299227476119995), (0.0, 0.7282607555389404), (0.0, 0.5944207906723022), (0.0, 1.1815965175628662), (0.0, -0.919247031211853), (0.0, 1.716301679611206), (0.0, 1.3576778173446655), (0.0, 0.6849305033683777), (0.0, 0.3809773325920105), (0.0, 0.7658463716506958), (0.0, 1.1667943000793457), (0.0, 1.3709861040115356), (0.0, 0.9598567485809326), (0.0, 0.8689199686050415), (0.0, 0.26576030254364014), (0.0, 0.5343411564826965), (0.0, 0.5739362835884094), (0.0, 0.2807602882385254), (0.0, 0.6746336817741394), (0.0, 0.5748001933097839), (0.0, 0.7115890383720398), (0.0, 0.9866825938224792), (0.0, 1.0107001066207886), (0.0, 0.45365607738494873), (0.0, 1.1045211553573608), (0.0, 0.02827710658311844), (0.0, 0.5486428141593933), (0.0, 0.2230035364627838), (0.0, 1.0092824697494507), (0.0, 0.8335793018341064), (0.0, 0.7652144432067871), (0.0, 0.8846083879470825), (0.0, 0.25722795724868774), (0.0, 0.023475296795368195), (0.0, 1.0517147779464722), (0.0, 0.8199621438980103), (0.0, 0.17542585730552673), (0.0, 0.9163600206375122), (0.0, 0.5567643046379089), (0.0, 1.1359342336654663), (0.0, 0.21362942457199097), (0.0, 0.09889668971300125), (0.0, 0.4078075587749481), (0.0, 0.7591463327407837), (0.0, 0.4542941749095917), (0.0, 0.39212343096733093), (0.0, 0.42554154992103577), (0.0, 0.8650057315826416), (0.0, 1.1026344299316406), (0.0, 1.3823593854904175), (0.0, 0.6251580715179443), (0.0, 0.38394099473953247), (0.0, 0.5180602669715881), (0.0, -0.3579539954662323), (0.0, 0.12424284964799881), (0.0, 0.8763779401779175), (0.0, 0.35700929164886475), 
(0.0, 0.5003722310066223), (0.0, 0.11724204570055008), (0.0, 0.5098779201507568), (0.0, 0.5466137528419495), (0.0, 0.28781092166900635), (0.0, -0.4003223478794098), (0.0, 1.2976845502853394), (0.0, 0.3018210232257843), (0.0, 2.049747943878174), (0.0, 0.46604761481285095), (0.0, 0.7953343987464905), (0.0, 0.6959224939346313), (0.0, 1.383487582206726), (0.0, -0.5519022941589355)]

In [75]:
import numpy as np

In [93]:
np.shape(tmp)

(107, 2)

In [88]:
tmp = [5.849471092224121, 4.6791486740112305, 5.030023574829102, 4.490840435028076, 5.456708908081055, 5.024709701538086, 5.023854732513428, 5.876331329345703, 4.961490631103516, 5.1269001960754395, 4.586499214172363, 5.601384162902832, 5.2715301513671875, 5.52301549911499, 5.54022216796875, 5.463802337646484, 5.524343013763428, 5.513424873352051, 5.404085159301758, 5.085888862609863, 5.18102502822876, 4.819967269897461, 4.992971897125244, 5.379040241241455, 5.407474994659424, 5.209883213043213, 5.179623126983643, 4.9488115310668945, 5.008663177490234, 4.976510047912598, 4.887200355529785, 5.230838298797607, 4.875622272491455, 5.012160778045654, 4.959322929382324, 4.795332908630371, 5.235421657562256, 5.2926201820373535, 5.035041809082031, 5.089512825012207, 4.789463043212891, 5.048935413360596, 4.758528709411621, 5.34534215927124, 5.352360725402832, 4.331325531005859, 4.898258209228516, 5.234723091125488, 5.124054908752441, 5.020263195037842, 4.730261325836182, 4.833620071411133, 5.053152561187744, 5.565540790557861, 5.36036491394043, 5.275566577911377, 4.7412919998168945, 4.223257541656494, 4.780771255493164, 5.20416784286499, 4.944936275482178, 5.273554801940918, 4.920029640197754, 5.362622261047363, 4.745726585388184, 5.217639923095703, 5.082106113433838, 4.781542778015137, 4.717349052429199, 5.029264450073242, 5.311281681060791, 4.725761413574219, 5.068767070770264, 5.286892414093018, 4.738636493682861, 4.955305099487305, 4.970938682556152, 5.201949596405029, 4.973832130432129, 5.466550350189209, 4.975394248962402, 4.786642074584961, 5.051764965057373, 4.621184349060059, 5.357519149780273, 4.947997093200684, 4.799294471740723, 4.8324103355407715, 5.573225498199463, 4.858403205871582, 5.014888286590576, 5.209362030029297, 4.926170825958252, 4.9326958656311035, 4.597771167755127, 4.772767066955566, 4.680015563964844, 4.746307373046875, 5.023454666137695, 4.613122940063477, 5.03616189956665, 4.5199127197265625, 4.76422119140625, 4.486040115356445, 
4.875519275665283, 4.937893867492676, 5.213864803314209]

In [87]:
!python ../example_back/main.py --mode test_local

[5.849471092224121, 4.6791486740112305, 5.030023574829102, 4.490840435028076, 5.456708908081055, 5.024709701538086, 5.023854732513428, 5.876331329345703, 4.961490631103516, 5.1269001960754395, 4.586499214172363, 5.601384162902832, 5.2715301513671875, 5.52301549911499, 5.54022216796875, 5.463802337646484, 5.524343013763428, 5.513424873352051, 5.404085159301758, 5.085888862609863, 5.18102502822876, 4.819967269897461, 4.992971897125244, 5.379040241241455, 5.407474994659424, 5.209883213043213, 5.179623126983643, 4.9488115310668945, 5.008663177490234, 4.976510047912598, 4.887200355529785, 5.230838298797607, 4.875622272491455, 5.012160778045654, 4.959322929382324, 4.795332908630371, 5.235421657562256, 5.2926201820373535, 5.035041809082031, 5.089512825012207, 4.789463043212891, 5.048935413360596, 4.758528709411621, 5.34534215927124, 5.352360725402832, 4.331325531005859, 4.898258209228516, 5.234723091125488, 5.124054908752441, 5.020263195037842, 4.730261325836182, 4.833620071411133, 5.05315256

In [38]:
%%writefile ../starter/Dataset.py
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import codecs
import re
import string
import os

#===============keras ==============
from keras.preprocessing import text, sequence

#===============morphnizer ============
# from konlpy.tag import Twitter
# twt = Twitter()

class MovieReviewDataset():
    """
    Movie-review dataset that reads reviews and labels from disk and serves
    them as ``(data, label)`` tuples.

    Attributes set by ``__init__``: ``reviews`` (list of raw lines),
    ``labels`` (list of ``np.float32``) and ``dset`` (a DataFrame with the
    columns ``reviews`` and ``labels``). ``dset_embedding`` later replaces
    ``reviews`` / ``labels`` with arrays and stores the fitted ``tokenizer``.
    """

    # Cleaning patterns, compiled once. Raw strings keep ``\w`` / ``\s`` from
    # being parsed as (invalid) string escape sequences.
    _NON_WORD = re.compile(r'[^\w?!]')
    _WHITESPACE_RUN = re.compile(r'[\s]+')
    _EDGE_WHITESPACE = re.compile(r'[\s]$|^[\s]')

    def __init__(self, dataset_path: str, max_length: int):
        """
        Read the training reviews and labels below ``dataset_path``.

        :param dataset_path: dataset root path; ``train/train_data`` and
            ``train/train_label`` are read from it
        :param max_length: maximum string length (accepted for interface
            compatibility; not used by this constructor)
        """
        # Paths of the data and of the labels.
        data_review = os.path.join(dataset_path, 'train', 'train_data')
        data_label = os.path.join(dataset_path, 'train', 'train_label')

        # One review per line, kept verbatim (trailing newline included).
        with open(data_review, 'rt', encoding='utf-8') as f:
            self.reviews = f.readlines()
        # One numeric label per line.
        with open(data_label) as f:
            self.labels = [np.float32(x) for x in f.readlines()]

        # Build the frame column by column. Stacking both lists into a single
        # ndarray would coerce the float labels to strings.
        self.dset = pd.DataFrame(
            {
                'reviews': self.reviews,
                'labels': np.asarray(self.labels, dtype=np.float32),
            },
            columns=['reviews', 'labels'],
        )

    def __len__(self):
        """
        :return: the total number of examples
        """
        return len(self.reviews)

    def __getitem__(self, idx):
        """
        :param idx: index (or slice) of the wanted data
        :return: the ``(data, label)`` pair for ``idx``
        """
        return self.reviews[idx], self.labels[idx]

    def export(self):
        """Write the underlying DataFrame to ``./export.csv``."""
        self.dset.to_csv("./export.csv")

    def prin(self):
        """Print the underlying DataFrame."""
        print(self.dset)

    def load_emb_model(self, embedding_path, encodings="utf-8"):
        """
        Load a text embedding file into ``self.emb_model``.

        Each line is split on single spaces; the first field becomes the key
        and the remaining fields are converted to a float32 vector.

        :param embedding_path: path of the embedding file
        :param encodings: text encoding of the file
        """
        # The context manager closes the file, which the bare codecs.open()
        # call inside a generator expression never did.
        with codecs.open(embedding_path, "r", encodings) as f:
            self.emb_model = {
                word: np.asarray(values, dtype='float32')
                for word, *values in (line.strip().split(" ") for line in f)
            }

    @classmethod
    def _clean_text(cls, doc: str) -> str:
        """Blank out symbols other than ``?`` / ``!``, collapse whitespace, trim one edge space."""
        doc = cls._NON_WORD.sub(' ', doc)
        doc = cls._WHITESPACE_RUN.sub(' ', doc)
        return cls._EDGE_WHITESPACE.sub('', doc)

    def preprocess(self, raw_data, morph=True):
        """
        Clean every document of ``raw_data``.

        :param raw_data: iterable of strings
        :param morph: unused (morphological analysis is disabled)
        :return: list of cleaned strings, in input order
        """
        return [self._clean_text(doc) for doc in raw_data]

    def test_embedding(self, raw_data,
                       embed_size=300,
                       max_features=100000,
                       maxlen=20,
                       oov_zero=True,
                       truncating='pre'
                       ):
        """
        Convert raw texts to padded id sequences with the fitted tokenizer.

        :param raw_data: list of texts
        :param embed_size: unused
        :param max_features: unused
        :param maxlen: length every sequence is padded / truncated to
        :param oov_zero: unused
        :param truncating: truncation side passed to ``pad_sequences``
        :return: the padded sequences
        :raises AttributeError: if ``dset_embedding`` has not fitted a
            tokenizer yet
        """
        tokenizer = getattr(self, 'tokenizer', None)
        if tokenizer is None:
            # Same exception type as the bare attribute access would raise,
            # but with a message that says what to do.
            raise AttributeError(
                "tokenizer is not fitted; call dset_embedding() first")

        list_tokenized = tokenizer.texts_to_sequences(raw_data)
        return sequence.pad_sequences(list_tokenized, maxlen=maxlen, truncating=truncating)

    def dset_regex_morph(self, morph=True):
        """
        Clean the ``reviews`` column of ``self.dset`` in place.

        :param morph: unused (morphological analysis is disabled)
        """
        self.dset['reviews'] = [self._clean_text(doc) for doc in self.dset['reviews']]

    def dset_embedding(self,
                       embed_size=300,
                       max_features=100000,
                       maxlen=20,
                       oov_zero=True,
                       truncating='pre'
                       ):
        """
        Fit a tokenizer on the reviews and replace ``self.reviews`` /
        ``self.labels`` with model-ready arrays.

        :param embed_size: unused
        :param max_features: ``num_words`` given to the tokenizer
        :param maxlen: length every sequence is padded / truncated to
        :param oov_zero: unused
        :param truncating: truncation side passed to ``pad_sequences``
        """
        doc_column = "reviews"
        list_classes = ["labels"]

        list_sentences = self.dset[doc_column].fillna('UNK').values.tolist()

        # Keep the fitted tokenizer: test_embedding() reuses it for new texts.
        self.tokenizer = text.Tokenizer(num_words=max_features)
        self.tokenizer.fit_on_texts(list_sentences)

        list_tokenized = self.tokenizer.texts_to_sequences(list_sentences)

        X = sequence.pad_sequences(list_tokenized, maxlen=maxlen, truncating=truncating)
        # Selecting with a list of columns gives a 2-D array, one row per example.
        Y = self.dset[list_classes].values
        print("=== Data is preprocessed")

        # TODO: building an embedding matrix from a pretrained model
        # (see load_emb_model) is not implemented here.
        self.reviews = X
        self.labels = Y
  

Writing ../starter/Dataset.py
