In [2]:
import pandas as pd
import numpy as np
import sys
import os

import tensorflow as tf
from gensim.models import Word2Vec

In [3]:
class TF_Agent():
    """Run a fixed set of fetches in a session and keep a log of the results.

    ``keys`` are the *names* of the objects to fetch (e.g. ``["loss", "acc"]``).
    The names are joined into one expression and evaluated with ``eval`` inside
    ``run_session``, so each name must resolve to an object in this module's
    global namespace at the time ``run_session`` is called.

    Attributes set by the methods below:
        log_df      -- DataFrame with one column per key and one row per
                       ``run_session`` call (row labels 0..n-1).
        recent_log  -- Series holding the values of the latest call.
        feed_dict   -- the mini-batch chosen by the latest ``set_batch`` call.
        iter_count  -- number of completed ``run_session`` calls.
    """

    def __init__(self, keys):
        """Create an agent that fetches and logs the objects named in ``keys``."""
        self._initialize(keys)

    def _initialize(self, keys):
        """Seed numpy's global RNG and reset the log and iteration counter."""
        # Fixed seed so the mini-batch sampling in set_batch is repeatable.
        np.random.seed(0)
        self.log_df = pd.DataFrame({key: [] for key in keys})
        self.keys = self.log_df.columns
        # Comma-joined key names; evaluated as a Python expression in run_session.
        self.interesting = ",".join(self.log_df.columns)
        self.iter_count = 0

    def set_batch(self, n, feed_dict):
        """Sample ``n`` rows (without replacement) and store them as ``self.feed_dict``.

        Args:
            n: number of rows to draw; numpy raises ValueError if it exceeds
                the number of available rows.
            feed_dict: mapping whose values all support ``len`` and indexing
                with an integer array. The row count is taken from the first
                value, and the same indices are applied to every value so
                the entries stay aligned.
        """
        keys = list(feed_dict.keys())

        data_n = len(feed_dict[keys[0]])
        indices = np.random.choice(data_n, n, replace=False)

        self.feed_dict = {key: feed_dict[key][indices] for key in keys}

    def run_session(self, sess):
        """Fetch the tracked objects once with the current batch and log them.

        Args:
            sess: object exposing ``run(fetches, feed_dict=...)``
                (presumably a ``tf.Session``).
        """
        # NOTE(security): eval() executes the key names as Python code. Only
        # build a TF_Agent from trusted, hard-coded key names.
        values = sess.run(eval(self.interesting),
                          feed_dict=self.feed_dict)

        self.recent_log = pd.Series(values, index=self.keys)

        # DataFrame.append was removed in pandas 2.0; build the row explicitly
        # and concatenate instead. The empty initial frame is replaced rather
        # than concatenated so that its placeholder dtypes do not leak into
        # the log.
        row = self.recent_log.to_frame().T
        if self.log_df.empty:
            self.log_df = row
        else:
            self.log_df = pd.concat([self.log_df, row], ignore_index=True)

        self.iter_count += 1

    def trace(self, args, one_line_text=""):
        """Rewrite the current console line with the latest values of ``args``.

        Args:
            args: iterable of key names to print from ``self.recent_log``.
            one_line_text: accepted for backward compatibility but ignored;
                the status line is always rebuilt from scratch.
        """
        iter_count = self.iter_count
        # Cycle through four characters to get a spinner effect.
        deco = ["-", "\\", "|", "/"][iter_count%4]

        one_line_text = "Iter: % 6d"%iter_count
        for arg in args:
            one_line_text += " %s %s: %0.9f"%(
                deco, arg, self.recent_log[arg]
            )

        # "\r" returns to the start of the line so the status is overwritten
        # in place instead of scrolling.
        sys.stdout.write("\r%s"%one_line_text)
        sys.stdout.flush()

In [4]:
def set_batch_to_file_system():
    """Build the padded word-vector batches and pickle them under ``./data``.

    The labelled reviews in ``./data/preprocessed_df`` (rows whose label is
    not ``"unsup"``) are cut into five equal, consecutive slices named
    ``test``, ``train_1`` ... ``train_4``. Rows left over after the integer
    division are not written to any batch. For every slice whose file
    ``./data/batch_of_<name>`` does not exist yet, a dict is pickled with

        "data":  array of shape (rows, longest_review_in_words, vector_size),
                 each review's word vectors followed by zero padding
        "label": array of shape (rows, 2), [1, 0] for "neg", [0, 1] otherwise

    Batches already on disk are left untouched, and nothing is loaded at all
    when every batch file is present.

    Raises:
        ValueError: if there are no labelled reviews to split.
    """
    batch_names = ["test", "train_1", "train_2", "train_3", "train_4"]
    batch_n = len(batch_names)

    # Decide what is missing before doing any expensive loading. Each batch is
    # checked on its own, so one existing file no longer hides later gaps.
    pending = [(batch_i, name) for batch_i, name in enumerate(batch_names)
               if not os.path.isfile("./data/batch_of_%s"%name)]
    if not pending:
        return

    # NOTE(security): unpickling can execute arbitrary code; only point this
    # at files you produced yourself.
    df = pd.read_pickle("./data/preprocessed_df")
    df = df[df["label"] != "unsup"]

    data_n = len(df)
    if data_n == 0:
        raise ValueError("no labelled reviews found in ./data/preprocessed_df")
    # Every review is padded up to the length of the longest one (in words).
    maxi = max(len(text.split()) for text in df["review"])

    model = Word2Vec.load("./data/underscore_wv")
    dim = model.wv.vector_size

    def tune_size(i, text, total):
        """Return a (maxi, dim) array: the review's vectors, then zero rows."""
        # presumably every token is in the model's vocabulary; the lookup
        # fails otherwise -- TODO confirm against the preprocessing step
        vectors = [model.wv[word] for word in text.split()]
        if i%100 == 0:
            # Progress is relative to the batch currently being built.
            sys.stdout.write("\r% 5.2f%%"%(i/total*100))
        padded = np.zeros((maxi, dim))
        if vectors:
            padded[:len(vectors)] = vectors
        return padded

    per_batch = data_n//batch_n
    for batch_i, name in pending:
        batch_df = df[per_batch*batch_i:per_batch*(batch_i+1)]
        total = len(batch_df)
        print("Tuning %s batch's array size for LSTM..."%name)
        X = np.array([tune_size(i, text, total)
                      for i, text in enumerate(batch_df["review"])])
        sys.stdout.write("\r% 5.2f%%\n"%100)
        y = np.array([[1, 0] if label == "neg" else [0, 1]
                      for label in batch_df["label"]])
        pd.to_pickle({"data": X, "label": y}, "./data/batch_of_%s"%name)
        
        
def get_batch_from_file_system(s):
    """Unpickle and return the object stored at ``./data/batch_of_<s>``.

    Args:
        s: batch name used as the file-name suffix (e.g. ``"train_1"``).

    Returns:
        Whatever object was pickled into that file.
    """
    batch_path = "./data/batch_of_%s" % s
    return pd.read_pickle(batch_path)

In [5]:
# Write the padded word-vector batches to ./data/batch_of_<name>
# (see set_batch_to_file_system for when existing files are skipped).
set_batch_to_file_system()

In [6]:
# Load the pickled "train_1" batch: a dict holding "data" and "label" arrays.
dataset = get_batch_from_file_system("train_1")

In [7]:
# Pull the padded review vectors and their one-hot labels out of the batch,
# then print both shapes as a quick sanity check.
train_X = dataset["data"]
train_y = dataset["label"]
print(train_X.shape, train_y.shape)

(10000, 2473, 100) (10000, 2)


In [None]:
# Start from an empty default graph so re-running this cell does not keep
# adding duplicate ops, then fix the graph-level seed.
tf.reset_default_graph()
g = tf.get_default_graph()
tf.set_random_seed(50554145)

# Input placeholder for a mini-batch of padded review sequences. The original
# call was left unfinished (`dtype=` with no value), which is a syntax error.
# TODO(review): float32 and the (batch, words, vector_size) shape taken from
# train_X are assumptions -- confirm against the model this cell will build.
X = tf.placeholder(dtype=tf.float32,
                   shape=[None, train_X.shape[1], train_X.shape[2]],
                   name="X")