In [1]:
import boto3
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('qiita2')
table.item_count

12838

In [2]:
response = table.scan()
data = response['Items']

while 'LastEvaluatedKey' in response:
    response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
    data.extend(response['Items'])
articles = data

In [3]:
for k in articles[0]["json"]:
    print(k)

private
user
created_at
tags
gist_url
stock_users
title
raw_body
uuid
stock_count
stocked
body
id
created_at_as_seconds
created_at_in_words
updated_at
updated_at_in_words
comment_count
url
tweet


In [4]:
articles[0]["json"]["tags"]

[{'following': False,
  'icon_url': 'https://s3-ap-northeast-1.amazonaws.com/qiita-tag-image/f9384932787d996b6e1ed26d7d0b2bc2a9e36441/medium.jpg?1443704516',
  'name': '初心者',
  'url_name': '%e5%88%9d%e5%bf%83%e8%80%85',
  'versions': []},
 {'following': False,
  'icon_url': '//cdn.qiita.com/assets/icons/medium/missing.png',
  'name': 'selenium-webdriver',
  'url_name': 'selenium-webdriver',
  'versions': []},
 {'following': False,
  'icon_url': '//cdn.qiita.com/assets/icons/medium/missing.png',
  'name': 'GeckoDriver',
  'url_name': 'geckodriver',
  'versions': []}]

In [5]:
body_data = []
tag_data = []
for a in articles:
    json = a["json"]
    body = json["title"]
    tags = json["tags"]
    for tag in tags:        
        body_data.append(body)
        tag_data.append(tag["name"])

len(body_data)

37646

In [6]:
import pandas as pd
df = pd.read_json("tags.json")
df.head()

Unnamed: 0,followers_count,icon_url,id,items_count
0,17743,https://s3-ap-northeast-1.amazonaws.com/qiita-...,Ruby,12523
1,30279,https://s3-ap-northeast-1.amazonaws.com/qiita-...,JavaScript,12357
2,16438,https://s3-ap-northeast-1.amazonaws.com/qiita-...,Python,8890
3,13196,https://s3-ap-northeast-1.amazonaws.com/qiita-...,iOS,8547
4,19287,https://s3-ap-northeast-1.amazonaws.com/qiita-...,PHP,8233


In [7]:
name_list = df.ix[:,"id"].tolist()[:100]
name_list.append("other")
one_hot = pd.get_dummies(name_list)
one_hot.ix[:,"Ruby"]
one_hot.ix[:,"Ruby"].shape

(101,)

In [8]:
one_hot_data = []
for t in tag_data:
    if t not in name_list:
        t = "other"
    d = one_hot.ix[:,t]
    one_hot_data.append(d)
len(one_hot_data)
one_hot_data[0].shape

(101,)

In [9]:
train_size = int(len(one_hot_data)*0.8)
print(train_size)
training_predictors_tf = body_data[:train_size]
test_predictors_tf = body_data[train_size:]
training_classes_tf = one_hot_data[:train_size]
test_classes_tf = one_hot_data[train_size:]


30116


In [10]:
import MeCab
tagger = MeCab.Tagger('-Owakati -d /usr/lib/mecab/dic/mecab-ipadic-neologd')
contents = []
for b in body_data:
    word = tagger.parse(b).split(' ')
    word = [ w.strip() for w in word ]
    contents.append(word)
max_size = max([ len(c) for c in body_data ])
print(max_size)
padded_contents = []
for i in range(len(contents)):
    content = contents[i]
    padded_contents.append(content + [ '<PAD/>' ] * (max_size - len(content)))
contents = padded_contents

210


In [15]:
padded_contents = [c +  [ '<PAD/>' ] * (max_size - len(c))for c in contents]

In [16]:
import itertools
from collections import Counter
ctr = Counter(itertools.chain(*contents))
dictionaries = [c[0] for c in ctr.most_common()]
dictionaries_inv = { c: i for i, c in enumerate(dictionaries) }

data = [ [ dictionaries_inv[word] for word in content ] for content in contents ]


In [17]:
dictionaries_inv

{'': 1,
 'amazonES': 6247,
 'イマイチ': 4627,
 'Pull': 3320,
 'Pharo': 10065,
 '23時': 10066,
 'yadockeri': 8183,
 'f8': 12400,
 'AirPods': 6248,
 '态类': 10067,
 'opt': 6249,
 'WebPay': 4632,
 '実証': 3836,
 'Mathematica': 10080,
 'de': 3300,
 'QRNN': 6250,
 '防衛戦': 14632,
 '更に': 15531,
 'tf': 1243,
 'つくり': 2730,
 'SpatialIndex': 10070,
 'タスクバー': 15246,
 'BLAS': 10071,
 '1004': 14633,
 '多い': 1868,
 'Pidora': 12403,
 'webkit': 3321,
 'Chromebook': 8185,
 'Selemiun': 6252,
 'AssetBundle': 8186,
 '非エンジニア': 2249,
 'already': 5688,
 '自分自身': 6253,
 'Plack': 6254,
 'ユニット': 8187,
 'Mac': 66,
 '全称': 12404,
 'BIND': 10072,
 '打ち': 3579,
 'Tips': 440,
 'be': 1603,
 '新サービス': 8188,
 '戻り値': 8189,
 '3/3': 12405,
 'コミット': 1123,
 'サーバレス': 1746,
 'bento': 6255,
 'お絵かき': 5157,
 '\x08': 1869,
 'contents': 8190,
 'Messenger': 925,
 'BLE': 2731,
 'すこし': 12406,
 '付け方': 13947,
 'Enum': 4239,
 'Restore': 13795,
 '長音符': 6256,
 '目的': 8191,
 ';);': 14634,
 'Kubernetes': 926,
 'IBMid': 12407,
 'XenCenter': 8192,
 'named': 3

In [18]:
import numpy as np
import tensorflow as tf

#x, y, d = data_helper.load_data_and_labels_and_dictionaries()

# Split original data into two groups for training and testing.
train_x = data[:train_size]
train_y = one_hot_data[:train_size]
test_x = data[train_size:]
test_y = one_hot_data[train_size:]
NUM_CLASSES = 101

NUM_TESTS         = 2000
NUM_EPOCHS        = 10
NUM_MINI_BATCH    = 64
EMBEDDING_SIZE    = 128
NUM_FILTERS       = 128
FILTER_SIZES      = [ 3, 4, 5 ]
L2_LAMBDA         = 0.0001
EVALUATE_EVERY    = 100
CHECKPOINTS_EVERY = 1000
SUMMARY_LOG_DIR = "tmp/tensorflow_log"
CHECKPOINTS_DIR = 'checkpoints'

keep = tf.placeholder(tf.float32)





input_x = tf.placeholder(tf.int32, [None, max_size])
input_y = tf.placeholder(tf.float32, [None, NUM_CLASSES])


with tf.name_scope('embedding'):
    w  = tf.Variable(tf.random_uniform([len(dictionaries), EMBEDDING_SIZE], -1.0, 1.0), name='weight')
    e  = tf.nn.embedding_lookup(w, input_x)
    ex = tf.expand_dims(e, -1)

# Define 3rd and 4th layer (Temporal 1-D convolutional and max-pooling layer).
p_array = []
for filter_size in FILTER_SIZES:
    with tf.name_scope('conv-%d' % filter_size):
        w  = tf.Variable(tf.truncated_normal([ filter_size, EMBEDDING_SIZE, 1, NUM_FILTERS ], stddev=0.02), name='weight')
        b  = tf.Variable(tf.constant(0.1, shape=[ NUM_FILTERS ]), name='bias')
        c0 = tf.nn.conv2d(ex, w, [ 1, 1, 1, 1 ], 'VALID')
        c1 = tf.nn.relu(tf.nn.bias_add(c0, b))
        c2 = tf.nn.max_pool(c1, [ 1,  max_size - filter_size + 1, 1, 1 ], [ 1, 1, 1, 1 ], 'VALID')
        p_array.append(c2)

p = tf.concat(3, p_array)


with tf.name_scope('fc'):
    total_filters = NUM_FILTERS * len(FILTER_SIZES)
    w = tf.Variable(tf.truncated_normal([ total_filters, NUM_CLASSES ], stddev=0.02), name='weight')
    b = tf.Variable(tf.constant(0.1, shape=[ NUM_CLASSES ]), name='bias')
    h0 = tf.nn.dropout(tf.reshape(p, [ -1, total_filters ]), keep)
    predict_y = tf.nn.softmax(tf.matmul(h0, w) + b)


xentropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(predict_y, input_y))
loss = xentropy + L2_LAMBDA * tf.nn.l2_loss(w)

global_step = tf.Variable(0, name="global_step", trainable=False)
train = tf.train.AdamOptimizer(0.0001).minimize(loss, global_step=global_step)



predict  = tf.equal(tf.argmax(predict_y, 1), tf.argmax(input_y, 1))
accuracy = tf.reduce_mean(tf.cast(predict, tf.float32))

loss_sum   = tf.scalar_summary('train loss', loss)
accr_sum   = tf.scalar_summary('train accuracy', accuracy)
t_loss_sum = tf.scalar_summary('general loss', loss)
t_accr_sum = tf.scalar_summary('general accuracy', accuracy)

saver = tf.train.Saver()


with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    writer = tf.train.SummaryWriter(SUMMARY_LOG_DIR, sess.graph)

    train_x_length = len(train_x)
    batch_count = int(train_x_length / NUM_MINI_BATCH) + 1

    print('Start training.')
    print('     epoch: %d' % NUM_EPOCHS)
    print('mini batch: %d' % NUM_MINI_BATCH)
    print('train data: %d' % train_x_length)
    print(' test data: %d' % len(test_x))
    print('We will loop %d count per an epoch.' % batch_count)

   
    for epoch in range(NUM_EPOCHS):
        random_indice = np.random.permutation(train_x_length)
        print('Start %dth epoch.' % (epoch + 1))
        for i in range(batch_count):
            mini_batch_x = []
            mini_batch_y = []
            for j in range(min(train_x_length - i * NUM_MINI_BATCH, NUM_MINI_BATCH)):
                mini_batch_x.append(train_x[random_indice[i * NUM_MINI_BATCH + j]])
                mini_batch_y.append(train_y[random_indice[i * NUM_MINI_BATCH + j]])

            
            _, v1, v2, v3, v4 = sess.run(
                [ train, loss, accuracy, loss_sum, accr_sum ],
                feed_dict={ input_x: mini_batch_x, input_y: mini_batch_y, keep: 0.5 }
            )
            print('%4dth mini batch complete. LOSS: %f, ACCR: %f' % (i + 1, v1, v2))

            current_step = tf.train.global_step(sess, global_step)
            writer.add_summary(v3, current_step)
            writer.add_summary(v4, current_step)


            if current_step % CHECKPOINTS_EVERY == 0:
                saver.save(sess, CHECKPOINTS_DIR + '/model', global_step=current_step)
                print('Checkout was completed.')

 
            if current_step % EVALUATE_EVERY == 0:
                #random_test_indice = np.random.permutation(100)
                #random_test_x = test_x[int(random_test_indice)]
                #random_test_y = test_y[int(random_test_indice)]

                v1, v2, v3, v4 = sess.run(
                    [ loss, accuracy, t_loss_sum, t_accr_sum ],
                    feed_dict={ input_x: test_x, input_y: test_y, keep: 1.0 }
                )
                print('Testing... LOSS: %f, ACCR: %f' % (v1, v2))
                writer.add_summary(v3, current_step)
                writer.add_summary(v4, current_step)


    saver.save(sess, CHECKPOINTS_DIR + '/model-last')


Start training.
     epoch: 10
mini batch: 64
train data: 30116
 test data: 7530
We will loop 471 count per an epoch.
Start 1th epoch.
   1th mini batch complete. LOSS: 4.616714, ACCR: 0.000000
   2th mini batch complete. LOSS: 4.616011, ACCR: 0.015625
   3th mini batch complete. LOSS: 4.616055, ACCR: 0.015625
   4th mini batch complete. LOSS: 4.615865, ACCR: 0.000000
   5th mini batch complete. LOSS: 4.615811, ACCR: 0.015625
   6th mini batch complete. LOSS: 4.615452, ACCR: 0.000000
   7th mini batch complete. LOSS: 4.614844, ACCR: 0.000000
   8th mini batch complete. LOSS: 4.614759, ACCR: 0.062500
   9th mini batch complete. LOSS: 4.614327, ACCR: 0.031250
  10th mini batch complete. LOSS: 4.614604, ACCR: 0.015625
  11th mini batch complete. LOSS: 4.614009, ACCR: 0.078125
  12th mini batch complete. LOSS: 4.613506, ACCR: 0.078125
  13th mini batch complete. LOSS: 4.613304, ACCR: 0.062500
  14th mini batch complete. LOSS: 4.613041, ACCR: 0.093750
  15th mini batch complete. LOSS: 4.612

KeyboardInterrupt: 

In [9]:
!rm -rf tmp/tensorflow_log/*

import tensorflow as tf
import numpy as np
import random


num_of_input_nodes = 1
num_of_hidden_nodes = 101
num_of_output_nodes = 1
num_of_training_epochs = 50000
batch_size = 100
num_of_prediction_epochs = 100
learning_rate = 0.001
forget_bias = 0.9
num_of_sample = 1000
num_layers = 1

batch_size = 100
sequences_length = 30 
test_num = int(num_of_sample*0.3)
class_num = 101

vocabulary_size = 50000
embedding_size = 128 


def get_batch(batch_size, X, t):
    rnum = [random.randint(0, len(X) - 1) for x in range(batch_size)]
    xs = np.array([[[y] for y in list(X[r])] for r in rnum])
    ts = np.array([t[r] for r in rnum])
    return xs, ts


def create_batch(batch_size, X, t):
    #X = X.as_matrix()
    #t = t.as_matrix()
    rnum = [random.randint(0, len(X) - 1) for x in range(batch_size)]
    xs = np.array([[[y] for y in list(X[r])] for r in rnum])
    ts = np.array([t[r] for r in rnum])
    return xs, ts


def unpack_sequence(tensor):
    return tf.unpack(tf.transpose(tensor, perm=[1, 0, 2]))

def pack_sequence(sequence):
    return tf.transpose(tf.pack(sequence), perm=[1, 0, 2])

def inference(input_ph):
    with tf.name_scope("inference") as scope:
        in_size = num_of_hidden_nodes
        out_size = class_num
        weight = tf.Variable(tf.truncated_normal([in_size, out_size], stddev=0.1))
        bias = tf.Variable(tf.constant(0.1, shape=[out_size]))
        
        embedding = tf.get_variable("embedding", [vocabulary_size, embedding_size])
        
       
        # network = tf.nn.rnn_cell.LSTMCell(num_of_hidden_nodes)
        network = tf.nn.rnn_cell.GRUCell(num_of_hidden_nodes)
        network = tf.nn.rnn_cell.DropoutWrapper(network, output_keep_prob=0.5)
        network = tf.nn.rnn_cell.MultiRNNCell([network] * num_layers)
        
        inputs = tf.nn.embedding_lookup(embedding, input_ph)
        inputs =  unpack_sequence(inputs)
       
        rnn_output, states_op = tf.nn.rnn(network,inputs,dtype=tf.int32)
        #rnn_output, states_op = tf.nn.dynamic_rnn(network,inputs,dtype=tf.float32)
        
        
        #rnn_output = pack_sequence(rnn_output)
        #state_op = pack_sequence(states_op)
        
        output_op = tf.transpose(rnn_output, [1, 0, 2])
        #output_op = tf.gather(output, int(output.get_shape()[0]) - 1)
        #output_op = tf.nn.softmax(tf.matmul(rnn_output[-1], weight) + bias)

 
        tf.histogram_summary("weights", weight)
        tf.histogram_summary("biases", bias)
        tf.histogram_summary("output",  output_op)
        results = [weight, bias]
        return output_op, states_op, results


def loss(output_op, supervisor_ph):
    with tf.name_scope("loss") as scope:
        loss_op = - tf.reduce_sum(supervisor_ph * tf.log(output_op))
        tf.scalar_summary("loss", loss_op)
        return loss_op


def training(loss_op):
    with tf.name_scope("training") as scope:
        training_op = optimizer.minimize(loss_op)
        return training_op

def accuracy(output_op, supervisor_ph):
    with tf.name_scope("accuracy") as scope:
        correct_prediction = tf.equal(tf.argmax(output_op,1), tf.argmax(supervisor_ph,1))
        accuracy_op = tf.reduce_mean(tf.cast(correct_prediction, "float"))
        tf.scalar_summary("accuracy", accuracy_op)
        return accuracy_op

def calc_accuracy(accuracy_opp, X, t):
    inputs, targets = create_batch(len(X), X, t)
    pred_dict = {
        input_ph:  inputs,
        supervisor_ph: targets
    }
    accurecy = sess.run(accuracy_op, feed_dict=pred_dict)
    print(accurecy)



random.seed(0)
np.random.seed(0)
tf.set_random_seed(0)

#optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate)

with tf.Graph().as_default():
    input_ph = tf.placeholder(tf.int32, [None, sequences_length, num_of_input_nodes], name="input")
    supervisor_ph = tf.placeholder(tf.float32, [None, class_num], name="supervisor")

    output_op, states_op, datas_op = inference(input_ph)
    loss_op = loss(output_op, supervisor_ph)
    training_op = training(loss_op)
    accuracy_op = accuracy(output_op, supervisor_ph)

    summary_op = tf.merge_all_summaries()
    init = tf.initialize_all_variables()

    with tf.Session() as sess:
        saver = tf.train.Saver()
        summary_writer = tf.train.SummaryWriter("tmp/tensorflow_log", graph=sess.graph)
        sess.run(init)

        for epoch in range(num_of_training_epochs):
            #inputs, supervisors = create_batch(batch_size, training_predictors_tf , training_classes_tf)
            inputs, supervisors = training_predictors_tf , training_classes_tf
            train_dict = {
                input_ph:   inputs,
                supervisor_ph: supervisors
            }
            sess.run(training_op, feed_dict=train_dict)

            if (epoch) % 1000 == 0:
                #summary_str, loss = sess.run([summary_op, loss_op], feed_dict=train_dict)
                loss = sess.run(loss_op, feed_dict=train_dict)
                print("train#{}, loss: {}".format(epoch, loss))
                #summary_writer.add_summary(summary_str, epoch)
                if (epoch) % 5000 == 0:
                    calc_accuracy(output_op, test_predictors_tf, test_classes_tf)
        calc_accuracy(output_op, X_test, t_test)
        datas = sess.run(datas_op)
        saver.save(sess, "model.ckpt")

ValueError: Shapes (3,) and (4,) are not compatible