[参考](https://qiita.com/nymwa/items/9c8484ff511123e03ba8)

# 70

In [19]:
import os
import re

import numpy as np
import spacy
from gensim.models import KeyedVectors

In [20]:
# spaCy pipeline; in the code visible here only its tokenizer (nlp.make_doc) is used.
# NOTE(review): the 'en' shortcut depends on the installed spaCy version/links - confirm.
nlp = spacy.load('en')

# Each (code, name) pair keeps a category code next to its readable name;
# zip() then splits the pairs back into the two parallel lists.
categories, category_names = (
    list(column)
    for column in zip(
        ('b', 'business'),
        ('t', 'science and technology'),
        ('e', 'entertainment'),
        ('m', 'health'),
    )
)

In [21]:
def tokenize(x):
    """Split a raw string into a list of token strings.

    Every run of whitespace is first collapsed to a single space; the result
    is handed to ``nlp.make_doc`` and the ``.text`` of each token is collected.
    """
    collapsed = re.sub(r'\s+', ' ', x)
    return [token.text for token in nlp.make_doc(collapsed)]

def read_feature_dataset(filename):
    """Read a tab-separated ``category<TAB>text`` file into tokens and label ids.

    Args:
        filename: Path of a text file with one record per line; the first
            tab-separated field is a category code found in ``categories``
            and the second field is the text to tokenize.

    Returns:
        A tuple ``(dataset_x, dataset_t)``: ``dataset_x`` is a list of token
        lists (one per record, produced by ``tokenize``) and ``dataset_t`` is
        the list of integer labels, i.e. each code's index in ``categories``.

    Raises:
        ValueError: If a record's first field is not in ``categories``.
        IndexError: If a non-empty line has no second tab-separated field.
    """
    dataset_x, dataset_t = [], []
    with open(filename) as f:
        # Iterate the file instead of using str.splitlines(): splitlines() also
        # breaks on characters such as \x0b, \x0c, \x1c-\x1e, \x85, \u2028 and
        # \u2029, which would cut a record in two if the text contains one.
        for raw_line in f:
            line = raw_line.rstrip('\n')
            if not line:
                # Tolerate blank lines rather than failing on a missing field.
                continue
            fields = line.split('\t')
            dataset_t.append(categories.index(fields[0]))
            dataset_x.append(tokenize(fields[1]))
    return dataset_x, dataset_t

In [22]:
# Read the three splits in train/valid/test order; every call yields a
# (token lists, label ids) pair that is unpacked into the matching names.
(train_x, train_t), (valid_x, valid_t), (test_x, test_t) = [
    read_feature_dataset(split_file)
    for split_file in ('train.txt', 'valid.txt', 'test.txt')
]

In [23]:
# Load the saved word vectors from the local .kv file. mmap='r' is passed so
# that gensim memory-maps the stored arrays read-only (per gensim's load API)
# instead of reading them fully into memory.
model = KeyedVectors.load('GoogleNews-vectors-negative300.kv', mmap='r')

In [24]:
# Display the loaded object's repr as a quick check that loading succeeded.
model

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x1a37547210>

### TensorFlowでいきます

In [25]:
import tensorflow as tf

In [26]:
def sent_to_vector(sent):
    """Average the word vectors of a sentence's in-vocabulary tokens.

    Args:
        sent: Iterable of token strings.

    Returns:
        A 1-D tensor holding the element-wise mean of ``model[token]`` over
        the tokens of ``sent`` that are present in ``model``. If no token is
        present, a zero vector of length ``model.vector_size`` is returned.
    """
    # Collect the raw NumPy vectors and average them once, so only a single
    # tensor is created per sentence rather than one per token.
    vectors = [model[token] for token in sent if token in model]
    if not vectors:
        # Without this guard the mean would divide by zero for a sentence
        # made up entirely of out-of-vocabulary tokens. float32 matches the
        # dtype of the features used downstream in this notebook.
        return tf.zeros([model.vector_size], dtype=tf.float32)
    return tf.constant(np.mean(vectors, axis=0))

def dataset_to_vector(dataset):
    """Stack one sentence vector per dataset entry into a single tensor.

    Each entry of ``dataset`` is passed to ``sent_to_vector``; the results are
    combined with ``tf.stack`` in the same order as the input.
    """
    sentence_vectors = []
    for tokens in dataset:
        sentence_vectors.append(sent_to_vector(tokens))
    return tf.stack(sentence_vectors)

In [27]:
# Turn each split's token lists into a stacked feature tensor.
train_v, valid_v, test_v = [
    dataset_to_vector(split_tokens)
    for split_tokens in (train_x, valid_x, test_x)
]

In [28]:
# Replace each split's label list with a constant tensor of the same values.
train_t, valid_t, test_t = [
    tf.constant(split_labels)
    for split_labels in (train_t, valid_t, test_t)
]

In [38]:
# Inspect the training label tensor (shape and dtype appear in the cell output).
train_t

<tf.Tensor: shape=(10684,), dtype=int32, numpy=array([1, 0, 3, ..., 0, 1, 2], dtype=int32)>

In [30]:
import pickle

In [31]:
# Persist every split's feature and label tensors as
# data/<split>.feature.pickle and data/<split>.label.pickle.
# open(..., 'wb') does not create missing directories, so make sure the
# output directory exists before writing.
os.makedirs('data', exist_ok=True)

for split_name, split_features, split_labels in (
    ('train', train_v, train_t),
    ('valid', valid_v, valid_t),
    ('test', test_v, test_t),
):
    with open('data/{}.feature.pickle'.format(split_name), 'wb') as f:
        pickle.dump(split_features, f)
    with open('data/{}.label.pickle'.format(split_name), 'wb') as f:
        pickle.dump(split_labels, f)

# 71

In [92]:
# Check the feature tensor's shape (one row per training example).
train_v.shape

TensorShape([10684, 300])

In [82]:
# Take the first training feature vector and reshape it into a single-row
# matrix so it can be multiplied with a weight matrix.
train_v_0 = tf.reshape(tensor=train_v[0], shape=(1, 300))
train_v_0.shape

TensorShape([1, 300])

In [83]:
# Weight matrix of shape (300, 4): 300 matches the feature width shown above
# and 4 matches the number of entries in `categories`.
# NOTE(review): every weight starts at 1.0, so all four class scores are
# identical for any input and the softmax further down is uniform (0.25 each).
# If the exercise expects a randomly initialised matrix, confirm and switch.
W = tf.Variable(tf.ones([300,4], dtype=tf.float32))
W

<tf.Variable 'Variable:0' shape=(300, 4) dtype=float32, numpy=
array([[1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       ...,
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.]], dtype=float32)>

In [85]:
@tf.function
def f(x):
    """Return the matrix product of ``x`` with the module-level weights ``W``."""
    product = tf.matmul(x, W)
    return product

In [93]:
# Apply f to the first four training feature rows and normalise each row of
# scores with a softmax over the last axis, giving per-class probabilities.
perceptron = tf.nn.softmax(f(train_v[:4]), axis=-1)
perceptron

<tf.Tensor: shape=(4, 4), dtype=float32, numpy=
array([[0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25]], dtype=float32)>