# 4. Text classification and regression

### 4.2. Text classification for Reuters news

In [None]:
from keras.datasets import reuters
import numpy as np
from autokeras import TextClassifier

def convert_labels_to_one_hot(labels, num_classes):
    """Encode integer class labels as a dense one-hot matrix.

    Args:
        labels: Sequence of integer class ids, one per sample.
        num_classes: Number of columns in the resulting matrix.

    Returns:
        A float array of shape (len(labels), num_classes) in which row i
        has a 1 in column labels[i] and 0 everywhere else.
    """
    n_samples = len(labels)
    encoded = np.zeros(shape=(n_samples, num_classes))
    row_ids = np.arange(n_samples)
    # Fancy indexing pairs each row id with its label column.
    encoded[row_ids, labels] = 1
    return encoded

def decode_newswires(encoded_docs, index_to_word, index_offset=3, unknown='?'):
    """Turn sequences of word indices back into space-separated strings.

    Args:
        encoded_docs: Iterable of documents, each an iterable of integer
            word indices.
        index_to_word: Mapping from word index to word.
        index_offset: Value subtracted from every index before the lookup.
        unknown: Placeholder used when the shifted index has no word.

    Returns:
        A list with one decoded string per input document.
    """
    return [
        ' '.join(index_to_word.get(idx - index_offset, unknown) for idx in doc)
        for doc in encoded_docs
    ]


(x_train, y_train), (x_test, y_test) = reuters.load_data()

# Invert the word -> index mapping so that documents can be decoded.
word_indexes = reuters.get_word_index(path="reuters_word_index.json")
indexes_to_word = {index: word for word, index in word_indexes.items()}

# The indices are shifted by 3 before the lookup; per the Keras dataset docs
# load_data() offsets word indices by `index_from=3` by default.
x_train_str = decode_newswires(x_train, indexes_to_word)
x_test_str = decode_newswires(x_test, indexes_to_word)

# Size the one-hot matrix from both splits so that a class id which only
# occurs in the test split cannot cause an out-of-range column index.
num_labels = int(max(np.max(y_train), np.max(y_test))) + 1

y_train = convert_labels_to_one_hot(y_train, num_labels)
y_test = convert_labels_to_one_hot(y_test, num_labels)

# Train on the first 1000 documents only (search limited to 20 minutes),
# then evaluate on the complete test split.
clf = TextClassifier(verbose=True)
clf.fit(x=x_train_str[:1000], y=y_train[:1000], time_limit=20 * 60)
results = clf.evaluate(x_test_str, y_test)
print(results)


### 4.3. Text classification for spam filtering

In [None]:
# data set taken from: https://archive.ics.uci.edu/ml/datasets/spambase
import numpy as np
from autokeras import MlpModule
from autokeras.backend.torch.loss_function import classification_loss
from autokeras.backend.torch import DataTransformerMlp
from autokeras.nn.metric import Accuracy
from autokeras.preprocessor import OneHotEncoder

def to_one_hot(y):
    """One-hot encode `y` with a newly fitted OneHotEncoder.

    Args:
        y: Labels to encode; also used to fit the encoder.

    Returns:
        A tuple (encoded_labels, encoder) where `encoder` is the
        OneHotEncoder instance that was fitted on `y`.
    """
    encoder = OneHotEncoder()
    encoder.fit(y)
    return encoder.transform(y), encoder

data = np.loadtxt('spam_data.csv', delimiter=';')
np.random.shuffle(data)

# All columns except the last are features; the last column is the label.
x, y = data[:, :-1], data[:, -1]
y = np.asarray(y, dtype=np.int32)

n_train = 3500
x_train, y_train, x_test, y_test = x[:n_train], y[:n_train], x[n_train:], y[n_train:]

y_train, y_encoder = to_one_hot(y_train)
# Encode the test labels with the encoder fitted on the training labels.
# Fitting a second encoder on the test split could assign the classes to
# different one-hot columns than the ones the model is trained on.
y_test = y_encoder.transform(y_test)

# The transformer is constructed from the training features only and is then
# applied to both splits.
data_transformer = DataTransformerMlp(x_train)
train_data = data_transformer.transform_train(x_train, y_train)
# Use the test-time transform for held-out data rather than transform_train.
test_data = data_transformer.transform_test(x_test, y_test)

mlpModule = MlpModule(verbose=True, loss=classification_loss, metric=Accuracy)
mlpModule.fit(n_output_node=y_encoder.n_classes,
              input_shape=(-1, x_train.shape[1]),
              train_data=train_data,
              test_data=test_data,
              time_limit=20 * 60)

### 4.4. Text regression on a real-world data set

In [1]:
# data set taken from: https://www.kaggle.com/illgorhek/mercari-price#train.tsv
import numpy as np
from autokeras.text.text_supervised import TextRegressor

# Load the first 5000 rows after the header line as strings.
# comments=None turns off comment-character handling in np.loadtxt.
data = np.loadtxt(
    'mercari_train.tsv',
    dtype='str',
    delimiter='\t',
    comments=None,
    skiprows=1,
    max_rows=5000,
)
np.random.shuffle(data)
print(data.shape)

# Last column is the text input; third-from-last column is parsed as the
# float32 regression target.
x = data[:, -1]
y = np.asarray(data[:, -3], np.float32)
print(x[3], y[3])

# First 2000 shuffled rows for training, the remainder for evaluation.
n_train = 2000
x_train, x_test = x[:n_train], x[n_train:]
y_train, y_test = y[:n_train], y[n_train:]

reg = TextRegressor(verbose=True)
reg.fit(x=x_train, y=y_train, time_limit=20 * 60)
results = reg.evaluate(x_test, y_test)
print(results)

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
(5000, 8)
Material: Sterling Silver Condition: New Size: 7 Colors: Clear Esther Mahari - .925 Sterling Silver Huge Pear Shape Clear CZ Ring New in original packaging Free shipping 29.0


KeyboardInterrupt: 