In [1]:
import tensorflow as tf
from tensorflow.keras import layers, models
import pandas as pd
import numpy as np
from word2vec import getVectors
from getTrainTest import sample




In [2]:
# Load the dataset; the first CSV column becomes the DataFrame index.
df = pd.read_csv('train.csv', index_col=0)

# Split into train / validation / test, then run each split through getVectors
# (from the local word2vec module) and materialise the result as a NumPy array.
train, validation, test = sample(df)
vectorized_vectors_train = np.array(getVectors(train))
vectorized_vectors_validation = np.array(getVectors(validation))
vectorized_vectors_test = np.array(getVectors(test))

In [3]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Two-stage target encoding used by the cells below:
# class labels -> integer ids (LabelEncoder) -> one-hot rows (OneHotEncoder).
# Both objects are fitted on the training split and reused for the other splits.
encoder = LabelEncoder()
onehot = OneHotEncoder()

In [4]:
# Features: the vectorised training rows with the `trailing_space` column
# (cast element-wise to int) stacked on as an extra column.
trailing_space_train = np.array(list(map(int, train['trailing_space'])))
train_X = np.column_stack((vectorized_vectors_train, trailing_space_train))

# Targets: labels -> integer ids (as a single column) -> dense one-hot rows.
# This is the only place where the two encoders are fitted.
train_label_ids = np.array(encoder.fit_transform(train['label'])).reshape(-1, 1)
train_y = onehot.fit_transform(train_label_ids).toarray()

In [5]:
# Features: the vectorised validation rows with the `trailing_space` column
# (cast element-wise to int) stacked on as an extra column.
trailing_space_validation = np.array(list(map(int, validation['trailing_space'])))
validation_X = np.column_stack((vectorized_vectors_validation, trailing_space_validation))

# Targets: reuse the already-fitted encoders (transform only, no refit).
validation_label_ids = np.array(encoder.transform(validation['label'])).reshape(-1, 1)
validation_y = onehot.transform(validation_label_ids).toarray()

In [6]:
# Width of the output layer, read from the one-hot encoded training targets so
# the network always matches the number of encoded classes instead of relying
# on a hard-coded count.
num_classes = train_y.shape[1]

# Fully connected classifier: four ReLU hidden layers followed by a softmax
# layer that yields one probability per class.
model = models.Sequential([
    layers.Dense(512, activation='relu'),
    layers.Dense(2048, activation='relu'),
    layers.Dense(1024, activation='relu'),
    layers.Dense(512, activation='relu'),
    layers.Dense(num_classes, activation='softmax')
])

model.compile(
    loss='categorical_crossentropy',
    optimizer=tf.keras.optimizers.legacy.Adam(0.001),
    # 'categorical_crossentropy' is kept so existing history keys stay available,
    # but on its own it only repeats the loss; 'accuracy' adds a directly
    # interpretable score for the training and validation logs.
    metrics=['categorical_crossentropy', 'accuracy'],
)




In [7]:
# Single training epoch in mini-batches of 256, evaluated against the
# validation split. Left as a bare expression so the History object is displayed.
model.fit(
    train_X,
    train_y,
    validation_data=(validation_X, validation_y),
    batch_size=256,
    epochs=1,
)





<keras.src.callbacks.History at 0x1df24398b50>

In [8]:
# Features: the vectorised test rows with the `trailing_space` column
# (cast element-wise to int) stacked on as an extra column.
trailing_space_test = np.array(list(map(int, test['trailing_space'])))
test_X = np.column_stack((vectorized_vectors_test, trailing_space_test))

# Targets: reuse the already-fitted encoders (transform only, no refit).
test_label_ids = np.array(encoder.transform(test['label'])).reshape(-1, 1)
test_y = onehot.transform(test_label_ids).toarray()

In [9]:
# Softmax outputs for the test features: one row per sample, one column per class.
# The bare name on the last line displays the (truncated) array.
pred = model.predict(x=test_X)
pred



array([[2.6495660e-30, 1.4328980e-31, 9.0674623e-16, ..., 1.2200885e-26,
        0.0000000e+00, 1.0000000e+00],
       [0.0000000e+00, 0.0000000e+00, 7.3687768e-26, ..., 0.0000000e+00,
        0.0000000e+00, 1.0000000e+00],
       [1.0045765e-01, 1.3249974e-01, 5.9523210e-03, ..., 9.3853176e-02,
        4.8035022e-06, 4.8672028e-02],
       ...,
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 1.0000000e+00],
       [0.0000000e+00, 0.0000000e+00, 1.9201888e-31, ..., 0.0000000e+00,
        0.0000000e+00, 1.0000000e+00],
       [2.0019388e-35, 4.5816747e-37, 8.7741022e-19, ..., 5.0540474e-31,
        0.0000000e+00, 1.0000000e+00]], dtype=float32)

In [15]:
from sklearn.metrics import confusion_matrix

# Score the training split, then undo the two encoding stages:
# model outputs -> integer class ids (flattened to 1-D) -> original labels.
train_probabilities = model.predict(train_X)
train_predicted_ids = onehot.inverse_transform(train_probabilities).reshape(1, -1)[0]
pred_train = encoder.inverse_transform(train_predicted_ids)

# Rows are the true labels, columns the predicted labels.
confusion_matrix(train['label'], pred_train)



array([[     0,      0,      0,      0,  73770,      0,      0,      0,
             0,      0, 148327,      0],
       [     0,      0,      0,      0,  98991,      0,      0,      0,
             0,      0, 123106,      0],
       [     0,      0, 210656,      0,   4427,      0,      0,   6357,
             0,      0,    452,    205],
       [     0,      0,      0,      0,  36950,      0,      0,      0,
             0,      0, 185147,      0],
       [     0,      0,      0,      0, 222097,      0,      0,      0,
             0,      0,      0,      0],
       [     0,      0,      0,      0,  40807,      0,      0,      0,
             0,      0, 181290,      0],
       [     0,      0,      0,      0, 110873,      0,      0,      0,
             0,      0, 111224,      0],
       [     0,      0,   5449,      0,   2829,      0,      0, 212282,
             0,    528,   1009,      0],
       [     0,      0,      0,      0,  14599,      0,      0,      0,
             0,      0, 

In [10]:
pred = encoder.inverse_transform(onehot.inverse_transform(pred).reshape(1, -1)[0])

In [14]:
confusion_matrix(test['label'], pred)

array([[     0,      0,      0,      0,      1,      0,      0,      0,
             0,      0,      5,      0],
       [     0,      0,      0,      0,      5,      0,      0,      0,
             0,      0,     11,      0],
       [     0,      0,    192,      0,      4,      0,      0,      5,
             0,      0,      0,      0],
       [     0,      0,      0,      0,      1,      0,      0,      0,
             0,      0,      0,      0],
       [     0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0],
       [     0,      0,      0,      0,      2,      0,      0,      0,
             0,      0,     12,      0],
       [     0,      0,      0,      0,      1,      0,      0,      0,
             0,      0,      1,      0],
       [     0,      0,      2,      0,      3,      0,      0,    155,
             0,      1,      2,      0],
       [     0,      0,      0,      0,      0,      0,      0,      0,
             0,      0, 