In [67]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.utils.data as Data
import torchvision
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
import numpy as np
%matplotlib inline

from preprocess import *

In [68]:
# Hyper Parameters
EPOCH = 10         # number of passes the training loop makes over `train`
BATCH_SIZE = 60    # rows per mini-batch requested from get_batches during training
LR = 0.005         # learning rate passed to the Adam optimizer

In [69]:
# generate_count_vectorizer is not defined in this notebook (presumably it comes
# from the `preprocess` star-import -- TODO confirm).
# As used further down: X is a 2-D NumPy matrix with one row per answer, cv exposes
# .transform(), and answers is an indexable collection whose items have .content.
# word_ratio is not used in the cells shown here.
X, cv, answers, word_ratio = generate_count_vectorizer()

In [70]:
# Number of columns in X; used as the autoencoder's input and output width.
vol_sz = X.shape[1]

In [71]:
# Wrap the NumPy matrix as a tensor. torch.from_numpy shares memory with X and keeps
# its dtype; the training loop converts each batch to float before use.
train = torch.from_numpy(X)

In [72]:
# Show the training tensor's dimensions (rows, columns).
print(train.size())

torch.Size([600, 7574])


In [73]:
def get_batches(x, batch_size=100):
    """Yield consecutive, equally sized slices of ``x``.

    Only complete batches are produced: any trailing items that cannot fill a
    whole batch are discarded before slicing.

    Args:
        x: A sliceable sequence supporting ``len()`` (list, array, tensor, ...).
        batch_size: Number of items per yielded slice.

    Yields:
        Slices of ``x`` of length ``batch_size``, in their original order.
    """
    # Largest multiple of batch_size that fits into x.
    usable_len = (len(x) // batch_size) * batch_size
    trimmed = x[:usable_len]
    yield from (
        trimmed[start:start + batch_size]
        for start in range(0, len(trimmed), batch_size)
    )

In [74]:
# Sanity check: walk the batches once and print each batch's size.
# The enumerate index `ii` is not used inside the loop.
for ii, x in enumerate(get_batches(train, 100), 1):
    print(x.size())

torch.Size([100, 7574])
torch.Size([100, 7574])
torch.Size([100, 7574])
torch.Size([100, 7574])
torch.Size([100, 7574])
torch.Size([100, 7574])


In [75]:
class AutoEncoder(nn.Module):
    """Symmetric fully connected autoencoder.

    The encoder narrows the input through 512 -> 256 -> 128 -> ``latent_dim``
    units with Tanh between layers (the code layer itself is linear). The
    decoder mirrors that path back to ``input_dim`` and ends in a Sigmoid, so
    every reconstructed value lies in (0, 1).

    Args:
        input_dim: Width of each input vector. When omitted, the notebook-level
            ``vol_sz`` is used, so ``AutoEncoder()`` behaves as before.
        latent_dim: Size of the bottleneck code. Defaults to 32.
    """

    def __init__(self, input_dim=None, latent_dim=32):
        super(AutoEncoder, self).__init__()

        if input_dim is None:
            # Backward-compatible default: vocabulary size defined earlier in the notebook.
            input_dim = vol_sz

        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.Tanh(),
            nn.Linear(512, 256),
            nn.Tanh(),
            nn.Linear(256, 128),
            nn.Tanh(),
            nn.Linear(128, latent_dim),   # compress to the latent_dim-wide code (32 by default)
        )
        # NOTE(review): the Sigmoid bounds reconstructions to (0, 1). If the inputs
        # are raw counts that can exceed 1, they cannot be reproduced exactly -- confirm
        # the input scaling.
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 128),
            nn.Tanh(),
            nn.Linear(128, 256),
            nn.Tanh(),
            nn.Linear(256, 512),
            nn.Tanh(),
            nn.Linear(512, input_dim),
            nn.Sigmoid(),       # squash outputs into the range (0, 1)
        )

    def forward(self, x):
        """Encode ``x`` and reconstruct it.

        Args:
            x: Float tensor whose last dimension equals the model's input width.

        Returns:
            Tuple ``(encoded, decoded)``: the bottleneck code and its reconstruction.
        """
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

In [81]:
# Build the model and train it to reconstruct its own input.
autoencoder = AutoEncoder()
print(autoencoder)

optimizer = torch.optim.Adam(autoencoder.parameters(), lr=LR)
loss_func = nn.MSELoss()

# NOTE(review): the loss values logged in the saved output are essentially flat
# between the first and last epoch; get_batches also serves the rows in the same
# order every epoch. Worth revisiting LR / input scaling / shuffling.
autoencoder.train()
for epoch in range(EPOCH):
    for step, x in enumerate(get_batches(train, BATCH_SIZE), 1):
        # Input and target are the same batch, cast to float for the Linear layers.
        # Plain tensors are enough here; the Variable wrapper is deprecated.
        b_x = x.view(-1, vol_sz).float()    # batch x, shape (batch, vol_sz)
        b_y = b_x                           # reconstruction target

        encoded, decoded = autoencoder(b_x)

        loss = loss_func(decoded, b_y)      # mean square error
        optimizer.zero_grad()               # clear gradients for this training step
        loss.backward()                     # backpropagation, compute gradients
        optimizer.step()                    # apply gradients

        if step % 5 == 0 and epoch in [0, 5, EPOCH-1]:
            # loss is a 0-dim tensor: .item() extracts the Python float.
            # (loss.data[0] raises an error on current PyTorch.)
            print('Epoch: ', epoch, '| train loss: %.4f' % loss.item())

AutoEncoder(
  (encoder): Sequential(
    (0): Linear(in_features=7574, out_features=512, bias=True)
    (1): Tanh()
    (2): Linear(in_features=512, out_features=256, bias=True)
    (3): Tanh()
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): Tanh()
    (6): Linear(in_features=128, out_features=32, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=32, out_features=128, bias=True)
    (1): Tanh()
    (2): Linear(in_features=128, out_features=256, bias=True)
    (3): Tanh()
    (4): Linear(in_features=256, out_features=512, bias=True)
    (5): Tanh()
    (6): Linear(in_features=512, out_features=7574, bias=True)
    (7): Sigmoid()
  )
)




Epoch:  0 | train loss: 0.0417
Epoch:  0 | train loss: 0.0297
Epoch:  5 | train loss: 0.0413
Epoch:  5 | train loss: 0.0298
Epoch:  9 | train loss: 0.0414
Epoch:  9 | train loss: 0.0297


In [82]:
# Switch the model to evaluation mode before running inference.
# The network defined above contains only Linear/Tanh/Sigmoid layers, so this does
# not change its outputs; it is kept as standard practice.
autoencoder.eval()

AutoEncoder(
  (encoder): Sequential(
    (0): Linear(in_features=7574, out_features=512, bias=True)
    (1): Tanh()
    (2): Linear(in_features=512, out_features=256, bias=True)
    (3): Tanh()
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): Tanh()
    (6): Linear(in_features=128, out_features=32, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=32, out_features=128, bias=True)
    (1): Tanh()
    (2): Linear(in_features=128, out_features=256, bias=True)
    (3): Tanh()
    (4): Linear(in_features=256, out_features=512, bias=True)
    (5): Tanh()
    (6): Linear(in_features=512, out_features=7574, bias=True)
    (7): Sigmoid()
  )
)

In [87]:
## evaluation
# Encode one known answer and decode it again as a round-trip check.
ques = answers[13]
# Vectorize the answer text with the fitted vectorizer and flatten to a dense 1-D row
# (cv.transform presumably returns a sparse matrix -- hence .toarray()).
qes = cv.transform([ques.content])[0].toarray()[0]
qesT = torch.from_numpy(qes).float()

# Inference only: skip autograd bookkeeping so no graph is attached to the results.
with torch.no_grad():
    fea = autoencoder.encoder(qesT)
    back = autoencoder.decoder(fea)

print(fea)
print(back)


tensor([ 0.8494, -2.9037, -1.4144, -4.9663,  2.1244,  3.6652, -3.3771,
         0.9677,  2.4564, -2.0442, -1.3018, -3.1650,  0.5961, -3.3204,
        -2.4065,  1.5097,  2.9149,  3.3373,  4.0288,  2.4421,  1.3653,
        -1.7443,  3.0628,  3.7115,  3.3038, -1.6719,  3.4336, -2.3190,
         3.5927,  3.1077, -0.4736,  2.7039])
tensor([ 2.1554e-07,  3.1428e-07,  2.0067e-07,  ...,  1.5178e-06,
         1.4302e-06,  2.7688e-07])


In [84]:
# Peek at the first row of X (the matrix returned by generate_count_vectorizer).
print(X[0])

[0 0 0 ... 0 0 0]


In [88]:
def getTrainFeatures(ae, X):
    """Encode every row of ``X`` with the encoder of the given model.

    Args:
        ae: Object exposing a callable ``encoder`` attribute (e.g. a trained
            ``AutoEncoder``). This model is the one used; the function no longer
            falls back on the notebook-level ``autoencoder`` global.
        X: 2-D NumPy array; each row is converted to a float tensor and encoded
            on its own.

    Returns:
        List with one encoded tensor per row of ``X``, in row order.
    """
    # Feature extraction only: no gradients are needed, so avoid building a graph
    # for every row.
    with torch.no_grad():
        return [
            ae.encoder(torch.from_numpy(X[i]).float())
            for i in range(X.shape[0])
        ]

# Encode all rows of X once so later queries can be ranked against them.
hiddenTrain = getTrainFeatures(autoencoder, X)

In [89]:
# find the best answer
def getIndexOrderList(ques, ansLst):
    """Rank the entries of ``ansLst`` by similarity to ``ques``.

    Args:
        ques: Query representation passed as the first argument to
            ``cosine_similarity``.
        ansLst: Sequence of candidate representations.

    Returns:
        List of indices into ``ansLst``, most similar first. Candidates with
        equal scores keep their original relative order.
    """
    scores = [cosine_similarity(ques, candidate) for candidate in ansLst]
    positions = list(range(len(scores)))
    return sorted(positions, key=scores.__getitem__, reverse=True)

# Rank every encoded row against `fea`, the encoding of answers[13] computed above.
# Index 13 coming out first is the expected sanity check (assuming row i of X
# corresponds to answers[i] -- TODO confirm).
# NOTE(review): this prints the full ranking; slicing to the top few would keep the
# output readable.
print(getIndexOrderList(fea, hiddenTrain))

[13, 287, 285, 19, 367, 390, 481, 538, 332, 383, 43, 306, 394, 364, 450, 90, 52, 201, 241, 485, 37, 250, 557, 42, 505, 533, 246, 516, 361, 552, 180, 212, 405, 261, 11, 38, 100, 324, 291, 546, 369, 292, 413, 467, 318, 528, 4, 284, 20, 264, 509, 280, 506, 18, 483, 393, 514, 449, 416, 234, 508, 207, 592, 550, 544, 198, 319, 303, 215, 466, 371, 27, 415, 45, 289, 216, 189, 231, 195, 183, 379, 338, 49, 242, 524, 487, 408, 595, 522, 270, 197, 238, 10, 429, 539, 464, 382, 479, 255, 469, 9, 190, 315, 321, 248, 387, 529, 586, 460, 442, 24, 486, 317, 251, 377, 236, 542, 454, 525, 380, 560, 452, 407, 39, 526, 549, 275, 200, 521, 47, 470, 448, 0, 462, 417, 342, 354, 459, 274, 589, 323, 497, 217, 511, 253, 21, 426, 458, 534, 430, 15, 352, 357, 348, 419, 567, 358, 313, 308, 388, 294, 523, 347, 244, 391, 333, 473, 504, 553, 204, 492, 373, 199, 434, 457, 259, 513, 410, 277, 211, 482, 433, 269, 441, 392, 271, 214, 385, 283, 436, 116, 372, 493, 438, 478, 297, 368, 272, 305, 362, 554, 428, 360, 548, 276, 