In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.utils.data as Data
import torchvision
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
import numpy as np
%matplotlib inline

from preprocess import generate_count_vectorizer, cosine_similarity

In [2]:
# Hyper Parameters (all three are read by the training cell further down)
# EPOCH: number of full passes over the training tensor.
EPOCH = 3
# BATCH_SIZE: rows per mini-batch requested from get_batches during training.
BATCH_SIZE = 60
LR = 0.0005         # learning rate passed to the Adam optimizer

In [3]:
# Load everything produced by the project-local `preprocess` helper.
# NOTE(review): the helper is not visible here. From later cells, `cv` exposes
# `.transform`, `questions` items expose `.content` / `.peer_idx`, and
# `answer_mapping` is indexed by `peer_idx` — confirm the rest against preprocess.py.
X, cv, answers, questions, word_ratio, answer_mapping = generate_count_vectorizer()
# Densify X (presumably a SciPy sparse matrix — confirm) so it can be handed to
# torch.from_numpy below.
X = X.toarray()

In [4]:
# Number of columns in X; used as the autoencoder's input and output width.
vol_sz = X.shape[1]

In [5]:
# Wrap the dense array as a tensor. torch.from_numpy shares memory with X
# instead of copying, and keeps X's dtype (the training loop casts to float).
train = torch.from_numpy(X)

In [6]:
# Sanity check: the size should be (number of rows in X, vol_sz).
print(train.size())

torch.Size([600, 7574])


In [7]:
def get_batches(x, batch_size=100):
    """Yield consecutive, equally sized slices of ``x``.

    Only whole batches are produced: any trailing items that do not fill a
    complete batch are dropped before slicing starts.

    Args:
        x: Any sliceable sequence supporting ``len()`` (list, array, tensor).
        batch_size: Number of items per yielded slice.

    Yields:
        ``x[start:start + batch_size]`` for each full batch, in order.
    """
    full_batches = len(x) // batch_size
    trimmed = x[:full_batches * batch_size]
    yield from (
        trimmed[start:start + batch_size]
        for start in range(0, len(trimmed), batch_size)
    )

In [8]:
# Smoke test for get_batches: print the size of every batch of 100 rows.
# The enumerate counter `ii` is not used inside the loop.
for ii, x in enumerate(get_batches(train, 100), 1):
    print(x.size())

torch.Size([100, 7574])
torch.Size([100, 7574])
torch.Size([100, 7574])
torch.Size([100, 7574])
torch.Size([100, 7574])
torch.Size([100, 7574])


In [11]:
class AutoEncoder(nn.Module):
    """Symmetric fully connected autoencoder.

    The encoder narrows ``input_dim -> 512 -> 256 -> 128 -> latent_dim`` with
    a Tanh after every linear layer except the last. The decoder mirrors
    those widths back to ``input_dim`` and ends in a Sigmoid.

    Args:
        input_dim: Width of each input vector. ``None`` (the default) falls
            back to the notebook-level ``vol_sz``, so a bare ``AutoEncoder()``
            behaves exactly as before.
        latent_dim: Width of the bottleneck code. Defaults to 32.
    """

    def __init__(self, input_dim=None, latent_dim=32):
        super(AutoEncoder, self).__init__()
        if input_dim is None:
            # Backward compatibility: the original class read this global directly.
            input_dim = vol_sz
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.Tanh(),
            nn.Linear(512, 256),
            nn.Tanh(),
            nn.Linear(256, 128),
            nn.Tanh(),
            nn.Linear(128, latent_dim),   # bottleneck: no activation on the code
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 128),
            nn.Tanh(),
            nn.Linear(128, 256),
            nn.Tanh(),
            nn.Linear(256, 512),
            nn.Tanh(),
            nn.Linear(512, input_dim),
            # Squash reconstructions into (0, 1).
            # NOTE(review): if the inputs are raw term counts above 1, this
            # output range cannot reproduce them — confirm the scaling of X.
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return ``(encoded, decoded)`` for a batch ``x`` of width ``input_dim``."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

In [14]:
# Train the autoencoder to reconstruct its own input with MSE loss and Adam.
autoencoder = AutoEncoder()
print(autoencoder)

optimizer = torch.optim.Adam(autoencoder.parameters(), lr=LR)
loss_func = nn.MSELoss()

for epoch in range(EPOCH):
    for step, x in enumerate(get_batches(train, BATCH_SIZE), 1):
        # Input and reconstruction target are the same batch. Tensors can be
        # fed to the model directly; the Variable wrapper is no longer needed.
        b_x = x.view(-1, vol_sz).float()   # batch x, shape (batch, vol_sz)
        b_y = b_x                          # batch y, same data as b_x

        encoded, decoded = autoencoder(b_x)

        loss = loss_func(decoded, b_y)      # mean square error
        optimizer.zero_grad()               # clear gradients for this training step
        loss.backward()                     # backpropagation, compute gradients
        optimizer.step()                    # apply gradients

        # The loss is a 0-dim tensor: `loss.data[0]` is an indexing error on
        # current PyTorch, so read the Python float with .item() instead.
        print('Epoch: ', epoch, '| train loss: %.4f' % loss.item())

AutoEncoder(
  (encoder): Sequential(
    (0): Linear(in_features=7574, out_features=512, bias=True)
    (1): Tanh()
    (2): Linear(in_features=512, out_features=256, bias=True)
    (3): Tanh()
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): Tanh()
    (6): Linear(in_features=128, out_features=32, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=32, out_features=128, bias=True)
    (1): Tanh()
    (2): Linear(in_features=128, out_features=256, bias=True)
    (3): Tanh()
    (4): Linear(in_features=256, out_features=512, bias=True)
    (5): Tanh()
    (6): Linear(in_features=512, out_features=7574, bias=True)
    (7): Sigmoid()
  )
)
Epoch:  0 | train loss: 0.2768




Epoch:  0 | train loss: 0.2671
Epoch:  0 | train loss: 0.2658
Epoch:  0 | train loss: 0.2585
Epoch:  0 | train loss: 0.2588
Epoch:  0 | train loss: 0.2575
Epoch:  0 | train loss: 0.2308
Epoch:  0 | train loss: 0.2045
Epoch:  0 | train loss: 0.1822
Epoch:  0 | train loss: 0.1550
Epoch:  1 | train loss: 0.1287
Epoch:  1 | train loss: 0.0933
Epoch:  1 | train loss: 0.0706
Epoch:  1 | train loss: 0.0535
Epoch:  1 | train loss: 0.0515
Epoch:  1 | train loss: 0.0576
Epoch:  1 | train loss: 0.0378
Epoch:  1 | train loss: 0.0216
Epoch:  1 | train loss: 0.0259
Epoch:  1 | train loss: 0.0298
Epoch:  2 | train loss: 0.0392
Epoch:  2 | train loss: 0.0310
Epoch:  2 | train loss: 0.0350
Epoch:  2 | train loss: 0.0315
Epoch:  2 | train loss: 0.0408
Epoch:  2 | train loss: 0.0526
Epoch:  2 | train loss: 0.0344
Epoch:  2 | train loss: 0.0196
Epoch:  2 | train loss: 0.0248
Epoch:  2 | train loss: 0.0292


In [15]:
# Switch the model to evaluation mode before extracting features. This only
# changes layers such as Dropout/BatchNorm, which this model does not contain.
# eval() returns the module, so the cell displays its repr.
autoencoder.eval()

AutoEncoder(
  (encoder): Sequential(
    (0): Linear(in_features=7574, out_features=512, bias=True)
    (1): Tanh()
    (2): Linear(in_features=512, out_features=256, bias=True)
    (3): Tanh()
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): Tanh()
    (6): Linear(in_features=128, out_features=32, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=32, out_features=128, bias=True)
    (1): Tanh()
    (2): Linear(in_features=128, out_features=256, bias=True)
    (3): Tanh()
    (4): Linear(in_features=256, out_features=512, bias=True)
    (5): Tanh()
    (6): Linear(in_features=512, out_features=7574, bias=True)
    (7): Sigmoid()
  )
)

In [None]:
## evaluation
# ques = answers[13]
# qes = cv.transform([ques.content])[0].toarray()[0]
# qesT = torch.from_numpy(qes).float()

# fea = autoencoder.encoder(qesT)
# print(fea)

# back = autoencoder.decoder(fea)
# print(back)


In [None]:
# print(X[0])

In [17]:
def getTrainFeatures(ae, X):
    """Encode every row of ``X`` with ``ae.encoder``.

    Args:
        ae: Model exposing a callable ``encoder`` attribute. The original
            version ignored this argument and always used the global
            ``autoencoder``; the argument is now honored.
        X: 2-D NumPy array; each row is converted to a float32 tensor and
            encoded on its own.

    Returns:
        A list with one encoder output per row of ``X``, in row order
        (an empty list when ``X`` has no rows).
    """
    # NOTE(review): outputs keep their autograd history, as in the original.
    # Wrapping this in torch.no_grad() would save memory if no caller needs
    # gradients — confirm against cosine_similarity before changing.
    return [
        ae.encoder(torch.from_numpy(X[i]).float())
        for i in range(X.shape[0])
    ]

# Encoder output for every row of X, kept as a list in the same row order;
# evalQuestion later ranks these against an encoded question.
hiddenTrain = getTrainFeatures(autoencoder, X)

In [18]:
# find the best answer
def getIndexOrderList(ques, ansLst):
    """Rank the entries of ``ansLst`` by similarity to ``ques``.

    Each candidate is scored with ``cosine_similarity(ques, candidate)``.

    Returns:
        The positions ``0 .. len(ansLst) - 1`` ordered from highest to lowest
        score; candidates with equal scores keep their original order.
    """
    scores = [cosine_similarity(ques, candidate) for candidate in ansLst]
    return sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

# print(getIndexOrderList(fea, hiddenTrain))

In [23]:
def evalQuestion(index, train):
    """Return the rank at which the expected answer appears for one question.

    The question ``questions[index]`` is vectorized with ``cv``, encoded with
    the global ``autoencoder``, and compared against every entry of ``train``
    via ``getIndexOrderList``.

    Args:
        index: Position of the question in the global ``questions`` list.
        train: Encoded candidates to rank (e.g. ``hiddenTrain``).

    Returns:
        The 0-based position of ``answer_mapping[question.peer_idx]`` in the
        ranking (0 means ranked first), or ``None`` if it is not in the ranking.
    """
    question = questions[index]
    target = answer_mapping[question.peer_idx]

    # Vectorize the question text, then encode it.
    bag_of_words = cv.transform([question.content])[0].toarray()[0]
    encoded = autoencoder.encoder(torch.from_numpy(bag_of_words).float())

    ranking = getIndexOrderList(encoded, train)
    # Find the answer: first position in the ranking that equals the target.
    return next(
        (rank for rank, candidate in enumerate(ranking) if target == candidate),
        None,
    )

In [26]:
# For the first five questions, print the rank at which evalQuestion finds
# the paired answer among the encoded training rows (lower is better).
for i in range(5):
    print(evalQuestion(i, hiddenTrain))

It seems to me that the addition of electrons and protons as you move across a period would cause an atom to become larger However I'm told it gets smaller Why is this
As you move from left to right across a period the number of protons in the nucleus increases The electrons are thus attracted to the nucleus more strongly and the atomic radius is smaller (this attraction is much stronger than the relatively weak repulsion between electrons) As you move down a column there are more protons but there are also more complete energy levels below the valence electrons These lower energy levels shield the valence electrons from the attractive effects of the atom's nucleus so the atomic radius gets larger

139
My understanding is that $\mathrm{NaCl}$ is an ionic compound in which $\mathrm{Cl}$ becomes (effectively) $\mathrm{Cl^-}$ and $\mathrm{Na}$ becomes $\mathrm{Na^+}$ So I understand why I would get a sea of particles that would stick together But why does the above mean that it will have 