1. Import dependencies

In [None]:
import torch
from torch import nn
import logging

import numpy as np
from sklearn.datasets import make_blobs
import torch
#from torch import nn
import argparse

2. Establish the model (without PSL loss)

In [None]:
# Ask for INFO-level output on the root logger so the logging.info() progress
# messages below are emitted. basicConfig() only has an effect when the root
# logger has no handlers configured yet.
logging.basicConfig(level=logging.INFO)

class SPINEModel(torch.nn.Module):
    """One-hidden-layer autoencoder whose loss is reconstruction + average sparsity.

    The hidden activation is a linear layer clamped to [0, 1]. This variant
    leaves the partial sparsity loss (PSL) out of the total loss; the helper
    that computes it is still available as ``_getPSLLoss``.

    ``params`` must provide the keys ``'inp_dim'``, ``'hdim'``,
    ``'noise_level'`` and ``'sparsity'``.
    """

    def __init__(self, params):
        super().__init__()

        # Hyperparameters copied from the params dict.
        self.inp_dim = params['inp_dim']
        self.hdim = params['hdim']
        self.noise_level = params['noise_level']
        self.getReconstructionLoss = nn.MSELoss()
        # Threshold used by the average sparsity loss: 1 - requested sparsity.
        self.rho_star = 1.0 - params['sparsity']

        # Encoder (inp_dim -> hdim) and decoder (hdim -> inp_dim).
        logging.info("Building model ")
        self.linear1 = nn.Linear(self.inp_dim, self.hdim)
        self.linear2 = nn.Linear(self.hdim, self.inp_dim)

    def forward(self, batch_x, batch_y):
        """Encode ``batch_x``, decode it, and score the result against ``batch_y``.

        Returns a 4-tuple ``(out, h, total_loss, [reconstruction_loss, asl_loss])``
        where ``out`` is the decoder output and ``h`` the clamped hidden layer.
        """
        # Capped ReLU: hidden units are limited to the range [0, 1].
        hidden = torch.clamp(self.linear1(batch_x), min=0, max=1)
        decoded = self.linear2(hidden)

        rec_term = self.getReconstructionLoss(decoded, batch_y)
        asl_term = self._getASLLoss(hidden)

        return decoded, hidden, rec_term + asl_term, [rec_term, asl_term]

    def _getPSLLoss(self, h, batch_size):
        """Partial sparsity loss: sum of h * (1 - h) divided by batch_size * hdim."""
        return (h * (1 - h)).sum() / (batch_size * self.hdim)

    def _getASLLoss(self, h):
        """Average sparsity loss.

        Takes the mean of ``h`` over dim 0, keeps only the part above
        ``rho_star``, and returns the sum of its squares divided by ``hdim``.
        """
        excess = torch.clamp(h.mean(dim=0) - self.rho_star, min=0)
        return (excess * excess).sum() / self.hdim

3. data load function

In [None]:
# NOTE(review): logging.basicConfig() only configures the root logger when it
# has no handlers yet (unless force=True is passed). An earlier cell already
# calls logging.basicConfig(level=logging.INFO), so this DEBUG level probably
# never takes effect and the logging.debug() calls in DataHandler stay silent
# -- confirm whether DEBUG output is actually wanted here.
logging.basicConfig(level=logging.DEBUG)

class DataHandler:
    """Load word vectors from a text file and serve them in mini-batches.

    The input file holds one entry per line: a word followed by its
    whitespace-separated vector components.

    Attributes set by ``loadData``:
        words          list of words, in file order
        data           2-D float array, one row per word (current order)
        original_data  the rows in file order (used by ``resetDataOrder``)
        data_size      number of rows
        inp_dim        number of columns
    """

    def __init__(self):
        pass

    def loadData(self, filename, valid_dims=(300, 1280)):
        """Read the vectors in ``filename`` into ``self.words`` / ``self.data``.

        A line is skipped when its vector length is not in ``valid_dims`` or
        when any of its tokens (the word itself included) is exactly ``"nan"``.

        Args:
            filename: path of the text file to read.
            valid_dims: accepted vector lengths; defaults to the two sizes the
                original code hard-coded.

        Raises:
            ValueError: if no line of the file passes the filters.
        """
        self.data = []
        self.words = []
        # Stream the file so it is closed deterministically and never held in
        # memory as a list of raw lines.
        with open(filename) as infile:
            for line in infile:
                tokens = line.strip().split()
                if len(tokens[1:]) not in valid_dims:
                    continue
                if "nan" in tokens:
                    continue
                self.words.append(tokens[0])
                self.data.append([float(value) for value in tokens[1:]])

        if not self.data:
            raise ValueError(
                "no usable vectors found in %r (accepted dimensions: %s)"
                % (filename, tuple(valid_dims))
            )

        self.data = np.array(self.data)
        logging.info("Loaded data. #shape = %s", self.data.shape)
        logging.info(" #words = %d ", len(self.words))
        self.data_size = self.data.shape[0]
        self.inp_dim = self.data.shape[1]
        # A view is enough: shuffleTrain() rebinds self.data to a new array
        # instead of permuting in place, so the file order survives here.
        self.original_data = self.data[:]
        logging.debug("original_data[0][0:5] = %s", self.original_data[0][0:5])

    def getWordsList(self):
        """Return the list of loaded words, in file order."""
        return self.words

    def getDataShape(self):
        """Return the shape of the current data array."""
        return self.data.shape

    def resetDataOrder(self):
        """Restore ``self.data`` to the order in which rows were loaded."""
        self.data = self.original_data[:]
        logging.debug("original_data[0][0:5] = %s", self.original_data[0][0:5])

    def getNumberOfBatches(self, batch_size):
        """Return how many batches of ``batch_size`` cover all rows (ceiling)."""
        return (self.data_size + batch_size - 1) // batch_size

    def getBatch(self, i, batch_size, noise_level, denoising):
        """Return ``(batch_x, batch_y)`` for batch index ``i``.

        ``batch_y`` is the slice of rows belonging to the batch. ``batch_x`` is
        the same array unless ``denoising`` is true, in which case the output
        of ``get_noise_features`` is added to it.
        """
        start = i * batch_size
        stop = min(start + batch_size, self.data_size)
        batch_y = self.data[start:stop]
        batch_x = batch_y
        if denoising:
            batch_x = batch_y + get_noise_features(batch_y.shape[0], self.inp_dim, noise_level)
        return batch_x, batch_y

    def shuffleTrain(self):
        """Randomly permute the rows of ``self.data`` (uses NumPy's global RNG)."""
        indices = np.arange(self.data_size)
        np.random.shuffle(indices)
        # Fancy indexing allocates a new array; original_data is untouched.
        self.data = self.data[indices]

In [None]:
def compute_sparsity(X):
    """Return the percentage (0-100) of entries in array ``X`` that are zero."""
    nonzero_count = 1. * np.count_nonzero(X)
    return 100. * (1 - nonzero_count / X.size)

def dump_vectors(X, outfile, words):
    """Write one text line per word: the word followed by its vector values.

    Every field on a line, including the last value, is followed by a single
    space, and each line ends with a newline.

    Args:
        X: array of vectors; ``X[i]`` is written next to ``words[i]``.
        outfile: path of the file to create or overwrite.
        words: sequence of words, same length as ``X``.

    Raises:
        AssertionError: if ``X`` and ``words`` differ in length.
    """
    print ("shape", X.shape)
    assert len(X) == len(words), (
        "number of vectors (%d) does not match number of words (%d)"
        % (len(X), len(words))
    )
    # The context manager closes the file even if a write fails part-way.
    with open(outfile, 'w') as fw:
        for word, vector in zip(words, X):
            fw.write(word + " " + "".join(str(value) + " " for value in vector) + "\n")

def get_noise_features(n_samples, n_features, noise_amount):
    """Return an array of noise samples produced by ``make_blobs``.

    The samples are requested around a single all-zero center with
    ``cluster_std=noise_amount``; the labels ``make_blobs`` also returns are
    discarded.
    """
    zero_center = np.zeros(n_features)
    samples, _labels = make_blobs(
        n_samples=n_samples,
        n_features=n_features,
        centers=np.array([zero_center]),
        cluster_std=noise_amount,
    )
    return samples

4. Define the training and embedding-extraction functions (without PSL loss)

In [None]:
# Ask for INFO-level logging. basicConfig() does nothing if the root logger
# already has handlers (e.g. from an earlier cell), so this is a no-op on re-runs.
logging.basicConfig(level=logging.INFO)

class Solver:
    """Tie a DataHandler, a SPINEModel and an SGD optimizer together.

    This variant expects the model's loss terms to be
    ``[reconstruction_loss, asl_loss]``; the PSL column is reported as 0.
    """

    def __init__(self, params):
        """Load the data named by ``params['input']`` and build the model.

        Side effect: writes ``params['inp_dim']`` (the loaded vector size) into
        the caller's dict, because SPINEModel reads it from there.
        """
        # Build data handler
        self.data_handler = DataHandler()
        self.data_handler.loadData(params['input'])
        params['inp_dim'] = self.data_handler.getDataShape()[1]
        logging.info("="*41)

        # Build model; move it to the GPU when one is available.
        self.model = SPINEModel(params)
        self.dtype = torch.FloatTensor
        if torch.cuda.is_available():
            self.model.cuda()
            self.dtype = torch.cuda.FloatTensor
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.1)
        logging.info("="*41)

    def train(self, params):
        """Run ``params['num_epochs']`` epochs of mini-batch SGD.

        Reads ``num_epochs``, ``batch_size``, ``noise_level`` and ``denoising``
        from ``params``. After each epoch it prints the loss terms summed (not
        averaged) over that epoch's batches.
        """
        num_epochs, batch_size = params['num_epochs'], params['batch_size']
        optimizer = self.optimizer
        dtype = self.dtype
        for iteration in range(num_epochs):
            self.data_handler.shuffleTrain()
            num_batches = self.data_handler.getNumberOfBatches(batch_size)
            epoch_losses = np.zeros(4) # rl, asl, psl, total
            for batch_idx in range(num_batches):
                optimizer.zero_grad()
                batch_x, batch_y = self.data_handler.getBatch(
                    batch_idx, batch_size, params['noise_level'], params['denoising'])
                batch_x = torch.from_numpy(batch_x).type(dtype)
                batch_y = torch.from_numpy(batch_y).type(dtype)
                _, _, loss, loss_terms = self.model(batch_x, batch_y)
                reconstruction_loss, asl_loss = loss_terms
                loss.backward()
                optimizer.step()
                epoch_losses[0] += reconstruction_loss.item()
                epoch_losses[1] += asl_loss.item()
                # epoch_losses[2] (PSL) stays 0: this variant has no PSL term.
                epoch_losses[3] += loss.item()
            # The two string pieces are joined with a space so the output reads
            # "ASL = x, PSL = y" (the original ran them together).
            print("After epoch %r, Reconstruction Loss = %.4f, ASL = %.4f, "
                  "PSL = %.4f, and total = %.4f"
                  % (iteration+1, epoch_losses[0], epoch_losses[1],
                     epoch_losses[2], epoch_losses[3]))

    def getSpineEmbeddings(self, batch_size, params):
        """Return the hidden-layer activations for every row, in file order.

        NOTE(review): batches are fetched with ``params['denoising']``, so when
        that flag is true noise is added to the inputs here as well -- confirm
        that this is intended for the final embeddings.
        """
        ret = []
        self.data_handler.resetDataOrder()
        num_batches = self.data_handler.getNumberOfBatches(batch_size)
        # Inference only: skip autograd bookkeeping to save memory and time.
        with torch.no_grad():
            for batch_idx in range(num_batches):
                batch_x, batch_y = self.data_handler.getBatch(
                    batch_idx, batch_size, params['noise_level'], params['denoising'])
                batch_x = torch.from_numpy(batch_x).type(self.dtype)
                batch_y = torch.from_numpy(batch_y).type(self.dtype)
                _, h, _, _ = self.model(batch_x, batch_y)
                ret.extend(h.cpu().numpy())
        return np.array(ret)

    def getWordsList(self):
        """Return the words held by the underlying data handler."""
        return self.data_handler.getWordsList()

5. Hyperparameters and input/output paths (without PSL loss)

In [None]:
# Hyperparameters and file locations for the run without the PSL term.
params = {
    "hdim": 1000,
    "denoising": False,
    "noise_level": 0.2,
    "num_epochs": 100,
    "batch_size": 64,
    "sparsity": 0.85,
    # These are Colab paths copied as-is; if they do not exist in your
    # environment, change them manually to point at your copy of the data.
    "input": "/content/glove.6B.300d.txt",
    "output": "/content/glove.6B.300d.txt_without_psl.spine",
}


#parser = argparse.ArgumentParser(
#    formatter_class=argparse.ArgumentDefaultsHelpFormatter)

'''
parser.add_argument('--hdim', dest='hdim', type=int, default=1000,
                    help='resultant embedding size')

parser.add_argument('--denoising', dest='denoising',
					default=False,
					action='store_true',
                    help='noise amount for denoising auto-encoder')

parser.add_argument('--noise', dest='noise_level', type=float,
					default=0.2,
                    help='noise amount for denoising auto-encoder')

parser.add_argument('--num_epochs', dest='num_epochs', type=int,
					default=100,
                    help='number of epochs')

parser.add_argument('--batch_size', dest='batch_size', type=int,
					default=64,
                    help='batch size')

parser.add_argument('--sparsity', dest='sparsity', type=float,
					default=0.85,
                    help='sparsity')

parser.add_argument('--input', dest='input',
					default = "../input/glove-global-vectors-for-word-representation/glove.6B.200d.txt" ,
                    help='input src')

parser.add_argument('--output', dest='output',
					default = "./output/glove.6B.200d.txt.spine" ,
                    help='output')
'''

In [None]:
# Train with the Solver / SPINEModel definitions currently in scope and write
# the resulting embeddings to params['output'].
logging.info("PARAMS = " + str(params))
logging.info("="*41)
# Solver.__init__ loads params['input'] and stores the vector size back into
# params as 'inp_dim' before building the model.
solver = Solver(params)
solver.train(params)

# dumping the final vectors
logging.info("Dumping the final SPine embeddings")
output_path = params['output'] #+ ".spine"
# Batch size for the embedding-extraction pass only; it is independent of
# params['batch_size'], which is used for training.
final_batch_size = 512
spine_embeddings = solver.getSpineEmbeddings(final_batch_size, params)
dump_vectors(spine_embeddings, output_path, solver.getWordsList())

Establish the model (without ASL loss)

In [None]:
# Ask for INFO-level logging. basicConfig() does nothing if the root logger
# already has handlers (e.g. from an earlier cell), so this is a no-op on re-runs.
logging.basicConfig(level=logging.INFO)

class SPINEModel(torch.nn.Module):
    """One-hidden-layer autoencoder whose loss is reconstruction + partial sparsity.

    The hidden activation is a linear layer clamped to [0, 1]. This variant
    leaves the average sparsity loss (ASL) out of the total loss; the helper
    that computes it is still available as ``_getASLLoss``.

    ``params`` must provide the keys ``'inp_dim'``, ``'hdim'``,
    ``'noise_level'`` and ``'sparsity'``.
    """

    def __init__(self, params):
        super().__init__()

        # Hyperparameters copied from the params dict.
        self.inp_dim = params['inp_dim']
        self.hdim = params['hdim']
        self.noise_level = params['noise_level']
        self.getReconstructionLoss = nn.MSELoss()
        # Threshold used by the average sparsity loss: 1 - requested sparsity.
        self.rho_star = 1.0 - params['sparsity']

        # Encoder (inp_dim -> hdim) and decoder (hdim -> inp_dim).
        logging.info("Building model ")
        self.linear1 = nn.Linear(self.inp_dim, self.hdim)
        self.linear2 = nn.Linear(self.hdim, self.inp_dim)

    def forward(self, batch_x, batch_y):
        """Encode ``batch_x``, decode it, and score the result against ``batch_y``.

        Returns a 4-tuple ``(out, h, total_loss, [reconstruction_loss, psl_loss])``
        where ``out`` is the decoder output and ``h`` the clamped hidden layer.
        """
        n_rows = batch_x.size(0)
        # Capped ReLU: hidden units are limited to the range [0, 1].
        hidden = torch.clamp(self.linear1(batch_x), min=0, max=1)
        decoded = self.linear2(hidden)

        rec_term = self.getReconstructionLoss(decoded, batch_y)
        psl_term = self._getPSLLoss(hidden, n_rows)

        return decoded, hidden, rec_term + psl_term, [rec_term, psl_term]

    def _getPSLLoss(self, h, batch_size):
        """Partial sparsity loss: sum of h * (1 - h) divided by batch_size * hdim."""
        return (h * (1 - h)).sum() / (batch_size * self.hdim)

    def _getASLLoss(self, h):
        """Average sparsity loss.

        Takes the mean of ``h`` over dim 0, keeps only the part above
        ``rho_star``, and returns the sum of its squares divided by ``hdim``.
        """
        excess = torch.clamp(h.mean(dim=0) - self.rho_star, min=0)
        return (excess * excess).sum() / self.hdim

In [None]:
# Ask for INFO-level logging. basicConfig() does nothing if the root logger
# already has handlers (e.g. from an earlier cell), so this is a no-op on re-runs.
logging.basicConfig(level=logging.INFO)

class Solver:
    """Tie a DataHandler, a SPINEModel and an SGD optimizer together.

    This variant expects the model's loss terms to be
    ``[reconstruction_loss, psl_loss]``; the ASL column is reported as 0.
    """

    def __init__(self, params):
        """Load the data named by ``params['input']`` and build the model.

        Side effect: writes ``params['inp_dim']`` (the loaded vector size) into
        the caller's dict, because SPINEModel reads it from there.
        """
        # Build data handler
        self.data_handler = DataHandler()
        self.data_handler.loadData(params['input'])
        params['inp_dim'] = self.data_handler.getDataShape()[1]
        logging.info("="*41)

        # Build model; move it to the GPU when one is available.
        self.model = SPINEModel(params)
        self.dtype = torch.FloatTensor
        if torch.cuda.is_available():
            self.model.cuda()
            self.dtype = torch.cuda.FloatTensor
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.1)
        logging.info("="*41)

    def train(self, params):
        """Run ``params['num_epochs']`` epochs of mini-batch SGD.

        Reads ``num_epochs``, ``batch_size``, ``noise_level`` and ``denoising``
        from ``params``. After each epoch it prints the loss terms summed (not
        averaged) over that epoch's batches.
        """
        num_epochs, batch_size = params['num_epochs'], params['batch_size']
        optimizer = self.optimizer
        dtype = self.dtype
        for iteration in range(num_epochs):
            self.data_handler.shuffleTrain()
            num_batches = self.data_handler.getNumberOfBatches(batch_size)
            epoch_losses = np.zeros(4) # rl, asl, psl, total
            for batch_idx in range(num_batches):
                optimizer.zero_grad()
                batch_x, batch_y = self.data_handler.getBatch(
                    batch_idx, batch_size, params['noise_level'], params['denoising'])
                batch_x = torch.from_numpy(batch_x).type(dtype)
                batch_y = torch.from_numpy(batch_y).type(dtype)
                _, _, loss, loss_terms = self.model(batch_x, batch_y)
                reconstruction_loss, psl_loss = loss_terms
                loss.backward()
                optimizer.step()
                epoch_losses[0] += reconstruction_loss.item()
                # epoch_losses[1] (ASL) stays 0: this variant has no ASL term.
                epoch_losses[2] += psl_loss.item()
                epoch_losses[3] += loss.item()
            # The two string pieces are joined with a space so the output reads
            # "ASL = x, PSL = y" (the original ran them together).
            print("After epoch %r, Reconstruction Loss = %.4f, ASL = %.4f, "
                  "PSL = %.4f, and total = %.4f"
                  % (iteration+1, epoch_losses[0], epoch_losses[1],
                     epoch_losses[2], epoch_losses[3]))

    def getSpineEmbeddings(self, batch_size, params):
        """Return the hidden-layer activations for every row, in file order.

        NOTE(review): batches are fetched with ``params['denoising']``, so when
        that flag is true noise is added to the inputs here as well -- confirm
        that this is intended for the final embeddings.
        """
        ret = []
        self.data_handler.resetDataOrder()
        num_batches = self.data_handler.getNumberOfBatches(batch_size)
        # Inference only: skip autograd bookkeeping to save memory and time.
        with torch.no_grad():
            for batch_idx in range(num_batches):
                batch_x, batch_y = self.data_handler.getBatch(
                    batch_idx, batch_size, params['noise_level'], params['denoising'])
                batch_x = torch.from_numpy(batch_x).type(self.dtype)
                batch_y = torch.from_numpy(batch_y).type(self.dtype)
                _, h, _, _ = self.model(batch_x, batch_y)
                ret.extend(h.cpu().numpy())
        return np.array(ret)

    def getWordsList(self):
        """Return the words held by the underlying data handler."""
        return self.data_handler.getWordsList()

In [None]:
# Hyperparameters and file locations for the run without the ASL term.
params = {
    "hdim": 1000,
    "denoising": False,
    "noise_level": 0.2,
    "num_epochs": 100,
    "batch_size": 64,
    "sparsity": 0.85,
    # These are Colab paths copied as-is; if they do not exist in your
    # environment, change them manually to point at your copy of the data.
    "input": "/content/glove.6B.300d.txt",
    "output": "/content/glove.6B.300d.txt_without_asl.spine",
}


#parser = argparse.ArgumentParser(
#    formatter_class=argparse.ArgumentDefaultsHelpFormatter)

'''
parser.add_argument('--hdim', dest='hdim', type=int, default=1000,
                    help='resultant embedding size')

parser.add_argument('--denoising', dest='denoising',
					default=False,
					action='store_true',
                    help='noise amount for denoising auto-encoder')

parser.add_argument('--noise', dest='noise_level', type=float,
					default=0.2,
                    help='noise amount for denoising auto-encoder')

parser.add_argument('--num_epochs', dest='num_epochs', type=int,
					default=100,
                    help='number of epochs')

parser.add_argument('--batch_size', dest='batch_size', type=int,
					default=64,
                    help='batch size')

parser.add_argument('--sparsity', dest='sparsity', type=float,
					default=0.85,
                    help='sparsity')

parser.add_argument('--input', dest='input',
					default = "../input/glove-global-vectors-for-word-representation/glove.6B.200d.txt" ,
                    help='input src')

parser.add_argument('--output', dest='output',
					default = "./output/glove.6B.200d.txt.spine" ,
                    help='output')
'''

In [None]:
# Train with the Solver / SPINEModel definitions currently in scope and write
# the resulting embeddings to params['output'].
logging.info("PARAMS = " + str(params))
logging.info("="*41)
# Solver.__init__ loads params['input'] and stores the vector size back into
# params as 'inp_dim' before building the model.
solver = Solver(params)
solver.train(params)

# dumping the final vectors
logging.info("Dumping the final SPine embeddings")
output_path = params['output'] #+ ".spine"
# Batch size for the embedding-extraction pass only; it is independent of
# params['batch_size'], which is used for training.
final_batch_size = 512
spine_embeddings = solver.getSpineEmbeddings(final_batch_size, params)
dump_vectors(spine_embeddings, output_path, solver.getWordsList())