# Turning data into a two-dimensional tensor of word indices

In [21]:
require 'nn';

In [142]:
-- Global namespace table that the functions in the following cells are attached to.
-- It is set as its own __index so it can be used as a metatable for lookups;
-- no constructor using it is visible in this section.
WordSPlitterMinibatchLoader = {}
WordSPlitterMinibatchLoader.__index = WordSPlitterMinibatchLoader

-- Directory of the generated output files.
-- NOTE(review): not referenced by any code visible in this section -- confirm it is still needed.
data_dir = "/Users/david/Documents/MemoryNetwork/output_lua"

--- Build a word -> integer-id vocabulary from a text file and save it to disk.
--
-- Every line of `input_file` is counted as one sentence. Lines are lower-cased
-- and split on single spaces; the distinct words are sorted and numbered 1..N.
-- An empty file yields an empty vocabulary with zero sentences.
--
-- @param input_file  path of the text file to scan
-- @param vocab_file  path the vocabulary table is written to with torch.save
-- @return a table {sent_count, vocab_mapping, max_sent_len}: number of lines
--         read, the word -> id table, and the largest word count of any line
-- Raises an error if `input_file` cannot be opened.
function WordSPlitterMinibatchLoader.create_vocabulary(input_file,vocab_file)
	print('loading text file....')
	local f = assert(io.open(input_file,"r"))
	local max_sent_len = 0
	local sent_count = 0
	-- Collect the set of distinct words (keys of `unordered`)
	print('creating vocabulary mapping')
	local unordered = {}
	-- f:lines() simply yields nothing on an empty file, so no nil line is ever indexed
	for line in f:lines() do
		sent_count = sent_count + 1
		-- split once per line and reuse the result for both the word set and the length
		local words = line:lower():split(" ")
		for _, word in ipairs(words) do
			unordered[word] = true
		end
		local sent_len = #words
		if sent_len > max_sent_len then max_sent_len = sent_len end
	end
	f:close()
	-- sort into a table (i.e. keys become 1..N)
	local ordered = {}
	for word in pairs(unordered) do ordered[#ordered + 1] = word end
	table.sort(ordered)
	-- invert `ordered` to create the word->int mapping
	local vocab_mapping = {}
	for i, word in ipairs(ordered) do
		vocab_mapping[word] = i
	end
	print('saving ' .. vocab_file)
	torch.save(vocab_file, vocab_mapping)
	return {sent_count,vocab_mapping,max_sent_len}
end

In [143]:
--- Write the word ids of `rawdata` and of every remaining line of `f` into `data`.
--
-- Row `currline` receives the ids of the words of `rawdata`; each following
-- line read from `f` goes into the next row. Lines are lower-cased and split
-- on single spaces before lookup.
--
-- @param data      2-D container indexed as data[{row, column}]
-- @param currline  row that receives the first line
-- @param rawdata   first line to process; nil means there is nothing to do
-- @param f         open file handle (anything with a :read() method) for the rest
-- @param mapping   optional word -> id table; defaults to the global `vocab_mapping`
-- @return the row index following the last row written
function iterate_trough(data,currline,rawdata,f,mapping)
	-- Earlier callers relied on a global `vocab_mapping`; keep that as the fallback.
	mapping = mapping or vocab_mapping
	-- Test for end of input BEFORE touching the line: f:read() returns nil at EOF,
	-- so lower-casing its result directly would raise instead of ending the loop.
	while rawdata do
		for k,word in pairs(rawdata:lower():split(" ")) do
			data[{currline,k}] = mapping[word]
		end
		currline = currline + 1
		rawdata = f:read()
	end
	return currline
end

In [147]:
--- Encode a text file as a 2-D tensor of word ids and save it to disk.
--
-- Row i of the tensor holds the ids of the words on line i of `input_file`;
-- positions past the end of a shorter line stay 0.
--
-- @param sent_count     number of rows to allocate (lines in the file)
-- @param vocab_mapping  word -> integer id table
-- @param max_sent_len   number of columns to allocate (longest line, in words)
-- @param input_file     path of the text file to encode
-- @param tensor_file    path the tensor is written to with torch.save
-- Raises an error if `input_file` cannot be opened or contains a word that is
-- missing from `vocab_mapping`.
function WordSPlitterMinibatchLoader.create_tensor(sent_count,vocab_mapping,max_sent_len,input_file,tensor_file)
	print('putting data into tensor...')
	-- Find the largest id so the element type can be chosen to hold it.
	local max_id = 0
	for _, id in pairs(vocab_mapping) do
		if id > max_id then max_id = id end
	end
	-- One row per sentence, one column per word position, zero-padded.
	-- ByteTensor elements are unsigned 8-bit, so it is only used while every
	-- id fits in 0..255; larger vocabularies get an IntTensor instead.
	local data
	if max_id <= 255 then
		data = torch.ByteTensor(sent_count,max_sent_len):zero()
	else
		data = torch.IntTensor(sent_count,max_sent_len):zero()
	end
	local f = assert(io.open(input_file,"r"))
	local currline = 1
	-------- Filling the tensor, one input line per row
	-- f:lines() yields nothing on an empty file, so no nil line is ever indexed
	for line in f:lines() do
		for k,word in pairs(line:lower():split(" ")) do
			local id = vocab_mapping[word]
			if id == nil then
				-- fail with a message that names the word instead of an opaque tensor error
				f:close()
				error("word not found in vocabulary: '" .. word .. "' (line " .. currline .. ")")
			end
			data[{currline,k}] = id
		end
		currline = currline + 1
	end
	f:close()
	-- save output preprocessed files
	print('saving ' .. tensor_file)
	torch.save(tensor_file, data)
end


In [148]:
--- Run the full preprocessing pipeline for one text file.
--
-- First builds and saves the vocabulary with create_vocabulary, then passes
-- its results to create_tensor to encode and save the file as a tensor.
--
-- @param input_file       path of the text file to process
-- @param out_vocab_file   path the vocabulary is saved to
-- @param out_tensor_file  path the tensor is saved to
-- @return whatever create_tensor returns
function WordSPlitterMinibatchLoader.text_to_tensor(input_file, out_vocab_file, out_tensor_file)
    -- `res` is kept local so the call does not leave a stray global behind
    local res = WordSPlitterMinibatchLoader.create_vocabulary(input_file,out_vocab_file)
    local sent_count, vocab_mapping, max_sent_len = res[1], res[2], res[3]
    return WordSPlitterMinibatchLoader.create_tensor(sent_count,vocab_mapping,max_sent_len,input_file,out_tensor_file)
end

In [149]:
-- Hard-coded locations for this run: the input text file, and the files the
-- vocabulary and the tensor are saved to.
input_file = "/Users/david/Documents/MemoryNetwork/preprocessing/output.txt"
out_vocab_file = "/Users/david/Documents/MemoryNetwork/output_lua/vocab.t7"
out_tensor_file = "/Users/david/Documents/MemoryNetwork/output_lua/data.t7"
-- Build the vocabulary and the tensor from input_file and write both to disk.
WordSPlitterMinibatchLoader.text_to_tensor(input_file,out_vocab_file,out_tensor_file)

loading text file....	
creating vocabulary mapping	


saving /Users/david/Documents/MemoryNetwork/output_lua/vocab.t7	


putting data into tensor...	


saving /Users/david/Documents/MemoryNetwork/output_lua/data.t7	


# Scrap