In [112]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split



import os

import torch
import torchvision
from torch import nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import MNIST
from torchvision.utils import save_image

In [103]:
sample_batch = "nsample_1107"
sample_dir = "CHIMES_0.6/Out/{}/{}_csv".format(sample_batch,sample_batch)

def get_vecs(df,N):
    
    x = []
    y = []
    
    nget = int(len(df)/N)
    cols = df.columns[4:-21]
    nspec = len(cols)
    select_times = df["t(Myrs)"].unique()[::nget]
    for t in select_times:
        xx = np.zeros(nspec)
        yy = np.zeros(2*nspec)
        grph =df[np.logical_and(df["t(Myrs)"]==t,df["datatype"]=="graph")][cols].iloc[0]
        drv =  df[np.logical_and(df["t(Myrs)"]==t,df["datatype"]=="deriv")][cols].iloc[0]
        
        xx[:] = grph.values[:]
        yy[:nspec] = grph.values[:]
        yy[nspec:] = drv.values[:]
        
        x.append(xx)
        y.append(yy)
    return list(cols),select_times,x,y
    

def create_dataset(dirpath,N_per_model,test_size=0.3,val_size=0.3):

    X = []
    Y = []
    
    dirfiles = os.listdir(dirpath)
    total_s =  len(dirfiles)*N_per_model
    
    print("{} lines for each of the {} models = {} samples".format(N_per_model,len(dirfiles),total_s))
    print("  {} samples for training".format(int(total_s*(1-test_size)*(1-val_size))))
    print("  {} samples for validating".format(int(total_s*(1-test_size)*val_size)))
    print("  {} samples for testing".format(int(total_s*test_size)))
    for i in range(len(dirfiles)):
        filename = dirfiles[i]
        if  filename.endswith(".csv.gz"):
            
            s = "extracting {0} ... ".format(filename)
            if i == len(dirfiles)-1:
                s+='\n'        
            if i>0:
                s = '\r'+s
            print(s, end='')

            filepath = os.path.join(dirpath, filename)
            data = pd.read_csv(filepath,compression="gzip")
            cols_,sel_t,x,y = get_vecs(data,N_per_model)
            for i in range(len(x)):
                X.append(x[i])
                Y.append(y[i])
    
            
        else:
            continue
    
    print("Extraction completed. Splitting dataset...")
    X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=test_size)
    X_train, X_val, y_train, y_val = train_test_split(X_train,y_train,test_size=val_size)
    print ("Done.")
    return {"cols":cols_,"times":sel_t,"x_train":X_train,"x_val":X_val,"x_test":X_test,"y_train":y_train,"y_val":y_val,"y_test":y_test}

In [105]:
dataset = create_dataset(sample_dir,10,0.3)

10 lines for each of the 150 models = 1500 samples
  735 samples for training
  315 samples for validating
  450 samples for testing
extracting nsample_1107_99.csv.gz ...  
Extraction completed. Splitting dataset...
Done.


In [114]:

num_specs = len(dataset["cols"])
num_epochs = 10
batch_size = 128
learning_rate = 1e-3


In [115]:
num_specs

135

In [113]:



#img_transform = transforms.Compose([
#    transforms.ToTensor(),
#    transforms.Normalize([0.5], [0.5])
#])

#dataset = MNIST('./data', transform=img_transform)
#dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


class autoencoder(nn.Module):
    def __init__(self):
        super(autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(num_specs, 32),
            nn.ReLU(True),
            nn.Linear(32, 16),
            nn.ReLU(True), 
            nn.Linear(16, 8), 
            nn.ReLU(True), 
            nn.Linear(8, 5))
        self.decoder = nn.Sequential(
            nn.Linear(5, 8),
            nn.ReLU(True),
            nn.Linear(8, 16),
            nn.ReLU(True),
            nn.Linear(16, 32),
            nn.ReLU(True), nn.Linear(32, 2*num_specs), nn.Tanh())

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x


model = autoencoder()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(
    model.parameters(), lr=learning_rate, weight_decay=1e-5)

for epoch in range(num_epochs):
    for data in dataloader:
        img, _ = data
        img = img.view(img.size(0), -1)
        img = Variable(img).cuda()
        # ===================forward=====================
        output = model(img)
        loss = criterion(output, img)
        # ===================backward====================
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # ===================log========================
    print('epoch [{}/{}], loss:{:.4f}'
          .format(epoch + 1, num_epochs, loss.data[0]))
    if epoch % 10 == 0:
        pic = to_img(output.cpu().data)
        save_image(pic, './mlp_img/image_{}.png'.format(epoch))

torch.save(model.state_dict(), './sim_autoencoder.pth')

AssertionError: Torch not compiled with CUDA enabled

In [108]:
np.savez_compressed("dataset",dataset)

In [100]:
import time

# status generator
def range_with_status(total):
    """ iterate from 0 to total and show progress in console """
    n=0
    while n<total:
        done = '#'*(n+1)
        todo = '-'*(total-n-1)
        s = '<{0}>'.format(done+todo)
        if not todo:
            s+='\n'        
        if n>0:
            s = '\r'+s
        print(s, end='')
        yield n
        n+=1

# example for use of status generator
for i in range_with_status(100):
    print(i)

<#--------------------------------------------------------------------------------------------------->0
<##-------------------------------------------------------------------------------------------------->1
<###------------------------------------------------------------------------------------------------->2
<####------------------------------------------------------------------------------------------------>3
<#####----------------------------------------------------------------------------------------------->4
<######---------------------------------------------------------------------------------------------->5
<#######--------------------------------------------------------------------------------------------->6
<########-------------------------------------------------------------------------------------------->7
<#########------------------------------------------------------------------------------------------->8
<##########--------------------------------------------