In [32]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split



import os

import torch
import torchvision
from torch import nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import MNIST
from torchvision.utils import save_image

In [33]:
# Name of the sample batch this notebook works on.
sample_batch = "nsample_1107"
# Directory that is scanned for the batch's files; the batch name appears
# twice in the path (once as a folder, once as the "<batch>_csv" sub-folder).
sample_dir = "CHIMES_0.6/Out/{}/{}_csv".format(
    sample_batch,
    sample_batch,
)

def get_vecs(df,N):
    """Sub-sample the time steps of one model table and build input/target vectors.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a "t(Myrs)" column and a "datatype" column. Every selected
        time must have at least one "graph" row and one "deriv" row. The value
        columns are taken positionally as ``df.columns[4:-21]``.
    N : int
        Requested number of samples. It is turned into a stride
        ``int(len(df)/N)`` that is applied to the unique time values.

    Returns
    -------
    tuple
        ``(cols, select_times, x, y)`` where ``cols`` is the list of value
        column names, ``select_times`` is the array of selected times, ``x`` is
        a list of float arrays (the "graph" values, length ``len(cols)``) and
        ``y`` is a list of float arrays of length ``2*len(cols)`` holding the
        "graph" values followed by the "deriv" values.

    Raises
    ------
    IndexError
        If a selected time has no "graph" or no "deriv" row.
    ZeroDivisionError
        If ``N`` is 0.
    """
    cols = df.columns[4:-21]
    nspec = len(cols)

    # NOTE(review): the stride is derived from the total row count, not from
    # the number of unique times, so the number of returned samples is not
    # necessarily N — confirm this is intended.
    nget = int(len(df)/N)
    if nget == 0:
        # Fewer rows than requested samples: a zero slice step would raise
        # "ValueError: slice step cannot be zero", so keep every time instead.
        nget = 1

    times = df["t(Myrs)"]
    select_times = times.unique()[::nget]

    # These masks do not depend on the time, so build them once.
    is_graph = df["datatype"]=="graph"
    is_deriv = df["datatype"]=="deriv"

    x = []
    y = []
    for t in select_times:
        at_t = times==t
        # First matching row of each kind for this time step.
        grph = df[at_t & is_graph][cols].iloc[0]
        drv = df[at_t & is_deriv][cols].iloc[0]

        xx = np.zeros(nspec)
        yy = np.zeros(2*nspec)
        xx[:] = grph.values
        yy[:nspec] = grph.values
        yy[nspec:] = drv.values

        x.append(xx)
        y.append(yy)
    return list(cols),select_times,x,y
    

def create_dataset(dirpath,N_per_model,test_size=0.3,val_size=0.3,random_state=None):
    """Read every ``.csv.gz`` file in a directory and build train/val/test splits.

    Each file is loaded with pandas and passed to ``get_vecs``; the resulting
    vectors from all files are pooled and split twice with
    ``train_test_split`` (first test vs. rest, then validation vs. train).

    Parameters
    ----------
    dirpath : str
        Directory to scan. Only entries ending in ".csv.gz" are read.
    N_per_model : int
        Forwarded to ``get_vecs`` as the requested number of samples per file.
    test_size : float
        Fraction of all samples held out for testing.
    val_size : float
        Fraction of the remaining samples held out for validation.
    random_state : int or None
        Forwarded to both ``train_test_split`` calls; pass an int to make the
        split reproducible. ``None`` keeps the previous (unseeded) behaviour.

    Returns
    -------
    dict
        Keys "cols", "times", "x_train", "x_val", "x_test", "y_train",
        "y_val", "y_test". "cols" and "times" are the values returned by
        ``get_vecs`` for the last file processed.

    Raises
    ------
    ValueError
        If ``dirpath`` contains no ".csv.gz" file.
    """
    # Filter up front so the counts and the progress line only consider files
    # that are actually read; sort so the processing order is deterministic.
    csv_files = sorted(f for f in os.listdir(dirpath) if f.endswith(".csv.gz"))
    if not csv_files:
        raise ValueError("no .csv.gz files found in {!r}".format(dirpath))

    # Nominal figures only: get_vecs may return a different number of samples
    # per file than N_per_model.
    total_s =  len(csv_files)*N_per_model
    print("{} lines for each of the {} models = {} samples".format(N_per_model,len(csv_files),total_s))
    print("  {} samples for training".format(int(total_s*(1-test_size)*(1-val_size))))
    print("  {} samples for validating".format(int(total_s*(1-test_size)*val_size)))
    print("  {} samples for testing".format(int(total_s*test_size)))

    X = []
    Y = []
    last = len(csv_files)-1
    for idx, filename in enumerate(csv_files):
        # Single self-overwriting progress line: carriage return before every
        # message but the first, newline after the final one.
        s = "extracting {0} ... ".format(filename)
        if idx == last:
            s+='\n'
        if idx>0:
            s = '\r'+s
        print(s, end='')

        filepath = os.path.join(dirpath, filename)
        data = pd.read_csv(filepath,compression="gzip")
        cols_,sel_t,x,y = get_vecs(data,N_per_model)
        X.extend(x)
        Y.extend(y)

    print("Extraction completed. Splitting dataset...")
    X_train, X_test, y_train, y_test = train_test_split(
        X,Y,test_size=test_size,random_state=random_state)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train,y_train,test_size=val_size,random_state=random_state)
    print ("Done.")
    return {"cols":cols_,"times":sel_t,"x_train":X_train,"x_val":X_val,"x_test":X_test,"y_train":y_train,"y_val":y_val,"y_test":y_test}

In [None]:
# Build the train/validation/test splits from the batch directory:
# 10 samples requested per model file, 30% of all samples held out for testing.
dataset = create_dataset(sample_dir, N_per_model=10, test_size=0.3)

10 lines for each of the 150 models = 1500 samples
  735 samples for training
  315 samples for validating
  450 samples for testing
extracting nsample_1107_12.csv.gz ...  

In [None]:
class autoencoder(nn.Module):
    """Fully connected encoder/decoder network.

    The encoder maps a vector of ``num_species`` values through layers of
    width 32, 16 and 8 down to ``lantent_dim``; the decoder mirrors it and
    ends in a linear layer of width ``2*num_species`` followed by ``Tanh``.

    Parameters
    ----------
    lantent_dim : int
        Size of the bottleneck layer. The name is misspelled in the original
        signature and is kept so existing keyword callers keep working.
    num_species : int, optional
        Size of the input vector. When omitted, the module-level ``num_specs``
        is read at construction time (the previous, implicit behaviour).

    Notes
    -----
    The final ``Tanh`` bounds every output to (-1, 1).
    NOTE(review): targets outside that range cannot be reproduced — confirm
    the training targets are scaled accordingly.
    """
    def __init__(self,lantent_dim,num_species=None):
        super(autoencoder, self).__init__()
        # Use the constructor argument; previously it was ignored and a global
        # named `latent_dim` was read instead.
        latent_dim = lantent_dim
        if num_species is None:
            # Backward-compatible fallback to the notebook-level setting.
            num_species = num_specs
        self.encoder = nn.Sequential(
            nn.Linear(num_species, 32),
            nn.ReLU(True),
            nn.Linear(32, 16),
            nn.ReLU(True),
            nn.Linear(16, 8),
            nn.ReLU(True),
            nn.Linear(8, latent_dim))
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 8),
            nn.ReLU(True),
            nn.Linear(8, 16),
            nn.ReLU(True),
            nn.Linear(16, 32),
            nn.ReLU(True),
            nn.Linear(32, 2*num_species),
            nn.Tanh())

    def forward(self, x):
        """Encode ``x`` to the latent space and decode it again."""
        x = self.encoder(x)
        x = self.decoder(x)
        return x


In [None]:

# Width of the network input: one entry per value column found in the dataset.
num_specs = len(dataset["cols"])
print("{} species".format(num_specs))

# Optimisation settings.
learning_rate = 1e-3
batch_size = 32
num_epochs = 100

# Size of the autoencoder bottleneck.
latent_dim = 5


In [None]:


# Model, loss and optimizer.
model = autoencoder(latent_dim)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(
    model.parameters(), lr=learning_rate, weight_decay=1e-3)

# Convert the training split to float32 tensors once, instead of converting
# every sample again on every epoch.
x_train = torch.from_numpy(np.asarray(dataset["x_train"], dtype=np.float32))
y_train = torch.from_numpy(np.asarray(dataset["y_train"], dtype=np.float32))

# Honour the configured batch_size (it was previously defined but unused, so
# training ran one sample at a time) and reshuffle the samples every epoch.
train_loader = DataLoader(
    torch.utils.data.TensorDataset(x_train, y_train),
    batch_size=batch_size, shuffle=True)

model.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    for xb, yb in train_loader:
        # ===================forward=====================
        output = model(xb)
        loss = criterion(output, yb)
        # ===================backward====================
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Weight by batch size so the epoch figure is a per-sample mean even
        # when the last batch is smaller.
        running_loss += loss.item() * xb.size(0)
    # ===================log========================
    # Report the mean loss over the epoch rather than the last sample's loss.
    epoch_loss = running_loss / max(len(x_train), 1)
    print('epoch [{}/{}], loss:{:.4f}'.format(epoch + 1, num_epochs, epoch_loss))

# Only the weights are stored; the autoencoder class is needed to load them.
torch.save(model.state_dict(), './sim_autoencoder.pth')

In [None]:
# Persist the dataset dict built above to a compressed NumPy archive.
# NOTE(review): the dict is passed positionally, so per the NumPy docs it is
# stored under the default key "arr_0"; since it is a dict it is presumably
# saved as a pickled object array, which needs np.load(..., allow_pickle=True)
# to read back. Passing **dataset would store each entry as its own array —
# confirm no reader depends on the current layout before changing it.
np.savez_compressed("dataset",dataset)