In [1]:
# Record the shell's python and the starting working directory for provenance.
# NOTE(review): `which python` reports the first python on the shell PATH, which
# may differ from the interpreter the kernel runs (sys.executable) — confirm if it matters.
! which python
! pwd

/gpfs/runtime/opt/anaconda/3-5.2.0/bin/python
/gpfs/data/awebb/dyu20/cell2location/rvagene


In [2]:
import sys
import os
# Every relative path used later (data/, models/, figures/) resolves against this
# directory, so the notebook only runs where this absolute path exists.
# NOTE(review): presumably this is also what makes `from rvagene...` importable — confirm.
os.chdir('/gpfs/data/awebb/dyu20/cell2location/rvagene/RVAgene')

In [3]:
import scvi
import scanpy as sc
import numpy as np
import pandas as pd
from datetime import datetime

from rvagene.rvagene import RVAgene
from rvagene.utils import open_data
import numpy as np
import torch
import matplotlib
import seaborn as sns
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from mpl_toolkits.mplot3d import Axes3D
from torch.utils.data import TensorDataset

  from .utilsextension import (
  from .utilsextension import (


In [83]:
# Run configuration: dataset name, model hyperparameters, training options and
# plotting options. Later cells read these values via opt['<key>'].
opt = dict(
    # dataset folder name under data/; also used in log, model and figure file names
    dataset="microglia_pseudotime_0.05",
    hidden_size=48,
    hidden_layer_depth=2,
    latent_length=2,
    batch_size=24,
    learning_rate=0.003,
    n_epochs=100,
    dropout_rate=0.2,
    # options listed by the original author: ADAM, SGD
    # NOTE(review): the value here is spelled 'Adam' — confirm which spelling RVAgene expects.
    optimizer='Adam',
    cuda=True,            # options: True, False
    print_every=30,
    clip=True,            # options: True, False
    max_grad_norm=5,
    loss='MSELoss',       # options: SmoothL1Loss, MSELoss
    palette='deep',       # seaborn color palette (not referenced in the cells visible here)
)

In [84]:
# Load the dataset with ratio_train=1; the second and fourth return values of
# open_data are discarded (presumably the held-out split — confirm in rvagene.utils).
X_train, _, y_train, _ = open_data('data', dataset=opt['dataset'], ratio_train=1)

# One feature (gene) name per row, read without a header line.
feature_names = pd.read_csv(
    'data/{}/feature_names.csv'.format(opt['dataset']),
    header=None,
)

# Number of distinct labels, then shift the labels in place so the smallest is 0.
num_classes = len(np.unique(y_train))
base = np.min(y_train)
if base != 0:
    y_train -= base

# Axis 1 of X_train is taken as the sequence length and axis 2 as the feature count.
sequence_length, number_of_features = X_train.shape[1:3]

# Wrap the array as a torch dataset; from_numpy shares memory with X_train.
train_dataset = TensorDataset(torch.from_numpy(X_train))

In [86]:
# Paths derived from the dataset name; the log is parsed later for the loss curve.
# NOTE(review): if RVAgene appends to an existing log file rather than overwriting it,
# re-running this cell leaves several runs in the log that the loss-curve cell parses — confirm.
log_path = 'data/{}/{}.log'.format(opt['dataset'], opt['dataset'])
model_path = "models/{}_trained_{}_epochs.pt".format(opt['dataset'], opt['n_epochs'])

# Create the checkpoint directory before training, so a missing folder cannot
# make torch.save fail only after every epoch has already run.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Build the model; every hyperparameter comes from `opt`.
rvagene_model = RVAgene(
    sequence_length=sequence_length,
    number_of_features=number_of_features,
    hidden_size=opt['hidden_size'],
    hidden_layer_depth=opt['hidden_layer_depth'],
    latent_length=opt['latent_length'],
    batch_size=opt['batch_size'],
    learning_rate=opt['learning_rate'],
    n_epochs=opt['n_epochs'],
    dropout_rate=opt['dropout_rate'],
    optimizer=opt['optimizer'],
    cuda=opt['cuda'],
    print_every=opt['print_every'],
    clip=opt['clip'],
    max_grad_norm=opt['max_grad_norm'],
    loss=opt['loss'],
    log_file=log_path,
)

# training
rvagene_model.fit(train_dataset)

# save trained model:
torch.save(rvagene_model.state_dict(), model_path)


Writing training log to given file, set log_file=None to log to STDOUT


In [87]:
# Restore the weights from the checkpoint file named after the dataset and epoch count.
checkpoint_path = "models/{}_trained_{}_epochs.pt".format(opt['dataset'], opt['n_epochs'])
trained_weights = torch.load(checkpoint_path)
rvagene_model.load_state_dict(trained_weights)

# Kept as the last expression of the cell so the model repr is displayed.
rvagene_model.eval()

RVAgene(n_epochs=100,batch_size=24,cuda=True)

In [88]:
################################################################################
# plot loss curve
################################################################################

# load log file
# The context manager closes the handle as soon as the lines are read; the
# original left the file open for the lifetime of the kernel.
with open('data/{}/{}.log'.format(opt['dataset'], opt['dataset']), 'r') as file:
    lines = file.read().splitlines()

In [89]:
# Collect one loss value per log line that contains the marker.
marker = "Average loss:"
loss = []
for line in lines:
    if marker in line:
        # Read the field that directly follows the marker (up to any further ':'),
        # so a ':' earlier in the line (e.g. a timestamp) cannot shift the value.
        value_text = line.split(marker, 1)[1].split(':')[0]
        loss.append(float(value_text.replace(' ', '')))

# Fail with a readable message instead of pandas' "Length mismatch" error
# when the log holds no loss lines.
if not loss:
    raise ValueError("no '{}' lines found in the training log".format(marker))

# One row per logged loss; `epoch` is simply the 0-based position in the log.
df = pd.DataFrame({'loss': loss, 'epoch': np.arange(0, len(loss))})

fig, axes = plt.subplots(figsize=(6,3))
sns.scatterplot(data=df, x='epoch', y='loss', ax=axes, color='gray')
sns.lineplot(data=df, x='epoch', y='loss', ax=axes, color='black')

sns.despine(ax=axes)
fig.tight_layout()

# Make sure the output folder exists before writing the figure.
os.makedirs('figures', exist_ok=True)
fig.savefig('figures/{}_loss.pdf'.format(opt['dataset']))

In [90]:
################################################################################
# clustering genes
################################################################################

# Run train_dataset through the trained model to get one latent vector per gene.
# NOTE(review): later cells truncate feature_names to z_run's row count, so z_run may
# hold fewer rows than X_train — presumably an incomplete final batch is dropped; confirm.
z_run = rvagene_model.transform(train_dataset)  ## latent embeddings of training genes

In [91]:
############### plot 2 dimensional z plot ##################

# One row per embedded gene: the two latent coordinates plus the gene name.
# feature_names is truncated to the number of rows actually present in z_run.
df = pd.DataFrame({'Z1': z_run[:,0], 'Z2': z_run[:,1]})
df['feature'] = feature_names[0].to_list()[0:df.shape[0]]

# Genes to highlight. Matching ignores case because the names here are upper-case
# while the feature names in this dataset are mixed-case (e.g. 'Irf4'), so an
# exact match would select nothing.
label_features = ['SPI1', 'ETS1']
wanted_labels = {name.upper() for name in label_features}
df_labels = df[df['feature'].astype(str).str.upper().isin(wanted_labels)]


fig, axes = plt.subplots(figsize=(4,4))
sns.scatterplot(data = df, x="Z1", y='Z2', ax=axes, color='black');
sns.despine(ax=axes);

In [92]:
#plt.legend(bbox_to_anchor=(1.01, 1),borderaxespad=0)
# Hide all tick marks and tick labels on the embedding plot. The call is made on
# the `axes` object itself so it cannot land on a different "current" figure.
axes.tick_params(
    axis='both',       # changes apply to both the x- and y-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    top=False,         # ticks along the top edge are off
    labelbottom=False, # no tick labels under the x-axis
    labelleft=False,   # no tick labels beside the y-axis
    left=False,        # ticks along the left edge are off
    right=False)       # ticks along the right edge are off

In [93]:
# Optional gene labels (disabled):
# for x,y,label in zip(df_labels['Z1'], df_labels['Z2'], df_labels['feature']):
#   plt.text(x=x, y=y, s=label, color='blue')

# Set the mathtext default style to 'regular' before the $...$ axis labels are drawn.
params = {'mathtext.default': 'regular'}
plt.rcParams.update(params)

# Name the two latent dimensions on the axes.
axes.set(xlabel="$Z_{1}$", ylabel="$Z_{2}$")

plt.tight_layout()
embedding_pdf = 'figures/{}_Z_embedding.pdf'.format(opt['dataset'])
plt.savefig(embedding_pdf)

In [94]:
################################################################################
# reconstruction
################################################################################

# reconstruct actual data:
smoothed = np.squeeze(rvagene_model.reconstruct(train_dataset))

# Rows of `smoothed` are labelled as pseudotime bins and columns as genes below.
# NOTE(review): np.squeeze drops every length-1 axis — assumes the result is 2-D; confirm.
n_bins, n_genes = smoothed.shape
gene_names = feature_names[0].to_list()[0:n_genes]

# df of reconstructed: one column per gene, one row per pseudotime bin
smoothed_df = pd.DataFrame(smoothed)
smoothed_df.columns = gene_names
smoothed_df['pseudotime_bin'] = np.arange(0, n_bins)
smoothed_melted = smoothed_df.melt(id_vars='pseudotime_bin')
smoothed_melted.columns = ['pseudotime_bin', 'feature', 'reconstructed']

# format original data in same way (first n_genes rows of X_train, transposed):
feature_df = pd.DataFrame(np.transpose(np.squeeze(X_train)[0:n_genes]))
feature_df.columns = gene_names
feature_df['pseudotime_bin'] = np.arange(0, n_bins)
feature_melted = feature_df.melt(id_vars='pseudotime_bin')
feature_melted.columns = ['pseudotime_bin', 'feature', 'original']

# combine table: both melted frames share the same row order, so the values line up by index
smoothed_melted['original'] = feature_melted['original']

# write reconstructed matrix
# add k-means cluster ID and Z1 Z2 to this table!!
# `df` must still be the latent-embedding frame (columns Z1/Z2) built in the embedding cell.
out_df = smoothed_df.drop('pseudotime_bin', axis=1).transpose()
out_df['Z1'] = df['Z1'].to_list()
out_df['Z2'] = df['Z2'].to_list()
# out_df['cluster'] = df['cluster'].to_list()

# Make sure the output folder exists before writing the table.
os.makedirs('data/reconstructed', exist_ok=True)
out_df.to_csv('data/reconstructed/{}_recon.csv'.format(opt['dataset']), header=True)

In [102]:
# Display the table that was just written to CSV: one row per gene, one column per
# pseudotime bin of reconstructed values, plus the Z1/Z2 latent coordinates.
out_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,Z1,Z2
Rb1cc1,0.354995,0.181817,0.154737,-0.041261,0.902913,0.248327,0.150416,0.064817,0.080104,0.068727,...,0.037757,-0.024408,-0.128440,-0.117420,-0.120234,-0.265012,-0.124757,-0.951205,-1.621922,0.226180
Pcmtd1,-0.190572,-0.348974,-0.255057,-0.357097,-0.449995,0.818296,-0.184235,-0.326926,-0.392982,-0.417168,...,-0.466736,-0.486659,-0.492392,-0.482218,-0.503659,-0.539469,-0.403344,-0.982736,0.417432,0.595964
2610203C22Rik,-0.622434,-0.598717,-0.615300,-0.604465,-0.600945,-0.612673,-0.638815,-0.620681,-0.635714,-0.651351,...,-0.683708,-0.708115,-0.730417,-0.729549,-0.717215,-0.742672,-0.778544,0.997765,0.713206,-2.605780
Vxn,0.595815,0.356926,0.118156,-0.489217,-0.952749,0.216517,0.095417,-0.085176,-0.495047,-0.475590,...,-0.549384,-0.566851,-0.563781,-0.547947,-0.550447,-0.586284,-0.369585,-0.969550,-0.074147,0.646476
Sgk3,0.243301,0.062179,0.029067,-0.337919,-0.872346,-0.280622,-0.170796,-0.205287,-0.254899,-0.256983,...,-0.328289,-0.058428,0.583810,-0.128297,-0.394226,-0.513155,-0.307526,-0.982514,0.047652,-0.346995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Olfr1361,0.971264,-0.887971,-0.922570,-1.021011,-1.026150,-0.976979,-0.980453,-0.988568,-0.998324,-1.009327,...,-1.020270,-1.021084,-1.021615,-1.022821,-1.024437,-1.025277,-1.024152,-1.021867,1.063984,3.099673
Irf4,0.971264,-0.887971,-0.922570,-1.021011,-1.026150,-0.976979,-0.980453,-0.988568,-0.998324,-1.009327,...,-1.020270,-1.021084,-1.021615,-1.022821,-1.024437,-1.025277,-1.024152,-1.021867,1.063984,3.099673
2810403G07Rik,0.971264,-0.887971,-0.922570,-1.021011,-1.026150,-0.976979,-0.980453,-0.988568,-0.998324,-1.009327,...,-1.020270,-1.021084,-1.021615,-1.022821,-1.024437,-1.025277,-1.024152,-1.021867,1.063984,3.099673
Zpld1,0.971264,-0.887971,-0.922570,-1.021011,-1.026150,-0.976979,-0.980453,-0.988568,-0.998324,-1.009327,...,-1.020270,-1.021084,-1.021615,-1.022821,-1.024437,-1.025277,-1.024152,-1.021867,1.063984,3.099673
