In [1]:
## Start with the imports

# Numpy
import numpy as np
import numpy.random as random

# Pandas
import pandas as pd

# Plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
# Set up plotly's offline notebook mode before any figure is drawn.
# NOTE(review): download_plotlyjs and iplot are not referenced in the cells
# visible here -- confirm whether plotly_util relies on them before removing.
init_notebook_mode()

# Scikit-learn
from sklearn.decomposition import PCA, FastICA
from sklearn.manifold import MDS, LocallyLinearEmbedding, TSNE

# Custom plotting
from plotly_util import scatter_matrix, scatter_matrix_3d

# Umbrella
from umbrella import Umbrella

# Dataset-specific
from timecourse_util import timecourse_marker

In [2]:
# Build the synthetic "umbrella" dataset used by the embedding demos below.
# Later cells read its `.matrix` (input to the embeddings) and `.marker`
# (passed to scatter_matrix for point styling).
# NOTE(review): 1000 is presumably the number of sample points -- confirm
# against the Umbrella class in umbrella.py.
umbrella = Umbrella(1000)

In [3]:
# Show the dataset with its own plot() helper (implemented in the umbrella
# module, not visible in this notebook).
umbrella.plot()

In [4]:
# Project the umbrella matrix onto its first two principal components.
umbrella_pca = PCA(n_components=2).fit_transform(umbrella.matrix)

# Scatter plot of the projection, with explicit axis labels for the components.
scatter_matrix(
    umbrella_pca,
    marker=umbrella.marker,
    title="Principal component analysis",
    x_label="PC1",
    y_label="PC2",
)

In [6]:
# Multidimensional scaling of the umbrella matrix down to two dimensions.
# MDS is initialised from a random configuration, so the seed is pinned to
# keep the figure identical across "Restart Kernel -> Run All".
umbrella_mds = MDS(n_components=2, random_state=0).fit_transform(umbrella.matrix)
scatter_matrix(umbrella_mds, umbrella.marker, title="Multidimensional scaling")

In [9]:
# Independent component analysis of the umbrella matrix (two components).
# FastICA draws its initial unmixing matrix at random when w_init is not
# given, so the seed is pinned to make the plotted components reproducible.
umbrella_ica = FastICA(n_components=2, random_state=0).fit_transform(umbrella.matrix)
scatter_matrix(umbrella_ica, umbrella.marker, title="Independent component analysis")

In [10]:
# t-SNE embedding of the umbrella matrix in two dimensions.
# The optimisation is stochastic and different seeds can settle in different
# local minima, so the seed is pinned for a reproducible figure.
umbrella_tsne = TSNE(n_components=2, random_state=0).fit_transform(umbrella.matrix)
# Title corrected: the method is "t-distributed" (not "T-distribution")
# stochastic neighbor embedding.
scatter_matrix(umbrella_tsne, umbrella.marker, title="T-distributed stochastic neighbor embedding")

# Working with data

In [11]:
%%bash
# Report the on-disk size of the expression matrix, then preview the first
# three comma-separated fields of its first ten lines.
csv_path="data/expression_matrix.csv"
du -h "$csv_path"
head -n 10 "$csv_path" | cut -d ',' -f 1-3

900K	data/expression_matrix.csv
,ENSG00000000460.12,ENSG00000001630.11
T0_CT_A01,0.740922,23.743
T0_CT_A03,57.5785,75.9004
T0_CT_A05,3.93587,7.70763
T0_CT_A06,0,48.2161
T0_CT_A07,0,18.1033
T0_CT_A08,1.75868,6.75033
T0_CT_A10,0,46.4447
T0_CT_A11,0,0.0316664
T0_CT_B01,0.762205,35.4536


In [12]:
# Load the expression matrix; the first CSV column holds the row labels and
# becomes the DataFrame index.
expression = pd.read_csv("data/expression_matrix.csv", index_col=0)

# DataFrame.info() writes its summary to stdout and returns None, so call it
# directly -- wrapping it in print() only appended a stray "None" line.
expression.info()

# Last expression of the cell: rich display of the first five rows.
expression.head()

<class 'pandas.core.frame.DataFrame'>
Index: 271 entries, T0_CT_A01 to T72_CT_H12
Columns: 575 entries, ENSG00000000460.12 to ENSG00000271430.1
dtypes: float64(575)
memory usage: 1.2+ MB
None


Unnamed: 0,ENSG00000000460.12,ENSG00000001630.11,ENSG00000003989.12,ENSG00000005448.12,ENSG00000010292.8,ENSG00000011426.6,ENSG00000012048.14,ENSG00000018408.10,ENSG00000020922.7,ENSG00000022267.12,...,ENSG00000267519.1,ENSG00000267918.1,ENSG00000268310.1,ENSG00000268518.1,ENSG00000268949.1,ENSG00000269028.2,ENSG00000269468.1,ENSG00000269821.1,ENSG00000271043.1,ENSG00000271430.1
T0_CT_A01,0.740922,23.743,0.65881,3.14399,3.17807,5.51907,2.90642,2.53347,4.22463,0.0,...,0.0,54.8203,4.18345,0.0,0.0,312.074,0.0,2.00933,53.2734,0.550444
T0_CT_A03,57.5785,75.9004,0.642023,1.39663,2.71632,61.5427,0.29388,7.98845,4.60183,10.1639,...,0.0,17.9368,36.1613,0.0,0.0,179.821,0.0,0.037636,31.8427,1.02949
T0_CT_A05,3.93587,7.70763,0.796284,121.457,1.87148,78.4978,16.9961,9.05609,10.9743,0.9417,...,0.0,37.7717,65.7479,0.0,0.0,189.272,0.0,0.153773,29.7249,0.0
T0_CT_A06,0.0,48.2161,0.760456,43.1346,0.766509,0.048436,2.30681,20.0881,20.6257,0.0,...,0.0,48.2486,0.0,0.0,0.0,261.004,0.0,0.081869,43.0606,1.75778
T0_CT_A07,0.0,18.1033,11.5342,0.0,3.04019,53.133,11.919,101.073,72.7712,3.68148,...,0.0,328.906,3.09561,1.27603,0.0,204.578,0.0,0.263967,36.3855,72.5246


In [13]:
# Derive the marker object that the scatter_matrix calls below use to style
# the expression plots.
# NOTE(review): timecourse_marker lives in timecourse_util (not shown);
# presumably it encodes the time point found in the row labels (T0, T72, ...)
# -- confirm against that module.
expression_marker = timecourse_marker(expression)

In [14]:
# Reduce the expression profiles to their first two principal components
# and plot them with the time-course markers.
expression_pca = PCA(n_components=2).fit_transform(expression)
scatter_matrix(
    expression_pca,
    expression_marker,
    title="Cell expression profile PCA",
)

In [15]:
# Two-component ICA of the expression profiles.
# FastICA's initial unmixing matrix is drawn at random when w_init is not
# given; the seed is pinned so the figure is reproducible on a fresh kernel.
expression_ica = FastICA(n_components=2, random_state=0).fit_transform(expression)
scatter_matrix(expression_ica, expression_marker, title="Cell expression profile ICA")

In [16]:
# Two-dimensional t-SNE embedding of the expression profiles.
# t-SNE is stochastic and seed-dependent; the seed is pinned so repeated runs
# produce the same layout.
expression_tsne = TSNE(n_components=2, random_state=0).fit_transform(expression)
scatter_matrix(expression_tsne, expression_marker, title="Cell expression profile T-SNE")

In [17]:
# Two-dimensional MDS embedding of the expression profiles.
# MDS starts from a random configuration; the seed is pinned so the figure is
# reproducible on "Restart Kernel -> Run All".
expression_mds = MDS(n_components=2, random_state=0).fit_transform(expression)
scatter_matrix(expression_mds, expression_marker, title="Cell expression profile MDS")

In [18]:
# Modified locally linear embedding (MLLE) of the expression profiles.
# With the default eigen_solver="auto", scikit-learn may select the ARPACK
# solver, which starts from a random vector; pinning random_state keeps the
# embedding reproducible across kernel restarts.
expression_mlle = LocallyLinearEmbedding(
    n_neighbors=10,
    n_components=2,
    method="modified",
    random_state=0,
).fit_transform(expression)

In [19]:
# Plot the two-dimensional MLLE embedding with the time-course markers.
scatter_matrix(
    expression_mlle,
    expression_marker,
    title="Cell expression profile MLLE",
)