In [None]:
import json
import pickle
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.colors import to_hex, to_rgb
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, StandardScaler, scale

In [None]:
# Input table (parquet) and the column whose values define the classes.
rnafile = "../data/gene_FPKM_transposed_UMR75.parquet"
metafactor = "TissueClean"

df = pd.read_parquet(rnafile)

# Distinct values of the class column, in order of first appearance.
class_names = df[metafactor].unique().tolist()
print(class_names)

In [None]:
# Take every column from position 14 onward as the rnaseq values (cast to
# float64), then apply a log2(x + 1) transform one column at a time.
data = (
    df.iloc[:, 14:]
    .astype("float64")
    .apply(lambda col: np.log2(col + 1.0))
)
# Disabled step, kept for reference:
# rnadf.iloc[:, 1:] = scale(rnadf.iloc[:, 1:], axis=1)
print(f"DataFrame shape: {df.shape}, rnaseq data shape: {data.shape}")
# Output recorded from an earlier run:
# DataFrame shape: (19415, 37349), rnaseq data shape: (19415, 37335)

In [None]:
# Project the log-transformed matrix onto its first two principal components
# and save the coordinates next to the class label of each row.
#
# Standardisation is currently disabled:
# X = StandardScaler().fit_transform(data)
#
# random_state pins the result. With the default svd_solver="auto",
# scikit-learn may choose the randomized solver for large inputs (the shape
# recorded in the previous cell is 19415 x 37335), and without a seed the
# coordinates would differ from one run to the next.
X_pca = PCA(n_components=2, random_state=0).fit_transform(data)
pcadf = pd.DataFrame({str(metafactor): df[metafactor].values,
                      "PC1": X_pca[:, 0],
                      "PC2": X_pca[:, 1]})
fname = "../results/arabidopsis/arabidopsis_pca_coordinates.csv"
# Create the output directory first so the PCA result is not lost to a
# missing-folder error at write time.
Path(fname).parent.mkdir(parents=True, exist_ok=True)
pcadf.to_csv(fname, index=False)
print(f"PCA Done. Wrote coordinates to {fname}")

In [None]:
# Build the colour tables used by the scatter plot:
#   chex -- one hex colour per entry of class_names (chex[i] belongs to
#           class_names[i]; the legend is built from that pairing)
#   cols -- one hex colour per row of df
# NOTE(review): tab10 defines 10 colours; confirm there are at most 10
# classes, otherwise several classes will share a colour.
cmap = plt.get_cmap("tab10")
labels = df[metafactor]
labelenc = LabelEncoder().fit(class_names)
class_codes = labelenc.transform(class_names).tolist()
chex = [to_hex(cmap(code)) for code in class_codes]
lcodes = labelenc.transform(labels)
# LabelEncoder numbers classes in sorted order, while chex follows the order
# of class_names (first appearance). Indexing chex directly with an encoder
# code therefore picks another class's colour unless class_names happens to
# be sorted. Resolve colours through the code itself instead.
color_by_code = dict(zip(class_codes, chex))
cols = [color_by_code[code] for code in lcodes.tolist()]

In [None]:
# Scatter of the two PCA components, one point per row, coloured via `cols`,
# with a figure-level legend listing every class.
fig = plt.figure(figsize=(9, 9))
gs = fig.add_gridspec(nrows=1, ncols=1, height_ratios=[1.])
ax = fig.add_subplot(gs[0, 0])

# One legend patch per class; chex[idx] is the colour paired with class_names[idx].
handles = []
for idx, class_label in enumerate(class_names):
    handles.append(mpl.patches.Patch(color=chex[idx], label=class_label))

ax.scatter(X_pca[:, 0], X_pca[:, 1], c=cols, s=10.0)
ax.set(xlabel="PC 1", ylabel="PC 2", title="PCA Scatter Plot")

# Leave room on the right-hand side for the legend.
fig.subplots_adjust(right=0.9)
fig.legend(handles=handles, title="Tissue Type",
           loc="center right", borderaxespad=0.2)

plt.show()

In [None]:
# PCA scatter coloured by the 'AboveBelow' column of df.
# seaborn is imported once in the notebook's import cell.
fig_above_below, ax_above_below = plt.subplots(figsize=(15, 15))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1],
                data=df, hue='AboveBelow', legend="full",
                ax=ax_above_below)
# x and y are plain arrays, so label the axes explicitly.
ax_above_below.set(xlabel="PC 1", ylabel="PC 2",
                   title="PCA Scatter Plot: AboveBelow")
# Repositioning rebuilds the legend, which drops the title seaborn had set;
# pass it again.
ax_above_below.legend(loc='center left', bbox_to_anchor=(0.8, 0.5),
                      title='AboveBelow')
plt.show()

In [None]:
# PCA scatter coloured by the 'VegetativeRepro' column of df.
# seaborn is imported once in the notebook's import cell.
fig_veg_repro, ax_veg_repro = plt.subplots(figsize=(15, 15))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1],
                data=df, hue='VegetativeRepro', legend="full",
                ax=ax_veg_repro)
# x and y are plain arrays, so label the axes explicitly.
ax_veg_repro.set(xlabel="PC 1", ylabel="PC 2",
                 title="PCA Scatter Plot: VegetativeRepro")
# Repositioning rebuilds the legend, which drops the title seaborn had set;
# pass it again.
ax_veg_repro.legend(loc='center left', bbox_to_anchor=(0.8, 0.5),
                    title='VegetativeRepro')
plt.show()

In [None]:
# PCA scatter coloured by the 'TissueClean' column of df.
# seaborn is imported once in the notebook's import cell.
fig_tissue, ax_tissue = plt.subplots(figsize=(15, 15))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1],
                data=df, hue='TissueClean', legend="full",
                ax=ax_tissue)
# x and y are plain arrays, so label the axes explicitly.
ax_tissue.set(xlabel="PC 1", ylabel="PC 2",
              title="PCA Scatter Plot: TissueClean")
# Repositioning rebuilds the legend, which drops the title seaborn had set;
# pass it again.
ax_tissue.legend(loc='center left', bbox_to_anchor=(0.8, 0.5),
                 title='TissueClean')
plt.show()