# Transcriptomic profile estimation with all samples

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy import io

from sklearn.metrics.pairwise import cosine_similarity

import sys
sys.path.append("../../scripts")
from noiseReductionMethodology import preprocessing, Raman_model
from analysis_pclda import LDA_model
from predictFunc import calcPrediction
from util import returnValues

## Load data

In [2]:
# Read the raw Raman fingerprint table, then hand it to the project's
# preprocessing routine; the raw frame is kept under its own name.
RAMAN = pd.read_csv("../../data/RAMAN_FINGERPRINT.csv")
RAMAN_PROCESSED = RAMAN.pipe(preprocessing)

# Preview the first rows of the preprocessed table.
RAMAN_PROCESSED.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,590,591,592,593,594,595,596,597,598,label
0,0.777427,0.798224,0.788404,0.760493,0.758397,0.730189,0.688064,0.620933,0.629037,0.658134,...,-1.786467,-1.778755,-1.773075,-1.801913,-1.817556,-1.802574,-1.790176,-1.800517,-1.835198,0
1,0.819071,0.83868,0.801314,0.734114,0.704192,0.677671,0.615264,0.532932,0.525684,0.557399,...,-1.942792,-1.988656,-2.001273,-2.014088,-2.021082,-2.015529,-1.991117,-1.972676,-1.962001,0
2,0.784357,0.799093,0.757752,0.689691,0.634893,0.599651,0.571955,0.523794,0.562298,0.624879,...,-1.856136,-1.828801,-1.792533,-1.794846,-1.801112,-1.815291,-1.843331,-1.893848,-1.902506,0
3,0.911177,0.94783,0.915633,0.857712,0.857826,0.800391,0.765022,0.717855,0.708535,0.691836,...,-1.736215,-1.723609,-1.73077,-1.767129,-1.791136,-1.787874,-1.769603,-1.755394,-1.78372,0
4,0.876973,0.864326,0.801489,0.735828,0.716139,0.703013,0.68159,0.651481,0.649721,0.639855,...,-1.958845,-1.9959,-2.03007,-2.019414,-2.005512,-1.999898,-2.020077,-2.048122,-2.082585,0


In [3]:
# Split the preprocessed spectra into one frame per integer label 0..max(label);
# each piece gets a fresh 0-based index. A label value with no rows yields an empty frame.
label_column = RAMAN_PROCESSED["label"]
GROUP = [
    RAMAN_PROCESSED.loc[label_column == label_id].reset_index(drop=True)
    for label_id in range(label_column.max() + 1)
]

In [4]:
# Load the MATLAB workspace and average "spombe_transcriptome" over axis 2
# (presumably the replicate axis — TODO confirm against the data description).
mat = io.loadmat("../../data/Matlab codes/data.mat")
transcriptome_mean = mat["spombe_transcriptome"].mean(axis=2)

# One row per entry along axis 0; each row is labelled with its own position.
TRANSCRIPTOME = pd.DataFrame(transcriptome_mean)
TRANSCRIPTOME["label"] = range(len(TRANSCRIPTOME))

# Keep a CSV copy of the averaged table alongside the other data files.
TRANSCRIPTOME.to_csv("../../data/TRANSCRIPTOME.csv", index=False)
TRANSCRIPTOME.head()

Consider mio5.varmats_from_mat to split file into single variable files
  matfile_dict = MR.get_variables(variable_names)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6551,6552,6553,6554,6555,6556,6557,6558,6559,label
0,19.22585,29.38335,79.488,32.8048,7.101195,1.21101,87.6489,23.23125,160.2995,4.79919,...,31606.09,35050.05,52729.1,13152.785,6155.145,194.078,8268.355,276840.0,55582.95,0
1,18.2585,25.5971,92.78095,34.47605,6.667625,0.74667,74.50975,22.12605,222.686,9.638689,...,18407.5,42399.185,60035.2,13023.2,4271.42,263.221,11688.1,396968.0,48771.65,1
2,19.97885,21.57395,77.10295,27.3735,20.46375,0.84722,56.3451,8.691435,193.848,17.95625,...,22123.35,90745.95,94191.15,25344.4,8367.4,0.0,22223.8,1052662.5,91330.85,2
3,22.185055,38.24995,104.96795,41.9848,4.73316,0.0,71.33305,16.7727,196.402,5.328506,...,0.0,74301.0,118475.45,27993.15,8382.475,0.0,17990.795,312665.5,96659.45,3
4,11.56685,23.8783,101.449,40.48195,10.51816,0.0,55.27985,16.3729,299.4645,7.22826,...,0.0,60735.75,88174.75,19185.9,5774.665,0.0,16664.15,545737.5,52562.45,4


In [5]:
# Number of rows in the transcriptome table; used later to name the per-row summary columns.
clusterNum = len(TRANSCRIPTOME)

# Method names and colours — presumably paired by position (PCA -> gray, NRM -> red);
# verify against the plotting code that consumes them.
nameList = ["PCA", "NRM"]
colorList = ["gray", "#B51700"]

## Perform transcriptomic profile estimation

In [6]:
# Cut-off percentages to sweep: 80.0, 80.5, ..., 99.5 (the stop value 100 is excluded).
cutPercentages = np.arange(start=80, stop=100, step=0.5)
cutPercentages

array([80. , 80.5, 81. , 81.5, 82. , 82.5, 83. , 83.5, 84. , 84.5, 85. ,
       85.5, 86. , 86.5, 87. , 87.5, 88. , 88.5, 89. , 89.5, 90. , 90.5,
       91. , 91.5, 92. , 92.5, 93. , 93.5, 94. , 94.5, 95. , 95.5, 96. ,
       96.5, 97. , 97.5, 98. , 98.5, 99. , 99.5])

In [7]:
# Sweep over every cut-off in cutPercentages and, for each one, predict the
# transcriptome from both the PCA and the NRM representation of the Raman data.
# Per cut-off, predict_list receives [PCA prediction, NRM prediction] (in that order),
# while the two other lists record the model's k_hat and percent_tilde attributes.
predict_list = []
percent_to_dim_PCA = []
percent_NRM = []

for cutPercent in cutPercentages:
    # Minimal progress indicator: one dot per processed cut-off.
    print(".", end="")
    raman_model = Raman_model(RAMAN_PROCESSED, cutRange=cutPercent, cutMode="percent_fixedDim")
    raman_model.calcTransformation()

    out = []
    # Order matters: index 0 is the PCA result, index 1 the NRM result.
    for DATA in [raman_model.RAMAN_PCA, raman_model.RAMAN_NRM]:
        lda_model = LDA_model()
        DATA_LDA = lda_model.fit_transform(DATA)

        # Collapse to one averaged row per label, then re-attach "label" as a column
        # numbered 0..n-1 (groupby moved the original labels into the index).
        DATA_LDA = DATA_LDA.groupby("label").mean()
        DATA_LDA["label"] = np.arange(DATA_LDA.shape[0])

        # NOTE(review): the meaning of n_components=0 is defined in predictFunc.calcPrediction,
        # which is not visible here — confirm before changing it.
        PREDICT = calcPrediction(TRANSCRIPTOME, DATA_LDA, n_components=0, max_iter=50000)

        out.append(PREDICT)

    predict_list.append(out)
    percent_to_dim_PCA.append(raman_model.k_hat)
    percent_NRM.append(raman_model.percent_tilde)

........................................

In [19]:
# Build one summary row per cut-off comparing the PCA- and NRM-based predictions
# with the measured transcriptome:
#   * PRESS_*      : squared differences summed over all rows and columns
#   * PRESS_*_c<k> : the same squared differences summed within row k only
#   * cosine_*     : row-wise cosine similarity, averaged over rows
# The "*_diff" columns are NRM minus PCA.
row_labels = [f"{i:.1f}%" for i in cutPercentages]

# The target does not depend on the cut-off, so it is extracted once instead of inside
# every comprehension (assumes util.returnValues has no side effects — verify).
transcriptome_values = returnValues(TRANSCRIPTOME)


def _squared_error_per_row(prediction):
    """Return, for each row, the sum of squared differences between prediction and target."""
    return np.sum((returnValues(prediction) - transcriptome_values) ** 2, axis=1)


def _mean_cosine_similarity(prediction):
    """Return the cosine similarity between matching prediction/target rows, averaged over rows."""
    per_row = [cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))[0]
               for a, b in zip(returnValues(prediction), transcriptome_values)]
    return np.hstack(per_row).mean()


# Each entry of predict_list is [PCA prediction, NRM prediction].
out_pca_perCon = np.vstack([_squared_error_per_row(data[0]) for data in predict_list])
out_nrm_perCon = np.vstack([_squared_error_per_row(data[1]) for data in predict_list])

# The totals are the row sums of the per-row matrices; no second pass over the predictions.
press_pca = out_pca_perCon.sum(axis=1)
press_nrm = out_nrm_perCon.sum(axis=1)

cosine_pca = np.array([_mean_cosine_similarity(data[0]) for data in predict_list])
cosine_nrm = np.array([_mean_cosine_similarity(data[1]) for data in predict_list])

summary_core = pd.DataFrame(
    {
        "dim_PCA": percent_to_dim_PCA,
        "percent_NRM": percent_NRM,
        "PRESS_PCA": press_pca,
        "PRESS_NRM": press_nrm,
        "PRESS_diff": press_nrm - press_pca,
    },
    index=row_labels,
)

# Explicit frames replace `.loc[:, [new labels]] = array` enlargement; the constructor
# also raises if the number of per-row columns does not equal clusterNum.
summary_pca_perCon = pd.DataFrame(
    out_pca_perCon,
    index=row_labels,
    columns=[f"PRESS_PCA_c{i + 1}" for i in range(clusterNum)],
)
summary_nrm_perCon = pd.DataFrame(
    out_nrm_perCon,
    index=row_labels,
    columns=[f"PRESS_NRM_c{i + 1}" for i in range(clusterNum)],
)

summary_cosine = pd.DataFrame(
    {
        "cosine_PCA": cosine_pca,
        "cosine_NRM": cosine_nrm,
        "cosine_diff": cosine_nrm - cosine_pca,
    },
    index=row_labels,
)

# Column order: core metrics, PCA per-row, NRM per-row, cosine metrics.
SUMMARY_TABLE = pd.concat(
    [summary_core, summary_pca_perCon, summary_nrm_perCon, summary_cosine],
    axis=1,
)

SUMMARY_TABLE.head()

Unnamed: 0,dim_PCA,percent_NRM,PRESS_PCA,PRESS_NRM,PRESS_diff,PRESS_PCA_c1,PRESS_PCA_c2,PRESS_PCA_c3,PRESS_PCA_c4,PRESS_PCA_c5,...,PRESS_NRM_c4,PRESS_NRM_c5,PRESS_NRM_c6,PRESS_NRM_c7,PRESS_NRM_c8,PRESS_NRM_c9,PRESS_NRM_c10,cosine_PCA,cosine_NRM,cosine_diff
80.0%,3,84.340141,1771563000000.0,1771551000000.0,-11247400.0,45138670000.0,39538550000.0,637459900000.0,240106900000.0,101720800000.0,...,240100200000.0,101721100000.0,118694800000.0,64548920000.0,57060160000.0,39219230000.0,428078000000.0,0.909213,0.909214,2.376565e-07
80.5%,3,84.340141,1771563000000.0,1771551000000.0,-11247400.0,45138670000.0,39538550000.0,637459900000.0,240106900000.0,101720800000.0,...,240100200000.0,101721100000.0,118694800000.0,64548920000.0,57060160000.0,39219230000.0,428078000000.0,0.909213,0.909214,2.376565e-07
81.0%,3,84.340141,1771563000000.0,1771551000000.0,-11247400.0,45138670000.0,39538550000.0,637459900000.0,240106900000.0,101720800000.0,...,240100200000.0,101721100000.0,118694800000.0,64548920000.0,57060160000.0,39219230000.0,428078000000.0,0.909213,0.909214,2.376565e-07
81.5%,3,84.340141,1771563000000.0,1771551000000.0,-11247400.0,45138670000.0,39538550000.0,637459900000.0,240106900000.0,101720800000.0,...,240100200000.0,101721100000.0,118694800000.0,64548920000.0,57060160000.0,39219230000.0,428078000000.0,0.909213,0.909214,2.376565e-07
82.0%,3,84.340141,1771563000000.0,1771551000000.0,-11247400.0,45138670000.0,39538550000.0,637459900000.0,240106900000.0,101720800000.0,...,240100200000.0,101721100000.0,118694800000.0,64548920000.0,57060160000.0,39219230000.0,428078000000.0,0.909213,0.909214,2.376565e-07


In [20]:
# Persist the summary table; index=True writes the row index as the first CSV column.
SUMMARY_TABLE.to_csv("../../results/SUMMARY_PERCENT_fixedDim_SPOMBE.csv", index=True)