In [70]:
import sys
from timeit import default_timer as timer
import numpy as np
import pandas as pd
from scipy.stats import zscore

In [71]:
def read_file(file_name: str):
    """
    Load a tab-separated table and return it transposed.

    The first line of the file is used as the header and the first column as
    the index; the parsed table is then transposed, so the file's rows become
    the columns of the returned frame and vice versa.

    If anything goes wrong while reading, a message and the underlying error
    are printed and the interpreter is asked to exit with status 1.

    :param file_name: Path of the tab-separated file to load
    :return: Transposed DataFrame built from the file
    """
    try:
        table = pd.read_csv(file_name, sep='\t', header=0, index_col=0)
        return table.transpose()
    except Exception as err:
        print(f'Error in reading file {file_name}')
        print(err)
        sys.exit(1)


In [72]:
# Path of the tab-separated expression matrix, given relative to the working directory.
single_cell_file = '../data/normalized_mat.tsv'
# read_file transposes the table on load; in the preview below the rows are
# cell barcodes and the columns are gene names.
main_df = read_file(single_cell_file)

# Preview the first five rows.
main_df.head()

Unnamed: 0,Xkr4,Sox17,Mrpl15,Lypla1,Tcea1,Rgs20,Atp6v1h,Rb1cc1,4732440D04Rik,Pcmtd1,...,Gm27151,D930032P07Rik,Cnnm1,Pkd2l1,Scd4,Elovl3,Sorcs3,Nrap,Pnliprp1,Eno4
BPK.12x.4NQO_AAACCTGCACCCAGTG.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BPK.12x.4NQO_AAACCTGCAGCTTAAC.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BPK.12x.4NQO_AAACCTGGTGTGCGTC.1,0.0,0.0,0.0,0.777323,0.0,0.0,0.777323,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BPK.12x.4NQO_AAACCTGGTTGAACTC.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.794018,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BPK.12x.4NQO_AAACGGGAGGATGGTC.1,0.0,0.0,0.0,1.202458,0.0,0.0,0.0,1.202458,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [115]:
# Work on a copy so that later reassignments of single_df leave main_df as loaded.
single_df = main_df.copy()
# (rows, columns) of the working frame.
single_df.shape

(9089, 17506)

In [116]:
# Treat zeros as missing so they are left out of the z-score statistics below.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
single_df = single_df.replace(0, np.nan)

# Standardise each column independently. nan_policy='omit' makes zscore ignore
# NaN entries when computing the mean and standard deviation; those entries
# stay NaN in the result.
z_single_df = single_df.apply(zscore, axis=0, nan_policy='omit')

In [117]:
# Preview the first 10 rows of the z-scored frame.
z_single_df.head(10)

Unnamed: 0,Xkr4,Sox17,Mrpl15,Lypla1,Tcea1,Rgs20,Atp6v1h,Rb1cc1,4732440D04Rik,Pcmtd1,...,Gm27151,D930032P07Rik,Cnnm1,Pkd2l1,Scd4,Elovl3,Sorcs3,Nrap,Pnliprp1,Eno4
BPK.12x.4NQO_AAACCTGCACCCAGTG.1,,,,,,,,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACCTGCAGCTTAAC.1,,,,,,,,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACCTGGTGTGCGTC.1,,,,0.12492,,,0.134734,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACCTGGTTGAACTC.1,,,,,,,,,,-0.048023,...,,,,,,,,,,
BPK.12x.4NQO_AAACGGGAGGATGGTC.1,,,,1.250153,,,,0.86804,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACGGGAGGGCTCTC.1,,,,,,,,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACGGGAGTAACCCT.1,,,0.502048,,,,,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACGGGCATGGGACA.1,,,,,,,,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACGGGGTCTGCAAT.1,,,,,0.475916,,,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACGGGTCAATCTCT.1,,,,,0.396412,,,-0.41347,,,...,,,,,,,,,,


In [120]:
# Pull out the first row and keep only its non-missing entries.
first_row = single_df.iloc[0].dropna()

# Number of entries that remain for that row.
first_row.shape

(1042,)

In [123]:
# Peek at the first row only: head(1) limits the iteration to a single pass,
# so no explicit break is needed.
for idx, row in z_single_df.head(1).iterrows():
    print(idx)
    # Keep only the entries that are not NaN.
    row = row.dropna()
    print(row)

BPK.12x.4NQO_AAACCTGCACCCAGTG.1
Snhg6      1.879656
Arfgef1    2.143150
Rpl7       1.023823
Fam135a    2.847266
Phf3       1.834056
             ...   
mt-Nd3     1.271309
mt-Nd4l    1.046616
mt-Nd4     0.998091
mt-Nd5     0.575517
mt-Cytb   -0.270252
Name: BPK.12x.4NQO_AAACCTGCACCCAGTG.1, Length: 1042, dtype: float64


In [111]:
# Take the first 10 rows as a small sample and map zeros to NaN, so the cell
# gives the same result whether or not single_df has already had its zeros
# replaced. `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
first_10 = single_df.iloc[:10, :].replace(0, np.nan)
# head(11) asks for more rows than the sample holds, so the whole sample is shown.
first_10.head(11)

Unnamed: 0,Xkr4,Sox17,Mrpl15,Lypla1,Tcea1,Rgs20,Atp6v1h,Rb1cc1,4732440D04Rik,Pcmtd1,...,Gm27151,D930032P07Rik,Cnnm1,Pkd2l1,Scd4,Elovl3,Sorcs3,Nrap,Pnliprp1,Eno4
BPK.12x.4NQO_AAACCTGCACCCAGTG.1,,,,,,,,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACCTGCAGCTTAAC.1,,,,,,,,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACCTGGTGTGCGTC.1,,,,0.777323,,,0.777323,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACCTGGTTGAACTC.1,,,,,,,,,,0.794018,...,,,,,,,,,,
BPK.12x.4NQO_AAACGGGAGGATGGTC.1,,,,1.202458,,,,1.202458,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACGGGAGGGCTCTC.1,,,,,,,,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACGGGAGTAACCCT.1,,,1.003354,,,,,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACGGGCATGGGACA.1,,,,,,,,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACGGGGTCTGCAAT.1,,,,,1.09107,,,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACGGGTCAATCTCT.1,,,,,1.058914,,,0.663522,,,...,,,,,,,,,,


In [112]:
# Standardise every column of the 10-row sample, leaving NaN entries out of
# the statistics. In the displayed result, columns with only two non-missing
# values come out as -1 / 1 and columns with a single value come out as NaN.
first_10 = first_10.apply(lambda col: zscore(col, nan_policy='omit'), axis=0)
first_10.head(11)

Unnamed: 0,Xkr4,Sox17,Mrpl15,Lypla1,Tcea1,Rgs20,Atp6v1h,Rb1cc1,4732440D04Rik,Pcmtd1,...,Gm27151,D930032P07Rik,Cnnm1,Pkd2l1,Scd4,Elovl3,Sorcs3,Nrap,Pnliprp1,Eno4
BPK.12x.4NQO_AAACCTGCACCCAGTG.1,,,,,,,,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACCTGCAGCTTAAC.1,,,,,,,,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACCTGGTGTGCGTC.1,,,,-1.0,,,,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACCTGGTTGAACTC.1,,,,,,,,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACGGGAGGATGGTC.1,,,,1.0,,,,1.0,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACGGGAGGGCTCTC.1,,,,,,,,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACGGGAGTAACCCT.1,,,,,,,,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACGGGCATGGGACA.1,,,,,,,,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACGGGGTCTGCAAT.1,,,,,1.0,,,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACGGGTCAATCTCT.1,,,,,-1.0,,,-1.0,,,...,,,,,,,,,,


In [114]:
# Toy vector: three observed values followed by seven missing ones.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
ar = np.array([1.208354, 1.493988, 2.077548] + [np.nan] * 7)

# With nan_policy='omit' the mean and standard deviation come from the three
# observed values only, and the missing positions stay NaN in the result.
zscore(ar, nan_policy='omit', axis=0)

array([-1.0641758 , -0.27453927,  1.33871507,         nan,         nan,
               nan,         nan,         nan,         nan,         nan])

In [126]:
# Confirm the (rows, columns) dimensions of the working frame.
single_df.shape

(9089, 17506)

In [128]:
# Start from a copy of single_df so the new frame has the same index and columns.
newdf = single_df.copy()

# Blank out every cell. `np.nan` is used because the `np.NaN` alias was
# removed in NumPy 2.0.
newdf.iloc[:, :] = np.nan
newdf.head()

Unnamed: 0,Xkr4,Sox17,Mrpl15,Lypla1,Tcea1,Rgs20,Atp6v1h,Rb1cc1,4732440D04Rik,Pcmtd1,...,Gm27151,D930032P07Rik,Cnnm1,Pkd2l1,Scd4,Elovl3,Sorcs3,Nrap,Pnliprp1,Eno4
BPK.12x.4NQO_AAACCTGCACCCAGTG.1,,,,,,,,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACCTGCAGCTTAAC.1,,,,,,,,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACCTGGTGTGCGTC.1,,,,,,,,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACCTGGTTGAACTC.1,,,,,,,,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACGGGAGGATGGTC.1,,,,,,,,,,,...,,,,,,,,,,
