In [None]:
# Imports and collates expression data with metadata.
# Calculates z-scores and writes the results out as input for the classifier script.

In [None]:
import pandas as pd
import numpy as np
import scipy.stats
import os.path

In [None]:
# Configuration

# Path of the gzipped, tab-separated expression table this notebook processes.
# Every output file name is derived from this name.
# Alternative input kept for reference:
#   'salmon.merged.gene_tpm.log2_tmp_plus_1.downsized.tsv.gz'
expression_data_file = "salmon.merged.gene_tpm.log2_tmp_plus_1.retention_group_filtered.tsv.gz"


In [None]:
# Load the tab-separated expression table into a DataFrame.
print(f"Reading in: {expression_data_file}")
expression_data = pd.read_csv(expression_data_file, sep="\t")

# Report the table dimensions. The first two columns are not counted as
# cell lines (they are carried through as identifier columns later on).
n_genes, n_columns = expression_data.shape
print(f'Number of cell lines: {n_columns - 2}')
print(f'Number of genes: {n_genes}')

In [None]:
# Calculate per-gene z-scores across cell lines.
#
# Columns 0-1 are carried through unchanged as identifier columns; every
# remaining column is treated as one cell line's expression values.
id_columns = expression_data.iloc[:, :2]
expression_values = expression_data.iloc[:, 2:]

# axis=1 standardises each row (gene) independently. scipy's default ddof=0
# means the population standard deviation is used, matching the
# 'expression_pop_std' values written out by this notebook.
# NOTE: a gene whose values are identical in every cell line has zero
# standard deviation, so its z-scores come out as NaN.
z_values = scipy.stats.zscore(expression_values.to_numpy(), axis=1)

# Wrap the array with the ORIGINAL row index and column labels. pd.concat
# with axis=1 aligns rows on the index, so a frame with a fresh RangeIndex
# would be misaligned (and NaN-padded) whenever expression_data does not
# carry the default 0..n-1 index. Keeping the labels also removes the need
# to renumber and then reassign the column names afterwards.
z_scores = pd.concat(
    [
        id_columns,
        pd.DataFrame(
            z_values,
            index=expression_values.index,
            columns=expression_values.columns,
        ),
    ],
    axis=1,
)

In [None]:
# Per-gene mean and population standard deviation (ddof=0) across all
# cell-line columns (everything after the first two columns).
# The script that makes novel predictions does not use the training data,
# so it needs these values to calculate z-scores itself.
means = expression_data.iloc[:, 2:].mean(axis=1)
pop_std = expression_data.iloc[:, 2:].std(axis=1, ddof=0)

# One row per gene: identifier plus the two summary statistics.
means_stds = pd.DataFrame(
    {
        'gene_id': expression_data['gene_id'],
        'expression_mean': means,
        'expression_pop_std': pop_std,
    }
)

In [None]:
# Write both result tables as gzipped TSV files.
# Output names are built from the input file's base name (directory part
# dropped, '.tsv.gz' removed), so the files are written relative to the
# current working directory.
output_stem = os.path.basename(expression_data_file).replace('.tsv.gz', '')

# (file-name label, table to write, progress-message prefix)
outputs = (
    ('z_scores', z_scores, 'Writing results to: '),
    ('means_stds', means_stds,
     'Writing out gene expression means and standard deviations to: '),
)

for label, table, message in outputs:
    outfile = f'{output_stem}.{label}.tsv.gz'
    print(f'{message}{outfile}')
    table.to_csv(outfile, sep="\t", index=False, compression='gzip')

In [None]:
# Completion marker: printed only when every cell above has run.
print("Done")