In [217]:
#Arabidopsis  RNA-seq and protein level data for 30 different tissues was taken from Mergner et al 2020 https://www.nature.com/articles/s41586-020-2094-2
#the 2nd sheet (gene expression) of supplementary table 2 was saved as a csv. Location: ../../data/genes/mergner2020_RNA-seq_CVs.csv

#this notebook calculates the mean and SD and then CV (coefficient of variation, SD/mean) allowing ranking of genes by how stably expressed they are

##can us https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.variation.html for calculating cv in future

In [1]:
import pandas as pd

In [2]:
def calculate_cv(df, data_type):
    """function to calculate the cv (SD/mean) of the RNA or protein_df. Provide the dataframe containing columns of interest."""

    #reverse log transform
    df2 = df.copy()
    df2.iloc[:,3:] = df.iloc[:,3:].apply(lambda x: 2**x)
    #change NaN values to 0 expression (mRNA quantities are displayed as TPM and a cut-off value of 1 TPM was used as lower limit of detection across all samples)
    df = df2.copy()
    df.iloc[:,3:] = df2.iloc[:,3:].fillna(0)

    #calculate mean
    df = df.assign(mean=df.iloc[:,3:].mean(axis=1))
    df.rename(columns={'mean':f'{data_type}_mean'}, inplace=True)
    #calculate SD
    df = df.assign(sd=df.iloc[:,3:].std(axis=1))
    df.rename(columns={'sd':f'{data_type}_sd'}, inplace=True)
    #calculate CV
    df = df.assign(cv=df[f'{data_type}_sd'] / df[f'{data_type}_mean'])
    df.rename(columns={'cv':f'{data_type}_cv'}, inplace=True)
    
    return df
    

In [3]:
def save_csv(input_df, output):
    """function to extract useful columns and save them as a csv"""
    df = input_df.iloc[:, [0,1,2,35]]
    #remove nan
    df2 = df[df.iloc[:,3].notnull()]
    df_sorted = df2.sort_values(df.columns[3]).reset_index(drop=True)
    df_sorted.to_csv(output, index=False)
    
    

In [4]:
original_csv = '../../data/genes/mergner2020_RNA-seq_CVs.csv'
df = pd.read_csv(csv, header=1)
RNA_df = df[['AGI code', 'transcription (iTAK)_class', 'transcription (iTAK)_family', 'TPM_SP', 'TPM_PT','TPM_ST','TPM_CP','TPM_SQ','TPM_EB','TPM_SD','TPM_SDIMB','TPM_P','TPM_ND','TPM_IND','TPM_CLLF','TPM_LFD','TPM_LFP','TPM_LFPT','TPM_SCLF','TPM_TPM_RT','TPM_FLPD','TPM_FL','TPM_SQSP',
             'TPM_SQV','TPM_RKD2','TPM_CIM','TPM_CC3','TPM_CC10','TPM_CT','TPM_HY','TPM_RTTP','TPM_RTUZ','TPM_CTSAM'
]]
protein_df = df[['AGI code', 'transcription (iTAK)_class', 'transcription (iTAK)_family','iBAQ_SP','iBAQ_PT','iBAQ_ST','iBAQ_CP','iBAQ_SQ','iBAQ_EB','iBAQ_SD','iBAQ_SDIMB',
                 'iBAQ_P','iBAQ_ND','iBAQ_IND','iBAQ_CLLF','iBAQ_LFD','iBAQ_LFP','iBAQ_LFPT','iBAQ_SCLF','iBAQ_iBAQ_RT','iBAQ_FLPD','iBAQ_FL','iBAQ_SQSP','iBAQ_SQV','iBAQ_RKD2','iBAQ_CIM',
                 'iBAQ_CC3','iBAQ_CC10','iBAQ_CT','iBAQ_HY','iBAQ_RTTP','iBAQ_RTUZ','iBAQ_CTSAM'
]]

NameError: name 'csv' is not defined

In [5]:
RNA_output = '../../data/genes/RNA_CVs.csv'
protein_output = '../../data/genes/proteincontent_CVs.csv'

In [6]:
rna_reversetransformed = calculate_cv(RNA_df, 'RNA')
protein_reersetransformed = calculate_cv(protein_df, 'proteincontent')

NameError: name 'RNA_df' is not defined

In [7]:
save_csv(rna_reversetransformed, RNA_output)
save_csv(protein_reersetransformed, protein_output)

NameError: name 'rna_reversetransformed' is not defined

In [8]:
rna_reversetransformed

NameError: name 'rna_reversetransformed' is not defined

In [235]:
protein_reersetransformed

Unnamed: 0,AGI code,transcription (iTAK)_class,transcription (iTAK)_family,iBAQ_SP,iBAQ_PT,iBAQ_ST,iBAQ_CP,iBAQ_SQ,iBAQ_EB,iBAQ_SD,...,iBAQ_CC3,iBAQ_CC10,iBAQ_CT,iBAQ_HY,iBAQ_RTTP,iBAQ_RTUZ,iBAQ_CTSAM,proteincontent_mean,proteincontent_sd,proteincontent_cv
0,AT1G01010,TF,NAC,2.215542e+05,0.000000e+00,3.270319e+06,1.578287e+05,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,6.755954e+05,0.000000e+00,0.000000e+00,1.693880e+06,9.396083e+05,0.000000e+00,4.466096e+05,7.622711e+05,1.706795
1,AT1G01020,,,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,
2,AT1G01030,TF,B3,1.205416e+06,2.907605e+06,1.677787e+05,1.027709e+06,7.518055e+05,2.134591e+05,0.000000e+00,...,0.000000e+00,0.000000e+00,4.658158e+05,0.000000e+00,3.112253e+05,0.000000e+00,1.117389e+06,3.349197e+05,6.204351e+05,1.852489
3,AT1G01040,,,1.738881e+05,1.190305e+06,6.004148e+05,6.577592e+05,5.272033e+05,4.609018e+05,2.409376e+05,...,3.509174e+06,7.077597e+05,3.730217e+05,2.896598e+05,1.144432e+06,6.138387e+05,7.597683e+05,6.187560e+05,6.275489e+05,1.014211
4,AT1G01050,,,3.084275e+06,2.252351e+06,2.879526e+06,7.547294e+05,1.386100e+06,2.009214e+06,0.000000e+00,...,0.000000e+00,5.216678e+06,0.000000e+00,4.430076e+06,3.252235e+06,0.000000e+00,5.085295e+06,2.796509e+06,2.448531e+06,0.875567
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25280,ATMG01350,,,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,
25281,ATMG01360,,,4.464515e+07,5.333946e+07,1.087319e+08,3.601514e+07,4.733487e+07,2.196495e+07,1.357642e+07,...,6.560034e+07,3.240246e+07,1.693847e+07,3.733176e+07,3.720002e+07,6.919670e+07,1.538691e+07,4.064264e+07,3.757992e+07,0.924643
25282,ATMG01370,,,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,
25283,ATMG01400,,,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,
