In [64]:
# Importing libraries
import pandas as pd
from scipy.stats import f_oneway
import numpy as np

In [2]:
## Reading and cleaning data

# Read the wide CPM table (one row per gene, one column per drug/subject
# sample) and normalise the gene-annotation column names
cpm_ngs_pivot = pd.read_csv("Data/cpm_NGS_1_deiden.csv", index_col=0)
cpm_ngs_pivot = cpm_ngs_pivot.rename(columns={'ENSEMBL':'ensembl_gene_id','ENTREZID':'entrez_gene_id',"GENENAME":'gene','SYMBOL':'symbol'})

# Separate the gene ID lookup table from the expression data table
geneID = cpm_ngs_pivot[['ensembl_gene_id', 'entrez_gene_id','gene', 'symbol']].drop_duplicates()

cpm_ngs_pivot = cpm_ngs_pivot.drop(columns=['entrez_gene_id','gene', 'symbol'])

# Wide -> long: one row per (gene, sample column)
cpm_ngs = cpm_ngs_pivot.melt(id_vars=['ensembl_gene_id'],
                             var_name='drug_sub', value_name='expression_val')

# Create separate drug and subject columns from the sample column name:
# everything except the last 6 characters is the drug label, and the last
# character is the subject id (so this only supports single-character ids)
cpm_ngs['drug'] = cpm_ngs['drug_sub'].str[:-6]
cpm_ngs['subject'] = cpm_ngs['drug_sub'].str[-1:]

# String formatting. .str.strip() returns a new Series, so the result has to
# be assigned back for the whitespace removal to take effect.
cpm_ngs['ensembl_gene_id'] = cpm_ngs['ensembl_gene_id'].str.strip()
cpm_ngs['subject'] = cpm_ngs['subject'].str.strip()
cpm_ngs['drug'] = cpm_ngs['drug'].replace({'Drug A ': 'A', 'Drug B ': 'B', 'Drug C': 'C', 'Saline': 'S'})

# Ordering columns
cpm_ngs = cpm_ngs[['ensembl_gene_id','drug','subject','expression_val']]

cpm_ngs.head()


Unnamed: 0,ensembl_gene_id,drug,subject,expression_val
0,ENSMUSG00000000001,A,1,30.418821
1,ENSMUSG00000000003,A,1,0.0
2,ENSMUSG00000000028,A,1,0.790703
3,ENSMUSG00000000031,A,1,0.139536
4,ENSMUSG00000000037,A,1,0.930239


In [3]:
# Reshape to one row per (gene, subject) with one column per drug group, then
# pair each treatment group (A, B, C) with the saline control (S)
cpm_ngs_pivot = cpm_ngs.pivot_table(
    values='expression_val',
    index=['ensembl_gene_id', 'subject'],
    columns='drug',
)

cpm_ngs_AS, cpm_ngs_BS, cpm_ngs_CS = (
    cpm_ngs_pivot[[treatment, 'S']] for treatment in ('A', 'B', 'C')
)

# Preview the first rows of each treatment/control pair
print(
    *(paired.head() for paired in (cpm_ngs_AS, cpm_ngs_BS, cpm_ngs_CS)),
    sep='\n',
)

drug                                A          S
ensembl_gene_id    subject                      
ENSMUSG00000000001 1        30.418821  30.675560
                   2        32.582558  31.004931
                   3        28.755032  29.336898
                   4        33.336779  29.452225
                   5        31.400374  30.045231
drug                                B          S
ensembl_gene_id    subject                      
ENSMUSG00000000001 1        30.165752  30.675560
                   2        27.359610  31.004931
                   3        33.329096  29.336898
                   4        29.058901  29.452225
                   5        32.975345  30.045231
drug                                C          S
ensembl_gene_id    subject                      
ENSMUSG00000000001 1        26.778892  30.675560
                   2        30.934139  31.004931
                   3        41.162374  29.336898
                   4        35.649002  29.452225
                   5

In [4]:
# Function for nonparametric Mann-Whitney U test 
from scipy.stats import mannwhitneyu

def mannwhitneyuFun(df, alpha=0.05):
    """Run a Mann-Whitney U test per gene between the first two columns of ``df``.

    The test is called with scipy's default settings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first index level identifies the gene. For each gene, the
        values in the first column are compared against the values in the
        second column.
    alpha : float, default 0.05
        p-value threshold; genes with ``p_val < alpha`` are flagged significant.

    Returns
    -------
    pandas.DataFrame
        One row per gene with columns ``gene``, ``stat_val`` (test statistic),
        ``p_val`` and ``significance`` (1 if ``p_val < alpha``, else 0).

    Notes
    -----
    The p-values are not adjusted for multiple testing.
    """
    genes = []
    stat_vals = []
    p_vals = []

    # Group on the first index level rather than walking df.index.levels[0]:
    # the levels can still list genes that were filtered out of df, and
    # looking those up with .loc raises KeyError.
    for gene, gene_rows in df.groupby(level=0):
        statistic, p_value = mannwhitneyu(gene_rows.iloc[:, 0].tolist(), gene_rows.iloc[:, 1].tolist())

        genes.append(gene)
        stat_vals.append(statistic)
        p_vals.append(p_value)

    # Build the per-gene result table and flag genes below the threshold
    mannwhitneyu_df = pd.DataFrame({'gene': genes, 'stat_val': stat_vals, 'p_val': p_vals})
    mannwhitneyu_df['significance'] = (mannwhitneyu_df['p_val'] < alpha).astype(int)
    return mannwhitneyu_df


In [5]:
# Run the per-gene test for each treatment-vs-saline pair (A-S, B-S, C-S)
mannwhitneyu_AS, mannwhitneyu_BS, mannwhitneyu_CS = (
    mannwhitneyuFun(paired) for paired in (cpm_ngs_AS, cpm_ngs_BS, cpm_ngs_CS)
)

# Save each result table as CSV in the working directory
mannwhitneyu_AS.to_csv('mannwhitneyu_AS')
mannwhitneyu_BS.to_csv('mannwhitneyu_BS')
mannwhitneyu_CS.to_csv('mannwhitneyu_CS')

# Number of genes flagged significant in each comparison, one per line
print(
    *(result['significance'].sum()
      for result in (mannwhitneyu_AS, mannwhitneyu_BS, mannwhitneyu_CS)),
    sep='\n',
)

1257
1133
1134


In [73]:

# Attach each gene's mean expression per group to its test result, and prefix
# the test columns with the comparison name so the three can sit side by side
cpm_ngs_AS_sig = (
    mannwhitneyu_AS
    .merge(cpm_ngs_AS.groupby('ensembl_gene_id').mean(), left_on='gene', right_on='ensembl_gene_id')
    .rename(columns={'stat_val': 'AS_stat_val', 'p_val': 'AS_p_val', 'significance': 'AS_sig'})
)

cpm_ngs_BS_sig = (
    mannwhitneyu_BS
    .merge(cpm_ngs_BS.groupby('ensembl_gene_id').mean(), left_on='gene', right_on='ensembl_gene_id')
    .rename(columns={'stat_val': 'BS_stat_val', 'p_val': 'BS_p_val', 'significance': 'BS_sig'})
)

cpm_ngs_CS_sig = (
    mannwhitneyu_CS
    .merge(cpm_ngs_CS.groupby('ensembl_gene_id').mean(), left_on='gene', right_on='ensembl_gene_id')
    .rename(columns={'stat_val': 'CS_stat_val', 'p_val': 'CS_p_val', 'significance': 'CS_sig'})
)

# Combine the three comparisons; the saline mean 'S' is kept from the A-S
# table only, so it is dropped from the other two before merging
mannwhitneyu_sig = (cpm_ngs_AS_sig.merge(cpm_ngs_BS_sig.drop(columns='S'), on='gene', how='outer')
                                          .merge(cpm_ngs_CS_sig.drop(columns='S'), on='gene', how='outer'))

# Keep genes significant in at least one comparison. .copy() makes the
# filtered frame independent of the merged one, so the column assignments
# below do not raise SettingWithCopyWarning.
mannwhitneyu_sig = mannwhitneyu_sig[mannwhitneyu_sig['AS_sig']+mannwhitneyu_sig['BS_sig']+mannwhitneyu_sig['CS_sig']>0].copy()

# Add relative values: treatment mean as a percentage of the saline mean
mannwhitneyu_sig['A/S'] = (mannwhitneyu_sig['A']/mannwhitneyu_sig['S']*100).round(2)
mannwhitneyu_sig['B/S'] = (mannwhitneyu_sig['B']/mannwhitneyu_sig['S']*100).round(2)
mannwhitneyu_sig['C/S'] = (mannwhitneyu_sig['C']/mannwhitneyu_sig['S']*100).round(2)

# Remove na, inf (e.g. from a saline mean of 0), organize columns
mannwhitneyu_result = mannwhitneyu_sig.drop(columns=['AS_stat_val', 'BS_stat_val', 'CS_stat_val', 'AS_sig', 'BS_sig', 'CS_sig'])
mannwhitneyu_result = mannwhitneyu_result.replace([np.inf, -np.inf], np.nan)
mannwhitneyu_result = mannwhitneyu_result[['gene', 'AS_p_val', 'BS_p_val', 'CS_p_val', 'A/S', 'B/S', 'C/S', 'A', 'B', 'C', 'S']].dropna()

mannwhitneyu_result

Unnamed: 0,gene,AS_p_val,BS_p_val,CS_p_val,A/S,B/S,C/S,A,B,C,S
16,ENSMUSG00000000126,0.150794,0.031746,1.000000,109.59,117.16,99.21,27.971144,29.904231,25.322633,25.523550
18,ENSMUSG00000000131,0.031746,1.000000,0.015873,103.91,99.34,98.05,97.311426,93.030722,91.827829,93.652032
26,ENSMUSG00000000167,0.309524,0.690476,0.031746,109.89,111.55,112.53,1.595239,1.619316,1.633501,1.451624
28,ENSMUSG00000000171,0.007937,0.055556,0.690476,94.77,96.63,101.52,54.387774,55.454540,58.258584,57.388724
34,ENSMUSG00000000202,0.007937,0.309524,0.015873,83.24,95.61,83.49,16.814456,19.311600,16.863648,20.199335
...,...,...,...,...,...,...,...,...,...,...,...
55268,ENSMUSG00000118124,0.031141,0.009701,0.044909,1609.65,2153.06,2078.69,1.402243,1.875638,1.810844,0.087115
55350,ENSMUSG00000118206,0.007937,0.150794,0.401965,29.89,58.75,62.88,0.032744,0.064352,0.068878,0.109544
55363,ENSMUSG00000118219,0.095238,0.031746,0.222222,127.14,122.74,114.59,11.173575,10.787075,10.071385,8.788700
55387,ENSMUSG00000118243,0.009701,1.000000,0.690476,7.29,79.52,129.21,0.006992,0.076230,0.123868,0.095869


In [80]:
# Number of genes flagged significant in all three comparisons (A-S, B-S, C-S)
len(mannwhitneyu_sig.loc[
    (mannwhitneyu_sig['AS_sig'] + mannwhitneyu_sig['BS_sig'] + mannwhitneyu_sig['CS_sig']) == 3
])

51

In [72]:
# Save the cleaned per-gene result table as CSV; to_csv is called with its
# defaults, and the target path has no file extension
mannwhitneyu_result.to_csv('Output/mannwhitneyu_result')