In [1]:
# Importing libraries
import pandas as pd
from scipy.stats import f_oneway

In [2]:
## Reading and cleaning data

# Reading data
cpm_ngs_pivot = pd.read_csv("Data/cpm_NGS_1_deiden.csv", index_col=0)
cpm_ngs_pivot = cpm_ngs_pivot.rename(columns={'ENSEMBL':'ensembl_gene_id','ENTREZID':'entrez_gene_id',"GENENAME":'gene','SYMBOL':'symbol'})

# Seperate Gene ID Lookup table and expression data table
geneID = cpm_ngs_pivot[['ensembl_gene_id', 'entrez_gene_id','gene', 'symbol']].drop_duplicates()

cpm_ngs_pivot = cpm_ngs_pivot.drop(columns=['entrez_gene_id','gene', 'symbol'])

cpm_ngs = cpm_ngs_pivot.melt(id_vars=['ensembl_gene_id'],
                             var_name='drug_sub', value_name='expression_val')

# Creating seperate drug and subject columns
cpm_ngs['drug'] = cpm_ngs['drug_sub'].str[:-6]
cpm_ngs['subject'] = cpm_ngs['drug_sub'].str[-1:]

# String formatting
cpm_ngs['ensembl_gene_id'].str.strip()
cpm_ngs['subject'].str.strip()
cpm_ngs['drug'] = cpm_ngs['drug'].replace({'Drug A ': 'A', 'Drug B ': 'B', 'Drug C': 'C', 'Saline': 'S'})

# Ordering columns
cpm_ngs = cpm_ngs[['ensembl_gene_id','drug','subject','expression_val']]

# Printing first few rows
cpm_ngs.head()


Unnamed: 0,ensembl_gene_id,drug,subject,expression_val
0,ENSMUSG00000000001,A,1,30.418821
1,ENSMUSG00000000003,A,1,0.0
2,ENSMUSG00000000028,A,1,0.790703
3,ENSMUSG00000000031,A,1,0.139536
4,ENSMUSG00000000037,A,1,0.930239


In [26]:
import pandas as pd

# Example DataFrame
data = {
    'ID1': ['A', 'A', 'B', 'B', 'C'],
    'ID2': ['X', 'Y', 'X', 'Y', 'X'],
    'Value1': [10, 20, 30, 40, 50],
    'Value2': [15, 25, 35, 45, 55]
}

df = pd.DataFrame(data)

# Function to split DataFrame based on sorted columns
def split_df(df, split_columns):
    # Initialize dictionary to store split DataFrames
    dfs = {}
    # Initialize variable to keep track of previous key
    prev_key = None
    # Iterate over rows
    for index, row in df.iterrows():
        # Create key tuple from values in split_columns
        key = tuple(row[split_columns])
        # If key is different from previous key, create new DataFrame
        if key != prev_key:
            dfs[key] = pd.DataFrame(columns=df.columns)
            prev_key = key
        # Append row to DataFrame corresponding to key
        dfs[key] = pd.concat([dfs[key], pd.DataFrame([row], columns=df.columns)], ignore_index=True)
    return dfs

# Split DataFrame based on 'ID1' and 'ID2' columns
dfs = split_df(df.sort_values(['ID1', 'ID2']), ['ID1', 'ID2'])

# Output the split DataFrames
for key, value in dfs.items():
    print(f"DataFrame for ID1={key[0]} and ID2={key[1]}:")
    print(value)
    print()


DataFrame for ID1=A and ID2=X:
  ID1 ID2 Value1 Value2
0   A   X     10     15

DataFrame for ID1=A and ID2=Y:
  ID1 ID2 Value1 Value2
0   A   Y     20     25

DataFrame for ID1=B and ID2=X:
  ID1 ID2 Value1 Value2
0   B   X     30     35

DataFrame for ID1=B and ID2=Y:
  ID1 ID2 Value1 Value2
0   B   Y     40     45

DataFrame for ID1=C and ID2=X:
  ID1 ID2 Value1 Value2
0   C   X     50     55



In [28]:
def split_df(df, split_columns):
    # Initialize dictionary to store split DataFrames
    dfs = {}
    # Initialize variable to keep track of previous key
    prev_key = None
    # Iterate over rows
    for index, row in df.iterrows():
        # Create key tuple from values in split_columns
        key = tuple(row[split_columns])
        # If key is different from previous key, create new DataFrame
        if key != prev_key:
            dfs[key] = pd.DataFrame(columns=df.columns)
            prev_key = key
        # Append row to DataFrame corresponding to key
        dfs[key] = pd.concat([dfs[key], pd.DataFrame([row], columns=df.columns)], ignore_index=True)
    return dfs

dfs = split_df(df.sort_values(['ensembl_gene_id', 'drug']), ['ensembl_gene_id', 'drug'])

# Output the split DataFrames
for key, value in dfs.items():
    print(f"DataFrame for ID1={key[0]} and ID2={key[1]}:")
    print(value)
    print()


KeyError: 'ensembl_gene_id'

In [19]:
from scipy.stats import kruskal

# Example data
group1 = [12, 14, 15, 17, 18]
group2 = [9, 10, 11, 13, 16]
group3 = [8, 10, 11, 12, 14]

# Performing Kruskal-Wallis test
statistic, p_value = kruskal(group1, group2, group3)

# Output the results
print("Kruskal-Wallis Test Statistic:", statistic)
print("P-value:", p_value)

# Interpret the results
alpha = 0.05
if p_value < alpha:
    print("Reject the null hypothesis. There are significant differences among groups.")
else:
    print("Fail to reject the null hypothesis. There are no significant differences among groups.")


Kruskal-Wallis Test Statistic: 5.579856115107913
P-value: 0.061425632866321314
Fail to reject the null hypothesis. There are no significant differences among groups.


In [18]:
grouped_cpm_ngs = cpm_ngs.groupby(['ensembl_gene_id', 'drug'])[['expression_val']].mean().reset_index()
grouped_cpm_ngs



Unnamed: 0,ensembl_gene_id,drug,expression_val
0,ENSMUSG00000000001,A,31.298713
1,ENSMUSG00000000001,B,30.577741
2,ENSMUSG00000000001,C,33.623121
3,ENSMUSG00000000001,S,30.102969
4,ENSMUSG00000000003,A,0.000000
...,...,...,...
222139,ENSMUSG00000118392,S,0.000000
222140,ENSMUSG00000118393,A,0.000000
222141,ENSMUSG00000118393,B,0.000000
222142,ENSMUSG00000118393,C,0.000000


In [29]:
# ANOVA test

genes = []
f_vals = []
p_vals = []

for gene in cpm_ngs['ensembl_gene_id'].drop_duplicates():
    geneFilter = cpm_ngs[cpm_ngs['ensembl_gene_id'] == gene]

    drugA = geneFilter.loc[geneFilter['drug'] == 'A', 'expression_val'].tolist()
    drugB = geneFilter.loc[geneFilter['drug'] == 'B', 'expression_val'].tolist()
    drugC = geneFilter.loc[geneFilter['drug'] == 'C', 'expression_val'].tolist()
    saline = geneFilter.loc[geneFilter['drug'] == 'S', 'expression_val'].tolist()
    # f_val, p_val = f_oneway(drugA, drugB, drugC, saline)
    stat, p_val = kruskal(group1, group2, group3)


    genes.append(gene)
    f_vals.append(stat)
    p_vals.append(p_val)

    cpm_ngs = cpm_ngs[cpm_ngs['ensembl_gene_id'] != gene]


# Creating new dataframe of signifiance of mean differences for each gene
mean_diff_sig = pd.DataFrame({'gene': genes, 'f_val':f_vals, 'p_val':p_vals})

# Significance column indicating 1 if p_val < 0.05
mean_diff_sig['significance'] = (mean_diff_sig['p_val'] < 0.05).astype(int)

# Export table
mean_diff_sig.to_csv("Output/mean_diff_sig.csv", index=False)

# Show the table
mean_diff_sig


Unnamed: 0,gene,f_val,p_val,significance
0,ENSMUSG00000000001,5.579856,0.061426,0
1,ENSMUSG00000000003,5.579856,0.061426,0
2,ENSMUSG00000000028,5.579856,0.061426,0
3,ENSMUSG00000000031,5.579856,0.061426,0
4,ENSMUSG00000000037,5.579856,0.061426,0
...,...,...,...,...
55531,ENSMUSG00000118389,5.579856,0.061426,0
55532,ENSMUSG00000118390,5.579856,0.061426,0
55533,ENSMUSG00000118391,5.579856,0.061426,0
55534,ENSMUSG00000118392,5.579856,0.061426,0


In [30]:
mean_diff_sig.to_csv("nonParam.csv")

In [204]:
# Creating new dataframe of signifiance of mean differences for each gene
mean_diff_sig = pd.DataFrame({'gene': genes, 'f_val':f_vals, 'p_val':p_vals})

# Significance column indicating 1 if p_val < 0.05
mean_diff_sig['p_sig'] = (cpm_ngs['p_val'] < 0.05).astype(int)
mean_diff_sig['f_sig'] = (cpm_ngs['f_val'] > 4.0).astype(int)
mean_diff_sig['sig'] = (mean_diff_sig['p_sig'] & mean_diff_sig['f_sig']).astype(int)

sig_mean_diff = mean_diff_sig[mean_diff_sig['sig'] == 1]
sig_mean_diff = sig_mean_diff[['gene', 'f_val',	'p_val', 'sig']]

# Show the table
print(sig_mean_diff.shape)
sig_mean_diff

KeyError: 'p_val'