In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

gene_expression_data = pd.read_csv('dataset/gene_expression_data.csv', index_col=0)
gene_expression_data = gene_expression_data.T
gene_expression_data = np.log2(gene_expression_data)

def add_state(row):
    if '_S' in str(row.name):  # Ensure it's a string for the 'in' operation
        return 0
    elif '_P' in str(row.name):
        return 1

gene_expression_data['state'] = gene_expression_data.apply(add_state, axis=1)

### Z-test


In [2]:
stable = gene_expression_data[gene_expression_data['state'] == 0]
progressing = gene_expression_data[gene_expression_data['state'] == 1]

In [3]:

from scipy.stats import norm

def two_sample_z_test_for_features(data_group1, data_group2):

    num_features = data_group1.shape[1]
    p_values = np.zeros(num_features)
    for i in range(num_features):
        feature_data_group1 = data_group1.iloc[:, i]
        feature_data_group2 = data_group2.iloc[:, i]
        mean_group1 = np.mean(feature_data_group1)
        mean_group2 = np.mean(feature_data_group2)
        
        std_dev_group1 = np.std(feature_data_group1)
        std_dev_group2 = np.std(feature_data_group2)
        
        sample_size_group1 = len(feature_data_group1)
        sample_size_group2 = len(feature_data_group2)
        
        # Assuming null hypothesis: mean of group1 - mean of group2 = 0
        mean_difference = mean_group1 - mean_group2
        
        # Calculate the standard error of the difference between means
        std_error_difference = np.sqrt((std_dev_group1**2 / sample_size_group1) + 
                                        (std_dev_group2**2 / sample_size_group2))
        
        # Compute z-statistic
        z_statistic = mean_difference / std_error_difference
        
        # Compute p-value
        p_value = norm.cdf(z_statistic)
        
        p_values[i] = p_value
    
    # Sort indices based on p-values
    return p_values

# Example gene expression data for two groups (replace with your actual data)

p_values = two_sample_z_test_for_features(stable, progressing)

# # Print the p-values
print("P-values:", p_values)

P-values: [0.99711782 0.85741222 0.4012185  ... 0.97320467 0.18156975 0.        ]


  z_statistic = mean_difference / std_error_difference


In [4]:
# Create a df to store feature and p-value
z_test_data = pd.DataFrame({'Feature': list(gene_expression_data.columns), 'P-value': p_values})
z_test_data

Unnamed: 0,Feature,P-value
0,1007_s_at,0.997118
1,1053_at,0.857412
2,117_at,0.401219
3,121_at,0.445731
4,1255_g_at,0.969285
...,...,...
54671,AFFX-ThrX-M_at,0.870082
54672,AFFX-TrpnX-3_at,0.710107
54673,AFFX-TrpnX-5_at,0.973205
54674,AFFX-TrpnX-M_at,0.181570


In [5]:
relevant_z_test = z_test_data[z_test_data['P-value'] < 0.01]
relevant_z_test

Unnamed: 0,Feature,P-value
24,1552275_s_at,0.003488
38,1552295_a_at,0.001715
40,1552299_at,0.005588
88,1552375_at,0.001143
113,1552411_at,0.008763
...,...,...
54444,48808_at,0.002311
54571,65133_i_at,0.001285
54577,65585_at,0.000960
54629,AFFX-HUMGAPDH/M33197_3_at,0.000082


### Wilcoxon

In [6]:
### Wilkoxon test
from scipy.stats import mannwhitneyu

def wilcoxon_rank_sum_test(data_group1, data_group2):
    num_features = data_group1.shape[1]
    p_values = np.zeros(num_features)
    for i in range(num_features):
        feature_group1 = data_group1.iloc[:, i]
        feature_group2 = data_group2.iloc[:, i]
        u_statistic, p_value = mannwhitneyu(feature_group1, feature_group2)
        p_values[i] = p_value

    return p_values

p_values = wilcoxon_rank_sum_test(stable, progressing)
wilcoxon_data = pd.DataFrame({'Feature' : list(gene_expression_data.columns), 'P-values' : p_values})

wilcoxon_data
relevant_wicoxon_test = wilcoxon_data[wilcoxon_data['P-values'] < 0.01]

In [7]:
relevant_wicoxon_test

Unnamed: 0,Feature,P-values
0,1007_s_at,1.853688e-03
4,1255_g_at,9.132303e-04
6,1316_at,1.140697e-04
21,1552271_at,1.585177e-03
24,1552275_s_at,2.003067e-03
...,...,...
54665,AFFX-r2-Ec-bioD-3_at,1.234688e-03
54666,AFFX-r2-Ec-bioD-5_at,7.604822e-04
54667,AFFX-r2-P1-cre-3_at,5.008457e-05
54668,AFFX-r2-P1-cre-5_at,4.118920e-04


In [8]:
z_test_features = relevant_z_test['Feature']

In [9]:
wilcoxon_features = relevant_wicoxon_test['Feature']

### Pearson Correlation

In [10]:
z_test_features


24                    1552275_s_at
38                    1552295_a_at
40                      1552299_at
88                      1552375_at
113                     1552411_at
                   ...            
54444                     48808_at
54571                   65133_i_at
54577                     65585_at
54629    AFFX-HUMGAPDH/M33197_3_at
54675                        state
Name: Feature, Length: 1015, dtype: object

In [11]:
selected = z_test_features