In [None]:
import sys
sys.path.append("../")

import dataInterpreter as dt
import enrichmentAnalysis as ea
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy
pd.options.mode.chained_assignment = None  # default='warn'
from sklearn.ensemble import RandomForestClassifier


# Directory prefix for the input tables; the file names read later are appended to it.
# NOTE(review): hard-coded absolute Windows path — the notebook only runs on a machine
# with this exact directory layout. Consider making it configurable.
path = "C:\\Users\\Pedro\\Documents\\BicPAMS\\bicpams_5.1\\data\\latecovid\\"

In [None]:
# Read both tab-separated tables (first column as index) and transpose each one,
# so that the rows of the files become the columns of the resulting frames.
filtered_data_01, filtered_data_05 = (
    pd.read_csv(path + file_name, index_col = 0, sep = '\t').T
    for file_name in ('data-p01.csv', 'data-p05.csv')
)

In [None]:
# Map every (cell type, condition) pair to an integer label, numbered in order of
# first appearance in the sample index, and build the label vector `y` in one pass.
# An unknown cell type raises KeyError on purpose so unexpected samples are noticed.
classes = {'NHBE': {}, 'A549': {}, 'Calu3': {}, 'Biopsy': {}}
y = []
i = 0  # next unused label; after the loop it equals the number of distinct classes
for c in filtered_data_01.index:
    info = dt.get_info_from_name(c)
    cell_type, condition = info['Cell Type'], info['Condition']
    if condition not in classes[cell_type]:
        classes[cell_type][condition] = i
        i += 1
    y.append(classes[cell_type][condition])

# Human-readable name for each label, indexed by label.
# Sized from the number of labels actually assigned instead of a hard-coded 13,
# which would raise IndexError with more classes and leave blanks with fewer.
y_names = [' '] * i
for c_type, conditions in classes.items():
    for cond, label in conditions.items():
        y_names[label] = c_type + ' ' + cond


In [None]:
# Random-forest hyper-parameters; the same dict is reused when the model is
# fitted on the full data set.
parameters = dict(
    random_state = 42,
    n_estimators = 400,
    criterion = "gini",
    max_depth = None,
    min_samples_split = 2,
    min_samples_leaf = 1,
    max_features = "sqrt",
)

# NOTE(review): despite the `_NHBE` suffix this evaluates on all of filtered_data_01
# with the multi-class labels `y`. `apply_loocv` is presumably leave-one-out
# cross-validation — confirm in dataInterpreter.
stats_NHBE = dt.apply_loocv(
    filtered_data_01.values,
    np.array(y),
    RandomForestClassifier(**parameters),
)
stats_NHBE

In [None]:
# Fit one forest on every sample, then list (importance, column) pairs from the
# most to the least important feature.
clf = RandomForestClassifier(**parameters)
clf.fit(filtered_data_01, y)

sorted(zip(clf.feature_importances_, filtered_data_01.columns), reverse = True)

In [None]:
# Keep the columns whose importance exceeds 0.001, ordered by decreasing importance.
selected_genes = [
    gene
    for importance, gene in sorted(zip(clf.feature_importances_, filtered_data_01.columns), reverse = True)
    if importance > 0.001
]
selected_genes

In [None]:
# Run the enrichment helper on the selected genes and keep the entry stored under
# the 'GO_Biological_Process_2021' key of its result.
enrichment_by_library = ea.getEnrichment(selected_genes, 'GO_Biological_Process_2021')
results = enrichment_by_library['GO_Biological_Process_2021']
results

In [None]:
import json

# Cache the enrichment results on disk so they can be reloaded with `json.load`.
# The write used to be commented out, leaving a `with` block with no body
# (a SyntaxError) that would also have truncated the cache file if it had run.
with open('results_RandomForest.json', 'w') as file:
    json.dump(results, file)

In [17]:
import json

# Reload the cached enrichment results from disk.
with open('results_RandomForest.json') as file:
    results = json.loads(file.read())

In [20]:
# Tabulate the enrichment terms: position 1 of each entry becomes the row label,
# position 6 the 'p-value' column and position 4 the 'Score' column.
# NOTE(review): presumably the Enrichr row layout (term name / adjusted p-value /
# combined score) — confirm against enrichmentAnalysis.
index = [term[1] for term in results]
dataset = {
    'p-value': [term[6] for term in results],
    'Score': [term[4] for term in results],
}
enrichment_dataset = pd.DataFrame(dataset, index = index)

In [24]:
# Number of terms with p-value below 0.05.
enrichment_dataset.loc[enrichment_dataset['p-value'] < 0.05].shape[0]

215

In [None]:
pd.set_option("display.max_rows", None)

# Top 25 terms by score among those with p-value below 0.01.
selection = (
    enrichment_dataset
    .loc[enrichment_dataset['p-value'] < 0.01]
    .sort_values('Score', ascending = False)
    .head(25)
)

# Turn both numeric columns into display strings.
selection['p-value'] = ['%.2E' % p for p in selection['p-value']]
selection['Score'] = ['%.2f' % s for s in selection['Score']]

# Export is disabled; re-enable to write the table:
#selection.to_csv('RandomForest_table.csv')
selection

### Two-class comparison

##### NHBE

In [None]:
# Sample columns of the two NHBE groups.
cols_nhbe_healthy = dt.get_columns('NHBE', 'healthy')
cols_nhbe_cov2 = dt.get_columns('NHBE', 'sars-cov2')

# Binary labels: 0 for every healthy column, followed by 1 for every sars-cov2 column.
labels_nhbe = [0] * len(cols_nhbe_healthy)
labels_nhbe.extend([1] * len(cols_nhbe_cov2))

data_nhbe = dt.get_data('NHBE', 'healthy', 'sars-cov2')

# NOTE(review): presumably keeps the rows passing a Mann-Whitney U filter between
# the two groups — confirm in dataInterpreter. The helper's 'p-value' column is
# dropped so only expression values remain.
filtered_data_NHBE = dt.get_p_values('mannwhitneyu', data_nhbe, cols_nhbe_healthy, cols_nhbe_cov2)
filtered_data_NHBE.drop(columns = ['p-value'], inplace = True)

# Number of rows that survived the filter.
filtered_data_NHBE.shape[0]

In [None]:
# Random-forest hyper-parameters for the NHBE two-class model.
parameters = dict(
    random_state = 42,
    n_estimators = 200,
    criterion = "gini",
    max_depth = None,
    min_samples_split = 2,
    min_samples_leaf = 1,
    max_features = "sqrt",
)

# Samples are the columns of filtered_data_NHBE, hence the transpose.
# `apply_loocv` is presumably leave-one-out cross-validation — confirm in dataInterpreter.
stats_NHBE = dt.apply_loocv(
    filtered_data_NHBE.T.values,
    np.array(labels_nhbe),
    RandomForestClassifier(**parameters),
)
stats_NHBE

In [None]:
# Fit on all NHBE samples, then list (importance, feature) pairs from the most
# to the least important feature.
nhbe_clf = RandomForestClassifier(**parameters)
nhbe_clf.fit(filtered_data_NHBE.T, labels_nhbe)

# The index of the frame is the column set of its transpose.
sorted(zip(nhbe_clf.feature_importances_, filtered_data_NHBE.index), reverse = True)

In [None]:
# Features with strictly positive importance, ordered by decreasing importance.
selected_genes_nhbe = [
    gene
    for importance, gene in sorted(zip(nhbe_clf.feature_importances_, filtered_data_NHBE.index), reverse = True)
    if importance > 0
]
selected_genes_nhbe

In [None]:
# Run the enrichment helper on the NHBE genes and keep the entry stored under
# the 'GO_Biological_Process_2021' key of its result.
enrichment_by_library = ea.getEnrichment(selected_genes_nhbe, 'GO_Biological_Process_2021')
results_nhbe = enrichment_by_library['GO_Biological_Process_2021']
results_nhbe

In [None]:
import json

# Cache the NHBE enrichment results on disk so they can be reloaded with `json.load`.
# The write used to be commented out, leaving a `with` block with no body
# (a SyntaxError) that would also have truncated the cache file if it had run.
with open('results_RandomForest_NHBE.json', 'w') as file:
    json.dump(results_nhbe, file)

In [11]:
import json

# Reload the cached NHBE enrichment results from disk.
with open('results_RandomForest_NHBE.json') as file:
    results_nhbe = json.loads(file.read())

In [13]:
# Tabulate the NHBE enrichment terms and, for each term, count how many of its
# genes have a higher / lower mean in the sars-cov2 columns than in the healthy ones.
dataset = {'p-value': [], 'Score': [], 'Value': []}
index_nhbe = []

cols_nhbe_healthy = dt.get_columns('NHBE', 'healthy')
cols_nhbe_cov2 = dt.get_columns('NHBE', 'sars-cov2')

data_nhbe = dt.get_data('NHBE', 'healthy', 'sars-cov2')

for term in results_nhbe:
    index_nhbe.append(term[1])
    dataset['p-value'].append(term[6])
    dataset['Score'].append(term[4])

    # Position 5 holds the row labels (genes) looked up in the expression data.
    genes = term[5]
    healthy_mean = np.mean(data_nhbe.loc[genes, cols_nhbe_healthy].values, axis = 1)
    cov2_mean = np.mean(data_nhbe.loc[genes, cols_nhbe_cov2].values, axis = 1)
    avg_sub = healthy_mean - cov2_mean

    # healthy - cov2 > 0 means the gene is lower under infection ("down");
    # exact ties are counted in neither bucket.
    downs = int(np.sum(avg_sub > 0))
    ups = int(np.sum(avg_sub < 0))

    dataset['Value'].append('%d up, %d down' % (ups, downs))
enrichment_nhbe_dataset = pd.DataFrame(dataset, index = index_nhbe)

In [16]:
# Number of NHBE terms with p-value below 0.001.
enrichment_nhbe_dataset.loc[enrichment_nhbe_dataset['p-value'] < 0.001].shape[0]

3

In [None]:
pd.set_option("display.max_rows", None)

# All NHBE terms with p-value below 0.05, best score first.
# (A former `.head(25)[['p-value', 'Score']]` restriction is disabled.)
selection = (
    enrichment_nhbe_dataset
    .loc[enrichment_nhbe_dataset['p-value'] < 0.05]
    .sort_values('Score', ascending = False)
)

# Turn both numeric columns into display strings.
selection['p-value'] = ['%.2E' % p for p in selection['p-value']]
selection['Score'] = ['%.2f' % s for s in selection['Score']]

# Export is disabled; re-enable to write the table:
#selection.to_csv('NHBE_RandomForest_table.csv')
selection

##### A549

In [None]:
# Sample columns of the two A549 groups.
cols_healthy_A549 = dt.get_columns('A549', 'healthy')
cols_cov2_A549 = dt.get_columns('A549', 'sars-cov2')

# Binary labels: 0 for every healthy column, followed by 1 for every sars-cov2 column.
labels_a549 = [0] * len(cols_healthy_A549)
labels_a549.extend([1] * len(cols_cov2_A549))

data_a549 = dt.get_data('A549', 'healthy', 'sars-cov2')

# NOTE(review): presumably keeps rows whose Mann-Whitney U p-value is below
# `limit` — confirm in dataInterpreter. The helper's 'p-value' column is dropped
# so only expression values remain.
filtered_data_a549 = dt.get_p_values('mannwhitneyu', data_a549, cols_healthy_A549, cols_cov2_A549, limit = 0.001)
filtered_data_a549.drop(columns = ['p-value'], inplace = True)

filtered_data_a549

In [None]:
# Number of rows that survived the A549 filter.
filtered_data_a549.shape[0]

In [None]:
# Random-forest hyper-parameters for the A549 two-class model.
parameters = dict(
    random_state = 42,
    n_estimators = 100,
    criterion = "gini",
    max_depth = None,
    min_samples_split = 2,
    min_samples_leaf = 1,
    max_features = "sqrt",
)

# Samples are the columns of filtered_data_a549, hence the transpose.
# `apply_loocv` is presumably leave-one-out cross-validation — confirm in dataInterpreter.
results_a549 = dt.apply_loocv(
    filtered_data_a549.T.values,
    np.array(labels_a549),
    RandomForestClassifier(**parameters),
)
results_a549

In [None]:
# Fit on all A549 samples, then list (importance, feature) pairs from the most
# to the least important feature.
clf_a549 = RandomForestClassifier(**parameters)
clf_a549.fit(filtered_data_a549.T.values, np.array(labels_a549))

# The index of the frame is the column set of its transpose.
sorted(zip(clf_a549.feature_importances_, filtered_data_a549.index), reverse = True)

In [None]:
# Features with strictly positive importance, ordered by decreasing importance.
selected_genes_a549 = [
    gene
    for importance, gene in sorted(zip(clf_a549.feature_importances_, filtered_data_a549.index), reverse = True)
    if importance > 0
]
selected_genes_a549

In [None]:
# Run the enrichment helper on the A549 genes and keep the entry stored under
# the 'GO_Biological_Process_2021' key of its result.
enrichment_by_library = ea.getEnrichment(selected_genes_a549, 'GO_Biological_Process_2021')
enrichment_a549 = enrichment_by_library['GO_Biological_Process_2021']
enrichment_a549

In [None]:
import json

# Cache the A549 enrichment results on disk; reload them with `json.load`.
with open('results_RandomForest_A549.json', 'w') as file:
    json.dump(enrichment_a549, file)

In [None]:
import json

# Reload the cached A549 enrichment results from disk.
with open('results_RandomForest_A549.json') as file:
    enrichment_a549 = json.loads(file.read())

In [None]:
# Tabulate the A549 enrichment terms: position 1 of each entry becomes the row
# label, position 6 the 'p-value' column and position 4 the 'Score' column.
# No up/down 'Value' column is produced for A549, so the per-term tally that used
# to be computed here and then discarded has been removed. It cost a `.loc`
# lookup per term and could raise KeyError for genes absent from the data.
index_a549 = [term[1] for term in enrichment_a549]
dataset = {
    'p-value': [term[6] for term in enrichment_a549],
    'Score': [term[4] for term in enrichment_a549],
}

# Not needed for the table itself; kept so the names this cell has always defined
# remain available to any later cell.
cols_healthy_A549 = dt.get_columns('A549', 'healthy')
cols_cov2_A549 = dt.get_columns('A549', 'sars-cov2')

data_a549 = dt.get_data('A549', 'healthy', 'sars-cov2')

enrichment_a549_dataset = pd.DataFrame(dataset, index = index_a549)

In [None]:
# Number of A549 terms with p-value below 0.05.
enrichment_a549_dataset.loc[enrichment_a549_dataset['p-value'] < 0.05].shape[0]

In [None]:
pd.set_option("display.max_rows", None)
selection = enrichment_a549_dataset[enrichment_a549_dataset['p-value'] < 0.05].sort_values('Score', ascending = False)

selection['p-value'] = selection['p-value'].map(lambda x: '%.2E' % x)
selection['Score'] = selection['Score'].map(lambda x: '%.2f' % x)

selection.to_csv('A549_RandomForest_table.csv')
selection

##### Calu3

In [None]:
# Sample columns of the two Calu3 groups.
cols_healthy_Calu3 = dt.get_columns('Calu3', 'healthy')
cols_cov2_Calu3 = dt.get_columns('Calu3', 'sars-cov2')

# Binary labels: 0 for every healthy column, followed by 1 for every sars-cov2 column.
labels_calu3 = [0] * len(cols_healthy_Calu3)
labels_calu3.extend([1] * len(cols_cov2_Calu3))

data_calu3 = dt.get_data('Calu3', 'healthy', 'sars-cov2')

# NOTE(review): presumably keeps rows whose Mann-Whitney U p-value is below
# `limit` — confirm in dataInterpreter. The helper's 'p-value' column is dropped
# so only expression values remain.
filtered_data_calu3 = dt.get_p_values('mannwhitneyu', data_calu3, cols_healthy_Calu3, cols_cov2_Calu3, limit = 0.05)
filtered_data_calu3.drop(columns = ['p-value'], inplace = True)

# Number of rows that survived the filter.
filtered_data_calu3.shape[0]

In [None]:
# Random-forest hyper-parameters for the Calu3 two-class model. Unlike the other
# models in this notebook it uses entropy, log2 features and larger split/leaf minimums.
parameters = dict(
    random_state = 42,
    n_estimators = 100,
    criterion = "entropy",
    max_depth = None,
    min_samples_split = 3,
    min_samples_leaf = 2,
    max_features = "log2",
)

# Samples are the columns of filtered_data_calu3, hence the transpose.
# `apply_loocv` is presumably leave-one-out cross-validation — confirm in dataInterpreter.
stats_calu3 = dt.apply_loocv(
    filtered_data_calu3.T.values,
    np.array(labels_calu3),
    RandomForestClassifier(**parameters),
)
stats_calu3

In [None]:
# Fit on all Calu3 samples, then list (importance, feature) pairs from the most
# to the least important feature.
clf_calu3 = RandomForestClassifier(**parameters)
clf_calu3.fit(filtered_data_calu3.T.values, np.array(labels_calu3))

# The index of the frame is the column set of its transpose.
sorted(zip(clf_calu3.feature_importances_, filtered_data_calu3.index), reverse = True)

In [None]:
# Features with strictly positive importance, ordered by decreasing importance.
selected_genes_calu3 = [
    gene
    for importance, gene in sorted(zip(clf_calu3.feature_importances_, filtered_data_calu3.index), reverse = True)
    if importance > 0
]
selected_genes_calu3

In [None]:
# Run the enrichment helper on the Calu3 genes and keep the entry stored under
# the 'GO_Biological_Process_2021' key of its result.
enrichment_by_library = ea.getEnrichment(selected_genes_calu3, 'GO_Biological_Process_2021')
results_calu3 = enrichment_by_library['GO_Biological_Process_2021']
results_calu3

In [None]:
import json

# Cache the Calu3 enrichment results on disk so they can be reloaded with `json.load`.
# The write used to be commented out, leaving a `with` block with no body
# (a SyntaxError) that would also have truncated the cache file if it had run.
with open('results_RandomForest_Calu3.json', 'w') as file:
    json.dump(results_calu3, file)

In [None]:
import json

# Reload the cached Calu3 enrichment results from disk.
with open('results_RandomForest_Calu3.json') as file:
    results_calu3 = json.loads(file.read())

In [None]:
# Tabulate the Calu3 enrichment terms and, for each term, count how many of its
# genes have a higher / lower mean in the sars-cov2 columns than in the healthy ones.
dataset = {'p-value': [], 'Score': [], 'Value': []}
index_calu3 = []

cols_healthy_Calu3 = dt.get_columns('Calu3', 'healthy')
cols_cov2_Calu3 = dt.get_columns('Calu3', 'sars-cov2')

data_calu3 = dt.get_data('Calu3', 'healthy', 'sars-cov2')

for term in results_calu3:
    index_calu3.append(term[1])
    dataset['p-value'].append(term[6])
    dataset['Score'].append(term[4])

    # Position 5 holds the row labels (genes) looked up in the expression data.
    genes = term[5]
    healthy_mean = np.mean(data_calu3.loc[genes, cols_healthy_Calu3].values, axis = 1)
    cov2_mean = np.mean(data_calu3.loc[genes, cols_cov2_Calu3].values, axis = 1)
    avg_sub = healthy_mean - cov2_mean

    # healthy - cov2 > 0 means the gene is lower under infection ("down");
    # exact ties are counted in neither bucket.
    downs = int(np.sum(avg_sub > 0))
    ups = int(np.sum(avg_sub < 0))

    dataset['Value'].append('%d up, %d down' % (ups, downs))

enrichment_calu3_dataset = pd.DataFrame(dataset, index = index_calu3)

In [None]:
pd.set_option("display.max_rows", None)

# Top 25 Calu3 terms by score among those with p-value below 0.05, keeping only
# the two numeric columns (the 'Value' column is left out of the exported table).
selection = (
    enrichment_calu3_dataset
    .loc[enrichment_calu3_dataset['p-value'] < 0.05]
    .sort_values('Score', ascending = False)
    .head(25)
    [['p-value', 'Score']]
)

# Turn both numeric columns into display strings before exporting.
selection['p-value'] = ['%.2E' % p for p in selection['p-value']]
selection['Score'] = ['%.2f' % s for s in selection['Score']]

selection.to_csv('Calu3_RandomForest_table.csv')
selection