In [1]:
import pandas as pd
import seaborn as sns
sns.set_style("whitegrid")
import pandas as pd
import numpy as np 
import scipy.stats as stats
from collections import Counter
import matplotlib.pyplot as plt
import umap
import matplotlib
import mygene
%matplotlib inline
import pickle
import sklearn
import random
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from xgboost import XGBClassifier
import os
# Specify the working directory; the data paths used in this notebook are
# relative to it ("./data/...").
# The location can be overridden with the FINETUNE_EMBED_DIR environment
# variable so the notebook is not tied to one machine's home directory; when
# the variable is unset the original path is used unchanged.
os.chdir(os.environ.get('FINETUNE_EMBED_DIR', '/Users/david/Desktop/FinetuneEmbed'))

import matplotlib_inline

# Figure styling: start from the ggplot theme, then apply the individual
# rcParams overrides in a single update.
plt.style.use('ggplot')
rc_overrides = {
    'axes.facecolor': 'white',   # white plot background
    "text.usetex": True,         # route figure text through LaTeX
    "font.family": "Helvetica",
}
plt.rcParams.update(rc_overrides)

# Ask the inline backend for 'retina' figure output.
matplotlib_inline.backend_inline.set_matplotlib_formats('retina')

In [2]:
# Load the pickled GenePT GPT-3.5 gene embeddings (later cells index this
# object by gene name).
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# load this pickle from a trusted source.
embeddings_path = "./data/embeddings/GPT_3_5_gene_embeddings_fromGenePT.pickle"
with open(embeddings_path, "rb") as embeddings_fh:
    GPT_3_5_gene_embeddings = pickle.load(embeddings_fh)

## Long- vs short- range TFs
The input data used here were downloaded from Chen et al. (2020) (link: https://www.nature.com/articles/s41467-020-16106-x).

In [3]:
# Load the pickled train and test splits for the long- vs short-range TF task.
train_pkl = "./data/long_vs_shortTF/train_data.pkl"
test_pkl = "./data/long_vs_shortTF/test_data.pkl"

with open(train_pkl, "rb") as train_fh:
    train_data = pickle.load(train_fh)

with open(test_pkl, "rb") as test_fh:
    test_data = pickle.load(test_fh)

In [4]:
# Pull the gene names and their labels out of each loaded split.
train_genes, train_labels = train_data['genes'], train_data['labels']
test_genes, test_labels = test_data['genes'], test_data['labels']

In [5]:
# Keep only the genes that have an embedding.
# The gene lists are filtered in their original order instead of going through
# a set intersection: set iteration order for strings changes between kernel
# sessions (hash randomisation), which made the sample order irreproducible.
# dict.fromkeys drops repeated gene names while keeping first-seen order, so
# the selected genes are the same as with the set-based version.
overlap_train_gene = list(dict.fromkeys(
    gene for gene in train_genes if gene in GPT_3_5_gene_embeddings))
overlap_test_gene = list(dict.fromkeys(
    gene for gene in test_genes if gene in GPT_3_5_gene_embeddings))

# Position of the first occurrence of every gene, built in one pass. This is
# the index list.index() would return, without rescanning the list per gene.
train_position = {}
for pos, gene in enumerate(train_genes):
    train_position.setdefault(gene, pos)
test_position = {}
for pos, gene in enumerate(test_genes):
    test_position.setdefault(gene, pos)

train_indices = [train_position[gene] for gene in overlap_train_gene]
overlap_train_labels = [train_labels[i] for i in train_indices]

test_indices = [test_position[gene] for gene in overlap_test_gene]
overlap_test_labels = [test_labels[i] for i in test_indices]

# Build the feature matrices. Every overlapping gene is in the embedding
# mapping by construction, so no extra membership filter is applied here; such
# a filter could only drop rows from X without dropping the matching label.
X_train = np.array([GPT_3_5_gene_embeddings[gene] for gene in overlap_train_gene])
y_train = overlap_train_labels
X_test = np.array([GPT_3_5_gene_embeddings[gene] for gene in overlap_test_gene])
y_test = overlap_test_labels

# Features and labels must stay row-aligned.
assert len(X_train) == len(y_train), "train features and labels are misaligned"
assert len(X_test) == len(y_test), "test features and labels are misaligned"

In [6]:
# Sanity check: number of samples in each feature matrix and label list.
tuple(map(len, (X_train, y_train, X_test, y_test)))

(121, 121, 52, 52)

In [11]:
# Fit two baseline classifiers on the embedding features and report the
# test-set ROC AUC of each. Column 1 of predict_proba is used as the score,
# i.e. the probability of the second class in classes_ (assumes binary labels
# with that class as the positive one; TODO confirm against the label coding).

# Logistic Regression
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
y_score_logistic = logistic_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score_logistic)
roc_auc = auc(fpr, tpr)
# Print ROC AUC scores
print(f"Logistic Regression ROC AUC: {roc_auc:.4f}")

# Random Forest
# random_state fixes the forest's internal randomness; without it the reported
# AUC changes from run to run and cannot be reproduced.
random_forest_model = RandomForestClassifier(random_state=0)
random_forest_model.fit(X_train, y_train)
y_score_rf = random_forest_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score_rf)
roc_auc = auc(fpr, tpr)
print(f"Random Forest ROC AUC: {roc_auc:.4f}")

Logistic Regression ROC AUC: 0.5639
Random Forest ROC AUC: 0.5714


## Dosage sensitive vs insensitive TFs

In [35]:
# Load the pickled train and test splits for the dosage-sensitivity task.
train_pkl = "./data/DosageSensitivity/train_data.pkl"
test_pkl = "./data/DosageSensitivity/test_data.pkl"

with open(train_pkl, "rb") as train_fh:
    train_data = pickle.load(train_fh)

with open(test_pkl, "rb") as test_fh:
    test_data = pickle.load(test_fh)

In [36]:
# Pull the gene names and their labels out of each loaded split.
train_genes, train_labels = train_data['genes'], train_data['labels']
test_genes, test_labels = test_data['genes'], test_data['labels']

In [37]:
# Find the genes that have an embedding.
# The gene lists are filtered in their original order instead of going through
# a set intersection: set iteration order for strings changes between kernel
# sessions (hash randomisation), which made the sample order irreproducible.
# dict.fromkeys drops repeated gene names while keeping first-seen order, so
# the selected genes are the same as with the set-based version.
overlap_train_gene = list(dict.fromkeys(
    gene for gene in train_genes if gene in GPT_3_5_gene_embeddings))
overlap_test_gene = list(dict.fromkeys(
    gene for gene in test_genes if gene in GPT_3_5_gene_embeddings))

In [38]:
# Position of the first occurrence of every gene, built in one pass. This is
# the index list.index() would return, without rescanning the whole gene list
# once per overlapping gene.
train_position = {}
for pos, gene in enumerate(train_genes):
    train_position.setdefault(gene, pos)
test_position = {}
for pos, gene in enumerate(test_genes):
    test_position.setdefault(gene, pos)

# Labels of the overlapping genes, in the same order as overlap_*_gene.
train_indices = [train_position[gene] for gene in overlap_train_gene]
overlap_train_labels = [train_labels[i] for i in train_indices]

test_indices = [test_position[gene] for gene in overlap_test_gene]
overlap_test_labels = [test_labels[i] for i in test_indices]

In [39]:
# Stack the embedding of every overlapping gene into a feature matrix.
# The genes are indexed directly, without the former `if x in ...` filter: the
# overlap lists are expected to contain only genes that have an embedding, and
# a filter here could only drop rows from X without dropping the matching
# label. A missing gene now raises KeyError instead of silently shifting rows.
X_train = np.array([GPT_3_5_gene_embeddings[gene] for gene in overlap_train_gene])
y_train = overlap_train_labels
X_test = np.array([GPT_3_5_gene_embeddings[gene] for gene in overlap_test_gene])
y_test = overlap_test_labels

# Features and labels must stay row-aligned.
assert len(X_train) == len(y_train), "train features and labels are misaligned"
assert len(X_test) == len(y_test), "test features and labels are misaligned"

In [40]:
# Fit two baseline classifiers on the embedding features and report the
# test-set ROC AUC of each. Column 1 of predict_proba is used as the score,
# i.e. the probability of the second class in classes_ (assumes binary labels
# with that class as the positive one; TODO confirm against the label coding).

# Logistic Regression
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
y_score_logistic = logistic_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score_logistic)
roc_auc = auc(fpr, tpr)
# Print ROC AUC scores
print(f"Logistic Regression ROC AUC: {roc_auc:.4f}")

# Random Forest
# random_state fixes the forest's internal randomness; without it the reported
# AUC changes from run to run and cannot be reproduced.
random_forest_model = RandomForestClassifier(random_state=0)
random_forest_model.fit(X_train, y_train)
y_score_rf = random_forest_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score_rf)
roc_auc = auc(fpr, tpr)
print(f"Random Forest ROC AUC: {roc_auc:.4f}")

Logistic Regression ROC AUC: 0.8906
Random Forest ROC AUC: 0.9175


## Bivalent vs. lys4

In [41]:
# Load the pickled train and test splits for the bivalent vs. lys4 task.
train_pkl = "./data/MethylationState/bivalent_vs_lys4/train_data.pkl"
test_pkl = "./data/MethylationState/bivalent_vs_lys4/test_data.pkl"

with open(train_pkl, "rb") as train_fh:
    train_data = pickle.load(train_fh)

with open(test_pkl, "rb") as test_fh:
    test_data = pickle.load(test_fh)

# Pull the gene names and their labels out of each loaded split.
train_genes, train_labels = train_data['genes'], train_data['labels']
test_genes, test_labels = test_data['genes'], test_data['labels']

# Keep only the genes that have an embedding.
# The gene lists are filtered in their original order instead of going through
# a set intersection: set iteration order for strings changes between kernel
# sessions (hash randomisation), which made the sample order irreproducible.
# dict.fromkeys drops repeated gene names while keeping first-seen order, so
# the selected genes are the same as with the set-based version.
overlap_train_gene = list(dict.fromkeys(
    gene for gene in train_genes if gene in GPT_3_5_gene_embeddings))
overlap_test_gene = list(dict.fromkeys(
    gene for gene in test_genes if gene in GPT_3_5_gene_embeddings))

# Position of the first occurrence of every gene, built in one pass. This is
# the index list.index() would return, without rescanning the list per gene.
train_position = {}
for pos, gene in enumerate(train_genes):
    train_position.setdefault(gene, pos)
test_position = {}
for pos, gene in enumerate(test_genes):
    test_position.setdefault(gene, pos)

train_indices = [train_position[gene] for gene in overlap_train_gene]
overlap_train_labels = [train_labels[i] for i in train_indices]

test_indices = [test_position[gene] for gene in overlap_test_gene]
overlap_test_labels = [test_labels[i] for i in test_indices]

# Build the feature matrices. Every overlapping gene is in the embedding
# mapping by construction, so no extra membership filter is applied here; such
# a filter could only drop rows from X without dropping the matching label.
X_train = np.array([GPT_3_5_gene_embeddings[gene] for gene in overlap_train_gene])
y_train = overlap_train_labels
X_test = np.array([GPT_3_5_gene_embeddings[gene] for gene in overlap_test_gene])
y_test = overlap_test_labels

# Features and labels must stay row-aligned.
assert len(X_train) == len(y_train), "train features and labels are misaligned"
assert len(X_test) == len(y_test), "test features and labels are misaligned"

# Fit two baseline classifiers on the embedding features and report the
# test-set ROC AUC of each. Column 1 of predict_proba is used as the score,
# i.e. the probability of the second class in classes_ (assumes binary labels
# with that class as the positive one; TODO confirm against the label coding).

# Logistic Regression
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
y_score_logistic = logistic_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score_logistic)
roc_auc = auc(fpr, tpr)
# Print ROC AUC scores
print(f"Logistic Regression ROC AUC: {roc_auc:.4f}")

# Random Forest
# random_state fixes the forest's internal randomness; without it the reported
# AUC changes from run to run and cannot be reproduced.
random_forest_model = RandomForestClassifier(random_state=0)
random_forest_model.fit(X_train, y_train)
y_score_rf = random_forest_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score_rf)
roc_auc = auc(fpr, tpr)
print(f"Random Forest ROC AUC: {roc_auc:.4f}")

Logistic Regression ROC AUC: 0.9901
Random Forest ROC AUC: 0.9967


## Bivalent vs. no methyl

In [42]:
# Load the pickled train and test splits for the bivalent vs. no-methyl task.
train_pkl = "./data/MethylationState/bivalent_vs_no_methyl/train_data.pkl"
test_pkl = "./data/MethylationState/bivalent_vs_no_methyl/test_data.pkl"

with open(train_pkl, "rb") as train_fh:
    train_data = pickle.load(train_fh)

with open(test_pkl, "rb") as test_fh:
    test_data = pickle.load(test_fh)

# Pull the gene names and their labels out of each loaded split.
train_genes, train_labels = train_data['genes'], train_data['labels']
test_genes, test_labels = test_data['genes'], test_data['labels']

# Keep only the genes that have an embedding.
# The gene lists are filtered in their original order instead of going through
# a set intersection: set iteration order for strings changes between kernel
# sessions (hash randomisation), which made the sample order irreproducible.
# dict.fromkeys drops repeated gene names while keeping first-seen order, so
# the selected genes are the same as with the set-based version.
overlap_train_gene = list(dict.fromkeys(
    gene for gene in train_genes if gene in GPT_3_5_gene_embeddings))
overlap_test_gene = list(dict.fromkeys(
    gene for gene in test_genes if gene in GPT_3_5_gene_embeddings))

# Position of the first occurrence of every gene, built in one pass. This is
# the index list.index() would return, without rescanning the list per gene.
train_position = {}
for pos, gene in enumerate(train_genes):
    train_position.setdefault(gene, pos)
test_position = {}
for pos, gene in enumerate(test_genes):
    test_position.setdefault(gene, pos)

train_indices = [train_position[gene] for gene in overlap_train_gene]
overlap_train_labels = [train_labels[i] for i in train_indices]

test_indices = [test_position[gene] for gene in overlap_test_gene]
overlap_test_labels = [test_labels[i] for i in test_indices]

# Build the feature matrices. Every overlapping gene is in the embedding
# mapping by construction, so no extra membership filter is applied here; such
# a filter could only drop rows from X without dropping the matching label.
X_train = np.array([GPT_3_5_gene_embeddings[gene] for gene in overlap_train_gene])
y_train = overlap_train_labels
X_test = np.array([GPT_3_5_gene_embeddings[gene] for gene in overlap_test_gene])
y_test = overlap_test_labels

# Features and labels must stay row-aligned.
assert len(X_train) == len(y_train), "train features and labels are misaligned"
assert len(X_test) == len(y_test), "test features and labels are misaligned"

# Fit two baseline classifiers on the embedding features and report the
# test-set ROC AUC of each. Column 1 of predict_proba is used as the score,
# i.e. the probability of the second class in classes_ (assumes binary labels
# with that class as the positive one; TODO confirm against the label coding).

# Logistic Regression
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
y_score_logistic = logistic_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score_logistic)
roc_auc = auc(fpr, tpr)
# Print ROC AUC scores
print(f"Logistic Regression ROC AUC: {roc_auc:.4f}")

# Random Forest
# random_state fixes the forest's internal randomness; without it the reported
# AUC changes from run to run and cannot be reproduced.
random_forest_model = RandomForestClassifier(random_state=0)
random_forest_model.fit(X_train, y_train)
y_score_rf = random_forest_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score_rf)
roc_auc = auc(fpr, tpr)
print(f"Random Forest ROC AUC: {roc_auc:.4f}")

Logistic Regression ROC AUC: 0.8684
Random Forest ROC AUC: 0.8224
