In [1]:
import os
import pickle

# Project root. Defaults to the original author's checkout; set the
# FINETUNEEMBED_DIR environment variable to run this notebook from another
# location without editing the cell.
PROJECT_DIR = os.environ.get("FINETUNEEMBED_DIR", "/Users/david/Desktop/FinetuneEmbed")

# Every relative "./data/..." path in this notebook is resolved against this
# working directory, so the chdir must run before any file is opened.
# NOTE(review): the `mod` import below presumably also relies on it -- confirm.
os.chdir(PROJECT_DIR)

# NOTE(review): star import; the later cells use load_data, LogisticReg_cv,
# RandomForest_cv and roc_auc_score, which presumably all come from here --
# confirm and switch to explicit imports.
from mod.utils import *

# SECURITY: pickle.load can execute arbitrary code while deserialising; only
# load embedding files obtained from a trusted source.
with open("./data/embeddings/GPT_3_5_gene_embeddings_fromGenePT.pickle", "rb") as fp:
    GPT_3_5_gene_embeddings = pickle.load(fp)

## Long- vs. short-range TFs
The input data used here were downloaded from Chen et al. (2020) (link: https://www.nature.com/articles/s41467-020-16106-x).

In [2]:
# Load the pickled train/test splits for the long- vs short-range TF task.
# NOTE: pickle can run arbitrary code on load -- only use trusted files.
with open("./data/long_vs_shortTF/train_data.pkl", "rb") as train_fh:
    train_data = pickle.load(train_fh)
with open("./data/long_vs_shortTF/test_data.pkl", "rb") as test_fh:
    test_data = pickle.load(test_fh)

# Build the model inputs from the two splits and the gene-embedding table.
X_train, y_train, X_test, y_test = load_data(train_data, test_data, GPT_3_5_gene_embeddings)

# Run the same select -> refit -> score procedure for each classifier family.
for banner, run_cv in (
    ("Logistic regression results----------------", LogisticReg_cv),
    ("Random forest results----------------", RandomForest_cv),
):
    print(banner)
    # 5-fold cross-validation on the training split picks the model to keep.
    best_model = run_cv(X_train, y_train, folds=5)
    # Refit the chosen model on the complete training split.
    best_model.fit(X_train, y_train)
    # Score the held-out test split using column 1 of predict_proba.
    y_test_pred_proba = best_model.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, y_test_pred_proba)
    print(f"Test AUC using the best model from CV: {test_auc:.4f}")


Logistic regression results----------------
Validation AUC: Mean = 0.6138, Standard Deviation = 0.0594
Test AUC using the best model from CV: 0.5692
Random forest results----------------
Validation AUC: Mean = 0.6089, Standard Deviation = 0.1348
Test AUC using the best model from CV: 0.5923


## Dosage-sensitive vs. insensitive TFs

In [3]:
# Load the pickled train/test splits for the dosage-sensitivity task.
# NOTE: pickle can run arbitrary code on load -- only use trusted files.
with open("./data/DosageSensitivity/train_data.pkl", "rb") as train_fh:
    train_data = pickle.load(train_fh)
with open("./data/DosageSensitivity/test_data.pkl", "rb") as test_fh:
    test_data = pickle.load(test_fh)

# Build the model inputs from the two splits and the gene-embedding table.
X_train, y_train, X_test, y_test = load_data(train_data, test_data, GPT_3_5_gene_embeddings)

# Run the same select -> refit -> score procedure for each classifier family.
for banner, run_cv in (
    ("Logistic regression results----------------", LogisticReg_cv),
    ("Random forest results----------------", RandomForest_cv),
):
    print(banner)
    # 5-fold cross-validation on the training split picks the model to keep.
    best_model = run_cv(X_train, y_train, folds=5)
    # Refit the chosen model on the complete training split.
    best_model.fit(X_train, y_train)
    # Score the held-out test split using column 1 of predict_proba.
    y_test_pred_proba = best_model.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, y_test_pred_proba)
    print(f"Test AUC using the best model from CV: {test_auc:.4f}")

Logistic regression results----------------
Validation AUC: Mean = 0.8897, Standard Deviation = 0.0130
Test AUC using the best model from CV: 0.9491
Random forest results----------------
Validation AUC: Mean = 0.9166, Standard Deviation = 0.0291
Test AUC using the best model from CV: 0.9491


## Bivalent vs. lys4

In [4]:
# Load the pickled train/test splits for the bivalent vs. lys4 task.
# NOTE: pickle can run arbitrary code on load -- only use trusted files.
with open("./data/MethylationState/bivalent_vs_lys4/train_data.pkl", "rb") as train_fh:
    train_data = pickle.load(train_fh)
with open("./data/MethylationState/bivalent_vs_lys4/test_data.pkl", "rb") as test_fh:
    test_data = pickle.load(test_fh)

# Build the model inputs from the two splits and the gene-embedding table.
X_train, y_train, X_test, y_test = load_data(train_data, test_data, GPT_3_5_gene_embeddings)

# Run the same select -> refit -> score procedure for each classifier family.
for banner, run_cv in (
    ("Logistic regression results----------------", LogisticReg_cv),
    ("Random forest results----------------", RandomForest_cv),
):
    print(banner)
    # 5-fold cross-validation on the training split picks the model to keep.
    best_model = run_cv(X_train, y_train, folds=5)
    # Refit the chosen model on the complete training split.
    best_model.fit(X_train, y_train)
    # Score the held-out test split using column 1 of predict_proba.
    y_test_pred_proba = best_model.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, y_test_pred_proba)
    print(f"Test AUC using the best model from CV: {test_auc:.4f}")

Logistic regression results----------------
Validation AUC: Mean = 0.9538, Standard Deviation = 0.0211
Test AUC using the best model from CV: 0.8750
Random forest results----------------
Validation AUC: Mean = 0.9589, Standard Deviation = 0.0149
Test AUC using the best model from CV: 0.9313


## Bivalent vs. no methyl

In [5]:
# Load the pickled train/test splits for the bivalent vs. no-methyl task.
# NOTE: pickle can run arbitrary code on load -- only use trusted files.
with open("./data/MethylationState/bivalent_vs_no_methyl/train_data.pkl", "rb") as train_fh:
    train_data = pickle.load(train_fh)
with open("./data/MethylationState/bivalent_vs_no_methyl/test_data.pkl", "rb") as test_fh:
    test_data = pickle.load(test_fh)

# Build the model inputs from the two splits and the gene-embedding table.
X_train, y_train, X_test, y_test = load_data(train_data, test_data, GPT_3_5_gene_embeddings)

# Run the same select -> refit -> score procedure for each classifier family.
for banner, run_cv in (
    ("Logistic regression results----------------", LogisticReg_cv),
    ("Random forest results----------------", RandomForest_cv),
):
    print(banner)
    # 5-fold cross-validation on the training split picks the model to keep.
    best_model = run_cv(X_train, y_train, folds=5)
    # Refit the chosen model on the complete training split.
    best_model.fit(X_train, y_train)
    # Score the held-out test split using column 1 of predict_proba.
    y_test_pred_proba = best_model.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, y_test_pred_proba)
    print(f"Test AUC using the best model from CV: {test_auc:.4f}")

Logistic regression results----------------
Validation AUC: Mean = 0.9149, Standard Deviation = 0.0328
Test AUC using the best model from CV: 0.7000
Random forest results----------------
Validation AUC: Mean = 0.9221, Standard Deviation = 0.0283
Test AUC using the best model from CV: 0.6875
