In [8]:
from Data import CombinedData
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import pandas as pd 
from sklearn.preprocessing import StandardScaler


In [9]:
exp = pd.read_table("../EXP/expression.tsv", index_col=0)

In [10]:
g2v = pd.read_table("../gene2vec_data/dataset/g2v_1000emb.tsv", index_col=0)

In [11]:
import numpy as np

g2v_matrix = np.empty_like(exp)[:, :1000]

In [12]:
conv = {gene:i for i, gene in enumerate(exp.index)}

In [13]:
conv["AT1G01010"]

0

In [14]:
g2v.index = g2v.index.map(lambda x: conv[x])

In [22]:
for i, row in g2v.iterrows():
    g2v_matrix[i,:] = row.values

In [23]:
g2v_matrix.shape

(37336, 1000)

In [24]:
g2v_tensor = torch.tensor(g2v_matrix)

In [26]:
torch.save(g2v_tensor, "../gene2vec_data/dataset/g2v_1000emb.pt")

In [27]:
g2v_tensor = torch.load("../gene2vec_data/dataset/g2v_1000emb.pt", weights_only=False)

# Model

In [2]:
import torch 
import torch.nn as nn 

class LinearModel(nn.Module):
    def __init__(self, embed_size, num_go, n_classes, combine_size):
        super().__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else
                                   "cpu")
        self.combined_size = combine_size
        self.exp_encoder = nn.Sequential(
            nn.Linear(1343, 1280),
            nn.ReLU(),
            nn.Linear(1280, 1024),
            nn.ReLU(),
            nn.Linear(1024, embed_size),
        )
        self.go_encoder = nn.Sequential(
            nn.Linear(num_go, 100),
        )
        self.n_class = 1 if n_classes <= 2 else n_classes
        self.out = nn.Sequential(
            nn.Linear(self.combined_size, 1),
        )
        self.flatten = nn.Flatten()
        self.open = lambda x: x
        self.activation = self.open if self.n_class > 2 else nn.Sigmoid()

class GO(LinearModel):

    def forward(self, gp, g2v, go, exp):
        go = go.to(self.device).float()
        z_go = self.go_encoder(go)
        flatten = self.flatten(z_go)
        logit = self.out(flatten)
        prob = self.activation(logit)
        return prob


class EXP(LinearModel):

    def forward(self, gp, g2v, go, exp):
        exp = exp.to(self.device).float()
        z_exp = self.exp_encoder(exp)
        flatten = self.flatten(z_exp)
        logit = self.out(flatten)
        prob = self.activation(logit)
        return prob


class G2V(LinearModel):
    def forward(self, gp, g2v, go, exp):
        g2v = g2v.to(self.device).float()
        flatten = self.flatten(g2v)
        logit = self.out(flatten)
        prob = self.activation(logit)
        return prob


# Data

In [8]:
import os 

os.chdir("/home/llan/Desktop/WUR/thesis2")


In [9]:
LABEL_FOLDER = "shared_data/binary_labels/TF_split"
BATCHSIZE = 32


In [463]:

g2v_data = "/home/llan/Desktop/WUR/thesis2/gene2vec_data/dataset/g2v_embeddings.tsv"
go_data = "/home/llan/Desktop/WUR/thesis2/GO/GO_data.txt"
go_data = "GO/GO_RANDOM.txt"
exp_data_df = pd.read_csv("EXP/expression.tsv", sep="\t", header=0, index_col=0)
exp_data = StandardScaler().fit_transform(exp_data_df.T).T

train_data_path = LABEL_FOLDER + "/Train_set.tsv"
val_data_path = LABEL_FOLDER + "/Val_set.tsv"
test_data_path = LABEL_FOLDER + "/Test_set.tsv"

train_data = CombinedData(train_data_path, g2v_data, go_data, exp_data)
val_data = CombinedData(val_data_path,  g2v_data, go_data, exp_data)
test_data = CombinedData(test_data_path,  g2v_data, go_data, exp_data)

train_loader = DataLoader(train_data, batch_size=BATCHSIZE, shuffle=True)
val_loader = DataLoader(val_data, batch_size=BATCHSIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCHSIZE, shuffle=True)


# training 

In [4]:
model = GO(100, train_data.n_go, 2, 200).to("cuda")


NameError: name 'GO' is not defined

In [6]:
from Train import train
from torch.optim import Adam

optim = Adam(params=model.parameters(), lr=0.001)
losses = []
for epoch in range(10):
    model, loss = train(model, dataloader=train_loader, loss_func=nn.BCELoss(), optimizer=optim, epoch=epoch )
    

| epoch   0 |    1 /  51 batches |Train loss    0.700 | AUROC    0.360 | AUPRC    0.524
| epoch   0 |    2 /  51 batches |Train loss    0.707 | AUROC    0.450 | AUPRC    0.351
| epoch   0 |    3 /  51 batches |Train loss    0.709 | AUROC    0.610 | AUPRC    0.467
| epoch   0 |    4 /  51 batches |Train loss    0.679 | AUROC    0.592 | AUPRC    0.620
| epoch   0 |    5 /  51 batches |Train loss    0.729 | AUROC    0.552 | AUPRC    0.329
| epoch   0 |    6 /  51 batches |Train loss    0.719 | AUROC    0.490 | AUPRC    0.652
| epoch   0 |    7 /  51 batches |Train loss    0.682 | AUROC    0.617 | AUPRC    0.645
| epoch   0 |    8 /  51 batches |Train loss    0.699 | AUROC    0.534 | AUPRC    0.701
| epoch   0 |    9 /  51 batches |Train loss    0.716 | AUROC    0.478 | AUPRC    0.398
| epoch   0 |   10 /  51 batches |Train loss    0.779 | AUROC    0.312 | AUPRC    0.379
| epoch   0 |   11 /  51 batches |Train loss    0.628 | AUROC    0.721 | AUPRC    0.816
| epoch   0 |   12 /  51 batches

In [9]:
import sys
sys.path.append("Code")


In [10]:
from Test import test


In [11]:
lab, pred = test(model, test_loader, "/home/llan/Desktop/WUR/thesis2/media/TEST/")


In [12]:
import numpy as np 
_, (f, t) = np.unique((lab == (pred >= 0.5).int()).cpu().numpy().flatten(), return_counts=True)


In [13]:
t/ (f+t)


0.7935483870967742

In [14]:
train_data.dataset


array([[ 3192,    52],
       [ 3192,   501],
       [ 3192,  2419],
       ...,
       [30692,  3173],
       [30692, 21973],
       [30692, 36413]])

In [15]:
def row_to_string(data):
    return_list = []
    for row in data:
        string = str(row[0]) + str(row[1])
        return_list.append(string)
    return return_list


In [16]:
train = row_to_string(train_data.dataset)
val = row_to_string(val_data.dataset)
test = row_to_string(test_data.dataset)


# creating a False GO_data file 

In [16]:
go_data = "/home/llan/Desktop/WUR/thesis2/GO/GO_data.txt"
train_data = CombinedData(train_data_path, g2v_data, go_data, exp_data)


In [17]:
import random

new_file = dict()
with open(go_data, "r") as f:
    for line in f:
        val = line.strip().split("\t")
        #new_file[val[0]] = set([str(random.randint(0,train_data.n_go)) for _ in range(random.randint(1, 100))])
        new_file[val[0]] = ["0", "1", "2"]


In [18]:
random.randint(0, train_data.n_go) 


2402

In [20]:
with open("GO/one_one_hot.txt", "w") as f:
    for key, val in new_file.items():
        line = key + "\t" + " ".join(new_file[key])
        print(line)
        f.write(line + '\n')


0	0 1 2
1	0 1 2
2	0 1 2
3	0 1 2
4	0 1 2
5	0 1 2
6	0 1 2
7	0 1 2
8	0 1 2
9	0 1 2
10	0 1 2
11	0 1 2
12	0 1 2
13	0 1 2
14	0 1 2
15	0 1 2
16	0 1 2
17	0 1 2
18	0 1 2
19	0 1 2
20	0 1 2
21	0 1 2
22	0 1 2
23	0 1 2
24	0 1 2
25	0 1 2
26	0 1 2
27	0 1 2
28	0 1 2
29	0 1 2
30	0 1 2
31	0 1 2
32	0 1 2
33	0 1 2
34	0 1 2
35	0 1 2
36	0 1 2
37	0 1 2
38	0 1 2
39	0 1 2
40	0 1 2
41	0 1 2
42	0 1 2
43	0 1 2
44	0 1 2
45	0 1 2
46	0 1 2
47	0 1 2
48	0 1 2
49	0 1 2
50	0 1 2
51	0 1 2
52	0 1 2
53	0 1 2
54	0 1 2
55	0 1 2
56	0 1 2
57	0 1 2
58	0 1 2
59	0 1 2
60	0 1 2
61	0 1 2
62	0 1 2
63	0 1 2
64	0 1 2
65	0 1 2
66	0 1 2
67	0 1 2
68	0 1 2
69	0 1 2
70	0 1 2
71	0 1 2
72	0 1 2
73	0 1 2
74	0 1 2
75	0 1 2
76	0 1 2
77	0 1 2
78	0 1 2
79	0 1 2
80	0 1 2
81	0 1 2
82	0 1 2
83	0 1 2
84	0 1 2
85	0 1 2
86	0 1 2
87	0 1 2
88	0 1 2
89	0 1 2
90	0 1 2
91	0 1 2
92	0 1 2
93	0 1 2
94	0 1 2
95	0 1 2
96	0 1 2
97	0 1 2
98	0 1 2
99	0 1 2
100	0 1 2
101	0 1 2
102	0 1 2
103	0 1 2
104	0 1 2
105	0 1 2
106	0 1 2
107	0 1 2
108	0 1 2
109	0 1 2
110	0 1 2


# Validation 

In [4]:
from sklearn.model_selection import train_test_split


In [485]:
g2v_emb = pd.read_table("gene2vec_data/dataset/g2v_embeddings.tsv", index_col=0)

In [3]:
os.chdir("/home/llan/Desktop/WUR/thesis2")
LABEL_DIR = "shared_data/binary_labels/"
label_data = LABEL_DIR + "Labels.tsv"
tf_list = LABEL_DIR + "TF_list.tsv"
tg_list = LABEL_DIR + "TG_list.tsv"

label_data = pd.read_table(label_data, header=0, index_col=0)
tf_list = pd.read_table(tf_list, header=0).idx.tolist()
tg_list = pd.read_table(tg_list, header=0).idx.tolist()

In [7]:
tf_conv = pd.read_table("LABELS/TF_list.tsv")
tg_conv = pd.read_table("LABELS/TG_list.tsv")


In [430]:
ath = pd.read_table("LABELS/"+"Regulations_in_ATRM.tsv")

In [8]:
# hier laat ik zien dat de TF index in ATRM dezelfde TAIR ID heeft als dezelfde index in de expressie matrix.

for i in range(ath.shape[0]):
    tair, tf_i = ath["TF ID"][i], ath["TF index"][i]
    tair_exp = exp_data_df.index[tf_i]
    if tair_exp != tair:
        print(tair, tair_exp)

    else: 
        if (i % 100 ==0):
            print(tair, tair_exp)

AT1G01060 AT1G01060
AT1G19850 AT1G19850
AT1G33240 AT1G33240
AT1G65620 AT1G65620
AT2G01570 AT2G01570
AT2G24570 AT2G24570
AT2G43010 AT2G43010
AT3G16770 AT3G16770
AT3G28910 AT3G28910
AT4G02560 AT4G02560
AT4G25490 AT4G25490
AT4G37750 AT4G37750
AT5G13790 AT5G13790
AT5G40350 AT5G40350
AT5G61850 AT5G61850


In [9]:
# De IDs van tf_conv staan niet colom "TF ID" van Ath 
tf_conv[~tf_conv.ID.isin(ath["TF ID"])]

Unnamed: 0,ID,idx
0,AT1G01010,0
1,AT1G01030,2
3,AT1G01250,25
4,AT1G01260,26
5,AT1G01350,36
...,...,...
1710,AT5G67180,36998
1711,AT5G67190,36999
1713,AT5G67411,37025
1715,AT5G67450,37029


In [10]:
# tf_list heeft ook indeces die niet in colom "TF index" staan van Ath
pd.Series(tf_list).isin(ath["TF index"]).value_counts()

False    1215
True      297
Name: count, dtype: int64

In [11]:
len(set(ath["TF index"]).difference(g2v_emb.index))

#isin(g2v_emb.index).value_counts()

27

In [12]:
# De TF indeces van label_data komen overeen met tf_list
label_data["TF ID"].isin(tf_list).value_counts()

TF ID
True    1229
Name: count, dtype: int64

# Recreating the Data

In [1]:
import sys
sys.path.append("Code")
from DataProcessor import * 

In [4]:
dp = DatasetProcessor()

In [5]:
label_data

Unnamed: 0,TF ID,Target ID
0,5,456
1,5,3527
2,5,4626
3,5,7756
4,5,12597
...,...,...
1417,36454,35760
1418,36459,7756
1420,36747,15381
1421,36966,8342


In [13]:
pos_dict = dp._label_data_to_dict(label_data)
neg_dict = dp._neg_dict_from_pos_dict(pos_dict, tg_list)

NameError: name 'label_data' is not defined

In [14]:
tf_label = list(set(label_data["TF ID"]))

NameError: name 'label_data' is not defined

In [15]:
tg_label = (list(set(label_data["Target ID"])))

NameError: name 'label_data' is not defined

In [16]:
set(tf_label).difference(tg_label)

NameError: name 'tf_label' is not defined

In [17]:
tg_label

NameError: name 'tg_label' is not defined

In [18]:
label_data["TF ID"].union(label_data["Target ID"])

NameError: name 'label_data' is not defined

In [19]:
from sklearn.model_selection import train_test_split


In [20]:

tf_train, tf_test = train_test_split(unique_ids, test_size=0.3, train_size=0.7)



NameError: name 'unique_ids' is not defined

In [27]:
def dict_to_df(dictionary):
    d = dictionary
    row_list = []
    for key in d.keys():
        for val in d[key]:
            row_list.append([key, val])
    df = pd.DataFrame(row_list, columns=["TF", "Target"])
    return df 

In [277]:
tfs_pos = dict_to_df(pos_dict)

In [278]:
tfs_neg = dict_to_df(neg_dict)

In [279]:
tfs_neg.keys()

Index(['TF', 'Target'], dtype='object')

In [300]:
set(tf_train).intersection(tf_test)

set()

In [295]:
train_pos = tfs_pos.loc[tfs_pos["TF"].isin(tf_train) & tfs_pos["Target"].isin(tf_train)]
test_pos = tfs_pos.loc[tfs_pos["TF"].isin(tf_test) & tfs_pos["Target"].isin(tf_test)]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_pos["Label"] = [1] * train_pos.shape[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_pos["Label"] = [1] * test_pos.shape[0]


In [None]:


train_pos["Label"] = [1] * train_pos.shape[0]
test_pos["Label"] = [1] * test_pos.shape[0]
train_neg = []
for i, (n_targets) in train_pos.groupby("TF").nunique().iterrows():
    n_targets = n_targets.loc["Target"]
    neg_sample = tfs_neg[(tfs_neg["TF"] == i) & (tfs_neg["Target"].isin(tf_train))].sample(n=n_targets)
    train_neg.append(neg_sample)

test_neg = []
for i, (n_targets) in test_pos.groupby("TF").nunique().iterrows():
    n_targets = n_targets.loc["Target"]
    neg_sample = tfs_neg[(tfs_neg["TF"] == i) & (tfs_neg["Target"].isin(tf_test))].sample(n=n_targets)
    test_neg.append(neg_sample)

train_neg = pd.concat(train_neg)
test_neg =  pd.concat(test_neg)

train_neg["Label"] = [0] * train_neg.shape[0]
test_neg["Label"] = [0] * test_neg.shape[0]

train_set = pd.concat((train_pos, train_neg))
test_set = pd.concat((test_pos, test_neg))

In [312]:
test_set.Label.value_counts()

Label
1    119
0    119
Name: count, dtype: int64

In [317]:
test_set.TF.isin(train_set.TF).value_counts()
test_set.Target.isin(train_set.Target).value_counts()

Target
False    238
Name: count, dtype: int64

In [318]:
print(train_set.shape, test_set.shape)
print(train_set.shape[0] + test_set.shape[0])

(1012, 3) (238, 3)
1250


In [319]:
train_set.to_csv("/home/llan/Desktop/WUR/thesis2/shared_data/binary_labels/BaseLine/TF_TG_split/Train_set.tsv", sep="\t")
test_set.to_csv("/home/llan/Desktop/WUR/thesis2/shared_data/binary_labels/BaseLine/TF_TG_split/Test_set.tsv", sep="\t")
test_set.to_csv("/home/llan/Desktop/WUR/thesis2/shared_data/binary_labels/BaseLine/TF_TG_split/Val_set.tsv", sep="\t")

In [326]:
train_set.to_csv("/home/llan/Desktop/WUR/thesis2/shared_data/binary_labels/TF_TG_split/Train_set.tsv", sep="\t")
test_set.to_csv("/home/llan/Desktop/WUR/thesis2/shared_data/binary_labels/TF_TG_split/Test_set.tsv", sep="\t")
test_set.to_csv("/home/llan/Desktop/WUR/thesis2/shared_data/binary_labels/TF_TG_split/Val_set.tsv", sep="\t")

In [320]:
test_set.loc[test_set.Target.isin(train_set.Target)]

Unnamed: 0,TF,Target,Label


In [321]:
train_set.loc[train_set.Target.isin(test_set.Target)]

Unnamed: 0,TF,Target,Label


In [322]:
print("overlap in TFs", test_set.TF.isin(train_set.TF).value_counts())
print("overlap in TGs", test_set.Target.isin(train_set.Target).value_counts())

overlap in TFs TF
False    238
Name: count, dtype: int64
overlap in TGs Target
False    238
Name: count, dtype: int64


In [323]:
print("overlap in TFs", test_set.TF.isin(val_set.TF).value_counts())
print("overlap in TGs", test_set.Target.isin(val_set.Target).value_counts())

overlap in TFs TF
False    162
True      76
Name: count, dtype: int64
overlap in TGs Target
False    168
True      70
Name: count, dtype: int64


In [85]:
print("overlap in TFs", train_set.TF.isin(val_set.TF).value_counts())
print("overlap in TGs", train_set.Target.isin(val_set.Target).value_counts())

overlap in TFs TF
False    1056
Name: count, dtype: int64
overlap in TGs Target
False    1056
Name: count, dtype: int64


In [250]:
unique_ids = set(label_data['TF ID']).union(label_data['Target ID']) 
unique_ids = list(unique_ids)
np.random.shuffle(unique_ids)

In [252]:
len(unique_ids)

700

In [255]:
len(tf_label) + len(tg_label)

836

In [267]:
train_ids, test_ids = train_test_split(unique_ids, test_size=0.4, random_state=42)

In [268]:
def assign_set(row):
    if row['TF ID'] in train_ids or row['Target ID'] in train_ids:
        return 'train'
    elif row['TF ID'] in test_ids or row['Target ID'] in test_ids:
        return 'test'
    else:
        return None
label_data['set'] = label_data.apply(assign_set, axis=1)

In [269]:
label_data.groupby("set").count()

Unnamed: 0_level_0,TF ID,Target ID
set,Unnamed: 1_level_1,Unnamed: 2_level_1
test,178,178
train,1051,1051


In [271]:
test_set = label_data.loc[label_data["set"] == "test"]
train_set = label_data.loc[label_data["set"] == "train"]

In [272]:
test_set

Unnamed: 0,TF ID,Target ID,set
40,1987,8914,test
41,1987,12154,test
50,2371,31337,test
51,2371,35319,test
52,2406,35423,test
...,...,...,...
1407,36415,25352,test
1409,36415,28543,test
1418,36459,7756,test
1421,36966,8342,test


In [35]:
test_set.Target.isin(ath["Target index"]).value_counts()

Target
True     120
False    114
Name: count, dtype: int64

In [36]:
test_set.Label.value_counts()

Label
1    117
0    117
Name: count, dtype: int64

In [114]:
tfs_neg.loc[tfs_neg["TF"] == 5].Target.isin(tfs_pos.loc[tfs_pos["TF"] == 5].Target).value_counts()

Target
False    24165
Name: count, dtype: int64

In [26]:
train_pos = tfs_pos[tfs_pos["TF"].isin(train)]
test_pos = tfs_pos[tfs_pos["TF"].isin(test)]

train_pos["Label"] = [1] * train_pos.shape[0]
test_pos["Label"] = [1] * test_pos.shape[0]
train_neg = []
for i, (n_targets) in train_pos.groupby("TF").nunique().iterrows():
    n_targets = n_targets.loc["Target"]
    neg_sample = tfs_neg[tfs_neg["TF"] == i].sample(n=n_targets)
    train_neg.append(neg_sample)

test_neg = []
for i, (n_targets) in test_pos.groupby("TF").nunique().iterrows():
    n_targets = n_targets.loc["Target"]
    neg_sample = tfs_neg[tfs_neg["TF"] == i].sample(n=n_targets)
    test_neg.append(neg_sample)

train_neg = pd.concat(train_neg)
test_neg =  pd.concat(test_neg)

train_neg["Label"] = [0] * train_neg.shape[0]
test_neg["Label"] = [0] * test_neg.shape[0]
train_set = pd.concat((train_pos, train_neg))
test_set = pd.concat((test_pos, test_neg))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_pos["Label"] = [1] * train_pos.shape[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_pos["Label"] = [1] * test_pos.shape[0]


In [35]:
tr_combs = train_set.TF.astype(str) + train_set.Target.astype(str)
te_combs = test_set.TF.astype(str) + test_set.Target.astype(str)

In [None]:
train_set.to_csv("/home/llan/Desktop/WUR/thesis2/shared_data/binary_labels/TF_split_v2/Train.tsv", sep="\t")
test_set.to_csv("/home/llan/Desktop/WUR/thesis2/shared_data/binary_labels/TF_split_v2/Test.tsv", sep="\t")

In [16]:
class CreateDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.df = df.to_numpy() 
    def __len__(self):
        return self.df.shape[0]
    def __getitem__(self, index):
        gp = (self.df[index, 0], self.df[index, 1])
        label = self.df[index, 2]
        return gp, label
    
class CreateDataset_1(torch.utils.data.Dataset):
    def __init__(self, df, go_tensor):
        self.df = df.to_numpy() 
        self.go = go_tensor
    def __len__(self):
        return self.df.shape[0]
    def __getitem__(self, index):
        gp = (self.df[index, 0], self.df[index, 1])
        go = torch.stack((self.go[gp[0]].to("cuda"), self.go[gp[1]].to("cuda")), dim=0)
        label = self.df[index, 2]
        return gp, label, 0, go, 0

In [121]:
g2v_emb

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
24706,1.568677,1.631622,0.119945,0.771625,-0.899330,-1.031793,1.208614,3.020828,0.048907,-1.233505,...,-3.400109,0.009944,0.681104,-0.000046,-0.445416,1.259316,-1.523835,-1.137699,-0.277666,2.257707
4883,-0.723175,-0.244352,-0.055876,0.404235,0.406924,-2.307787,-0.175827,0.431764,1.598332,-2.636484,...,-1.096317,-0.067455,-0.312058,-0.801306,0.343842,-0.871584,2.822859,-2.177554,-0.214012,0.116603
24588,-0.184121,1.212518,1.918751,-0.447811,0.052049,-2.213879,1.522863,1.264264,0.664590,0.252343,...,-1.134392,0.111970,2.245530,-0.399459,-0.545620,1.096907,0.093931,-0.895952,-0.778729,-0.423828
5374,-0.166346,0.001834,-0.445775,0.248312,0.196083,-1.259047,0.140674,2.223567,0.110178,-0.675755,...,-0.770582,-0.237344,3.655307,-0.065836,-2.031344,-0.726406,1.987337,-4.830619,-0.917032,1.862046
20518,1.189372,-1.279634,-1.041846,-2.680534,0.025763,-0.000362,2.968912,0.201538,-1.040744,-0.190351,...,0.177983,-2.017023,0.954407,-0.138475,2.519109,-0.355223,0.465685,-1.228564,-1.223137,1.071640
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1552,0.014730,0.002724,0.132077,-0.020328,-0.019669,-0.059745,-0.050580,0.124199,0.050036,-0.022099,...,-0.100078,-0.026032,0.099223,0.039844,-0.000485,0.047409,0.012425,-0.075102,0.004089,0.123414
8628,-0.072972,0.038816,0.093389,-0.003401,-0.064729,-0.029800,-0.052376,0.096334,-0.010801,0.010337,...,0.017896,-0.087150,0.087778,-0.030314,0.015386,0.019778,0.042848,-0.053239,0.039736,0.071528
202,-0.037385,0.034734,0.054736,0.010230,-0.036009,-0.039968,-0.047445,0.098287,0.046143,-0.099476,...,-0.040611,-0.099705,0.099404,-0.010209,0.008153,0.066019,0.029942,-0.019235,0.052010,0.059085
3601,0.001528,0.083215,0.170878,0.006951,-0.088612,-0.156325,-0.112643,0.153788,0.068668,-0.053025,...,-0.139556,-0.056606,0.120066,-0.027897,0.041622,0.087281,0.036381,-0.086841,-0.025253,0.097873


In [124]:

go_data = pd.read_table("GO/GO_data.txt", names=["idx", "go"], dtype={
0: int}, index_col=0)
go_data = go_data.go.apply(lambda x: list(map(int, str(x).split()))).to_dict()
n_row = exp_data_df.shape[0]
n_terms = 7247 # number of unique GO terms 

go_tensor = torch.zeros((n_row, n_terms), dtype=torch.float)
for key, val in go_data.items():
    v_tens = torch.tensor(val, dtype=torch.long)
    oneh = nn.functional.one_hot(v_tens, num_classes=7247)
    comp = oneh.sum(dim=0)
    go_tensor[key] = comp


In [168]:
g2v_holder = torch.zeros((exp_data.shape[0], 100))

In [169]:
for i, val in g2v_emb.iterrows():
    g2v_holder[i, :] = torch.tensor(val.values)

In [176]:
torch.save(g2v_holder, "gene2vec_data/dataset/g2v_tensor.pt")


: 

In [126]:
torch.save(go_tensor, "GO/GO_tensor.pt")

In [51]:
train_dataset = CreateDataset_1(train_set, go_tensor)
train_dataset = DataLoader(train_dataset, batch_size=BATCHSIZE, shuffle=True)
test_dataset = CreateDataset_1(test_set, go_tensor)
test_dataset = DataLoader(test_dataset, batch_size=BATCHSIZE, shuffle=True)

In [62]:
# geen overlap 
train_set.TF.isin(test_set.TF).value_counts()

TF
False    1588
Name: count, dtype: int64

## testing models

In [52]:
from Model import GO

In [63]:
model = GO(100, 7247, 2, 200).to("cuda")
from Train import train
from torch.optim import Adam

optim = Adam(params=model.parameters(), lr=0.001)
losses = []
for epoch in range(10):
    model, loss = train(model, dataloader=train_dataset, loss_func=nn.BCELoss(), optimizer=optim, epoch=epoch)
    

| epoch   0 |    1 /  50 batches |Train loss    0.683 | AUROC    0.870 | AUPRC    0.918
| epoch   0 |    2 /  50 batches |Train loss    0.700 | AUROC    0.722 | AUPRC    0.679
| epoch   0 |    3 /  50 batches |Train loss    0.693 | AUROC    0.816 | AUPRC    0.862
| epoch   0 |    4 /  50 batches |Train loss    0.711 | AUROC    0.991 | AUPRC    0.984
| epoch   0 |    5 /  50 batches |Train loss    0.696 | AUROC    0.931 | AUPRC    0.925
| epoch   0 |    6 /  50 batches |Train loss    0.681 | AUROC    0.836 | AUPRC    0.884
| epoch   0 |    7 /  50 batches |Train loss    0.686 | AUROC    0.991 | AUPRC    0.986
| epoch   0 |    8 /  50 batches |Train loss    0.671 | AUROC    0.896 | AUPRC    0.881
| epoch   0 |    9 /  50 batches |Train loss    0.637 | AUROC    0.949 | AUPRC    0.957
| epoch   0 |   10 /  50 batches |Train loss    0.596 | AUROC    0.988 | AUPRC    0.989
| epoch   0 |   11 /  50 batches |Train loss    0.544 | AUROC    0.945 | AUPRC    0.961
| epoch   0 |   12 /  50 batches

In [64]:
test_labels = []
predicted_label = []
for (gp, label, g2v, go, exp) in test_dataset:
    model.eval()
    with torch.no_grad():
        # one_hot_pair = torch.stack([go_tensor[tf], go_tensor[tg]], dim=1)
        prediction = model(None, None, go.to("cuda"), None)

        test_labels.append(label.numpy())
        predicted_label.extend(prediction.cpu().detach().numpy())

orig_predicted_label = np.concatenate(predicted_label) 
orig_test_t = (orig_predicted_label >= 0.5).astype(int)
orig_test_label = np.concatenate(test_labels)

score = pd.Series(orig_test_t == orig_test_label).value_counts()
print(score.iloc[0] / score.sum())

0.8910411622276029


In [37]:
class GO_test(nn.Module):
    def __init__(self):
        super().__init__()
        self.go_enc = nn.Sequential(
            nn.Linear(7247, 1280),
            nn.ReLU(),
            nn.Linear(1280, 1024),
            nn.ReLU(),
            nn.Linear(1024, 100)
        )
    
        self.out = nn.Sequential(
            nn.Linear(200, 128),
            nn.ReLU(),
            nn.Linear(128,64), 
            nn.ReLU(),
            nn.Linear(64, 1)
        )
        self.activation = nn.Sigmoid()

    def forward(self, a, b, x, e):
        z = self.go_enc(x)
        conc = torch.flatten(z, 1)
        prob = self.activation(self.out(conc))
        return prob

In [110]:
model = GO_test().to("cuda")

In [111]:
from  torch.optim import Adam
list_of_loss = []
list_of_prediction = []
list_of_labels = []
criterion = nn.BCELoss()
optimizer = Adam(model.parameters(), lr=0.001)

In [112]:
train_dataset =DataLoader(CreateDataset(train_set), batch_size= BATCHSIZE, shuffle=True)

for epoch in range(9):
    intermediate_loss = []
    for i, ((tf, tg), label) in enumerate(train_dataset):
        optimizer.zero_grad()
        label = label.float().cuda()
        one_hot_pair = torch.stack([go_tensor[tf], go_tensor[tg]], dim = 1)
        
        model.train()
        prediction = model(one_hot_pair.to("cuda"))
        loss = criterion(prediction.view(-1, 1), label.view(-1, 1))
        loss.backward()
        optimizer.step()

        intermediate_loss.append(loss.cpu().detach().numpy())
        list_of_prediction.append(prediction.view(-1, 1).cpu().detach().numpy())
        list_of_labels.append(label.view(-1, 1).cpu().detach().numpy())
    list_of_loss.append(sum(intermediate_loss))
        

      

In [113]:
list_of_loss

[20.013739079236984,
 6.413235714659095,
 3.428970648907125,
 2.1227217240957543,
 1.346638912880735,
 1.6506343854998704,
 1.6129159709089436,
 1.2996200336201582,
 1.0458390201674774]

In [116]:
test_dataset = DataLoader(CreateDataset(test_set), batch_size=BATCHSIZE, shuffle=False)
test_labels = []
predicted_label = []
for (tf, tg), label in test_dataset:
    model.eval()
    with torch.no_grad():
        one_hot_pair = torch.stack([go_tensor[tf], go_tensor[tg]], dim=1)
        prediction = model(one_hot_pair.to("cuda"))

        test_labels.append(label.numpy())
        predicted_label.extend(prediction.cpu().detach().numpy())

alt_test_lab = np.concatenate(test_labels)
alt_pred_lab = np.concatenate(predicted_label)
score_alt = pd.Series(((alt_pred_lab >= 0.5).astype(int) == alt_test_lab)).value_counts()


In [117]:
score_alt.iloc[0] / score_alt.sum()

0.8946731234866828

In [None]:
train_dataset =DataLoader(CreateDataset(train_set), batch_size= BATCHSIZE, shuffle=True)

for epoch in range(9):
    intermediate_loss = []
    for i, ((tf, tg), label) in enumerate(train_dataset):
        optimizer.zero_grad()
        label = label.float().cuda()
        one_hot_pair = torch.stack([go_tensor[tf], go_tensor[tg]], dim = 1)
        
        model.train()
        prediction = model(one_hot_pair.to("cuda"))
        loss = criterion(prediction.view(-1, 1), label.view(-1, 1))
        loss.backward()
        optimizer.step()

        intermediate_loss.append(loss.cpu().detach().numpy())
        list_of_prediction.append(prediction.view(-1, 1).cpu().detach().numpy())
        list_of_labels.append(label.view(-1, 1).cpu().detach().numpy())
    list_of_loss.append(sum(intermediate_loss))
        


## permuting the go_tensor matrix

In [75]:
from Train import train

In [76]:
idx = torch.randperm(go_tensor.nelement())

shuffled_go_tensor = go_tensor.view(-1)[idx].view(go_tensor.size())


In [105]:
train_dataset = CreateDataset_1(train_set, shuffled_go_tensor)
train_dataset = DataLoader(train_dataset, batch_size=BATCHSIZE, shuffle=True)
test_dataset = CreateDataset_1(test_set, go_tensor)
test_dataset = DataLoader(test_dataset, batch_size=BATCHSIZE, shuffle=False)

In [106]:
model = GO(100, 7247, 2, 200).to("cuda")
model = GO_test().to("cuda")
list_of_losses = []

optim = Adam(params=model.parameters(), lr=0.001)
losses = []
for epoch in range(10):
    model.train()
    log_interval = 1
    total_loss = 0
    train_losses = []
    for idx, (gp, label, g2v, go, exp) in enumerate(train_dataset):
        optimizer.zero_grad()
        label = label.to("cuda").float()
        predicted_label = model(go)  # returns a
        # vector of batchsize x 2(0, 1)
        loss = criterion(predicted_label.view(-1, predicted_label.shape[1]),
                         label.view(-1, predicted_label.shape[1]))
        loss.backward()
        optimizer.step()
        loss = loss.item()
        total_loss += loss
        train_losses.append(loss)

        if idx % log_interval == 0 and idx > 0:
            print('| epoch {:3d} | {:4d} /{:4d} batches |Train loss {:8.3f}'.format(
                epoch, idx, len(train_dataset), loss))
    print('| epoch {:3d} | total_loss {:8.3f}'.format(epoch, total_loss))

    

| epoch   0 |    1 /  50 batches |Train loss    0.694
| epoch   0 |    2 /  50 batches |Train loss    0.692
| epoch   0 |    3 /  50 batches |Train loss    0.694
| epoch   0 |    4 /  50 batches |Train loss    0.692
| epoch   0 |    5 /  50 batches |Train loss    0.693
| epoch   0 |    6 /  50 batches |Train loss    0.693
| epoch   0 |    7 /  50 batches |Train loss    0.692
| epoch   0 |    8 /  50 batches |Train loss    0.693
| epoch   0 |    9 /  50 batches |Train loss    0.694
| epoch   0 |   10 /  50 batches |Train loss    0.693
| epoch   0 |   11 /  50 batches |Train loss    0.693
| epoch   0 |   12 /  50 batches |Train loss    0.693
| epoch   0 |   13 /  50 batches |Train loss    0.693
| epoch   0 |   14 /  50 batches |Train loss    0.693
| epoch   0 |   15 /  50 batches |Train loss    0.692
| epoch   0 |   16 /  50 batches |Train loss    0.693
| epoch   0 |   17 /  50 batches |Train loss    0.692
| epoch   0 |   18 /  50 batches |Train loss    0.693
| epoch   0 |   19 /  50 bat

In [107]:
test_labels = []
predicted_label = []
for (gp, label, g2v, go, exp) in test_dataset:
    model.eval()
    with torch.no_grad():
        # one_hot_pair = torch.stack([go_tensor[tf], go_tensor[tg]], dim=1)
        prediction = model(go.to("cuda"))

        test_labels.append(label.numpy())
        predicted_label.extend(prediction.cpu().detach().numpy())

orig_predicted_label = np.concatenate(predicted_label) 
orig_test_t = (orig_predicted_label >= 0.5).astype(int)
orig_test_label = np.concatenate(test_labels)

score = pd.Series(orig_test_t == orig_test_label).value_counts()
print(score.iloc[0] / score.sum())

0.5


In [84]:
model = GO_test().to("cuda")
model = GO(100, 7247, 2, 200).to("cuda")

In [85]:
from  torch.optim import Adam
list_of_loss = []
list_of_prediction = []
list_of_labels = []
criterion = nn.BCELoss()
optimizer = Adam(model.parameters(), lr=0.001)

In [86]:
train_dataset =DataLoader(CreateDataset_1(train_set, shuffled_go_tensor), batch_size= BATCHSIZE, shuffle=True)
for epoch in range(10):
    intermediate_loss = []
    for i, ((tf, tg), label, _, go, _) in enumerate(train_dataset):
        optimizer.zero_grad()
        label = label.float().cuda()
        #one_hot_pair = torch.stack([go_tensor[tf], go_tensor[tg]], dim = 1)
        
        model.train()
        prediction = model(None, None, go, None)
        loss = criterion(prediction.view(-1, 1), label.view(-1, 1))
        loss.backward()
        optimizer.step()

        intermediate_loss.append(loss.cpu().detach().numpy())
        list_of_prediction.append(prediction.view(-1, 1).cpu().detach().numpy())
        list_of_labels.append(label.view(-1, 1).cpu().detach().numpy())

        print(f"Epoch {epoch} | Batch: {i} | {loss}")
    list_of_loss.append(sum(intermediate_loss))
        

      

Epoch 0 | Batch: 0 | 0.6912267208099365
Epoch 0 | Batch: 1 | 0.6833721399307251
Epoch 0 | Batch: 2 | 0.6955382823944092
Epoch 0 | Batch: 3 | 0.6824278831481934
Epoch 0 | Batch: 4 | 0.714228630065918
Epoch 0 | Batch: 5 | 0.6955875754356384
Epoch 0 | Batch: 6 | 0.7038499712944031
Epoch 0 | Batch: 7 | 0.691303014755249
Epoch 0 | Batch: 8 | 0.6985581517219543
Epoch 0 | Batch: 9 | 0.6946661472320557
Epoch 0 | Batch: 10 | 0.7008693218231201
Epoch 0 | Batch: 11 | 0.7000980377197266
Epoch 0 | Batch: 12 | 0.7015104293823242
Epoch 0 | Batch: 13 | 0.6977074146270752
Epoch 0 | Batch: 14 | 0.6917563080787659
Epoch 0 | Batch: 15 | 0.6929088830947876
Epoch 0 | Batch: 16 | 0.6948170065879822
Epoch 0 | Batch: 17 | 0.6933051347732544
Epoch 0 | Batch: 18 | 0.6931113004684448
Epoch 0 | Batch: 19 | 0.6936311721801758
Epoch 0 | Batch: 20 | 0.6912349462509155
Epoch 0 | Batch: 21 | 0.6917296648025513
Epoch 0 | Batch: 22 | 0.687575101852417
Epoch 0 | Batch: 23 | 0.6857473850250244
Epoch 0 | Batch: 24 | 0.69069

In [87]:
test_dataset = DataLoader(CreateDataset_1(test_set, go_tensor), batch_size=BATCHSIZE, shuffle=False)

test_labels = []
predicted_label = []
for (tf, tg), label, _, go, _ in test_dataset:
    model.eval()
    with torch.no_grad():
        one_hot_pair = torch.stack([shuffled_go_tensor[tf], shuffled_go_tensor[tg]], dim=1)
        # print("one_hot_pair shape", one_hot_pair.shape)
        # print("regular go shape", go.shape)

        prediction = model(None, None, go, None)

        test_labels.append(label.numpy())
        predicted_label.extend(prediction.cpu().detach().numpy())

alt_test_lab = np.concatenate(test_labels)
alt_pred_lab = np.concatenate(predicted_label)
score_alt = pd.Series(((alt_pred_lab >= 0.5).astype(int) == alt_test_lab)).value_counts()


In [88]:
score_alt.iloc[0] / score_alt.sum()

0.576271186440678

In [89]:
model = GO(100, 7247, 2, 200).to("cuda")
model = GO_test().to("cuda")

In [90]:
train_dataset = DataLoader(CreateDataset(train_set), batch_size= BATCHSIZE, shuffle=True)

for epoch in range(10):
    intermediate_loss = []
    for i, ((tf, tg), label) in enumerate(train_dataset):
        optimizer.zero_grad()
        label = label.float().cuda()
        one_hot_pair = torch.stack([shuffled_go_tensor[tf], shuffled_go_tensor[tg]], dim = 1)
        
        model.train()
        prediction = model(one_hot_pair.to("cuda"))
        loss = criterion(prediction.view(-1, 1), label.view(-1, 1))
        loss.backward()
        optimizer.step()

        intermediate_loss.append(loss.cpu().detach().numpy())
        list_of_prediction.append(prediction.view(-1, 1).cpu().detach().numpy())
        list_of_labels.append(label.view(-1, 1).cpu().detach().numpy())

        print(f"Epoch {epoch} | Batch: {i} | {loss}")
    list_of_loss.append(sum(intermediate_loss))
        

Epoch 0 | Batch: 0 | 0.6931695938110352
Epoch 0 | Batch: 1 | 0.6921018958091736
Epoch 0 | Batch: 2 | 0.694257378578186
Epoch 0 | Batch: 3 | 0.6942309141159058
Epoch 0 | Batch: 4 | 0.6931350827217102
Epoch 0 | Batch: 5 | 0.6941810846328735
Epoch 0 | Batch: 6 | 0.691727876663208
Epoch 0 | Batch: 7 | 0.6924663782119751
Epoch 0 | Batch: 8 | 0.6942036151885986
Epoch 0 | Batch: 9 | 0.6924481391906738
Epoch 0 | Batch: 10 | 0.6928067207336426
Epoch 0 | Batch: 11 | 0.6928068399429321
Epoch 0 | Batch: 12 | 0.6956278681755066
Epoch 0 | Batch: 13 | 0.6927778124809265
Epoch 0 | Batch: 14 | 0.6924594640731812
Epoch 0 | Batch: 15 | 0.6924299001693726
Epoch 0 | Batch: 16 | 0.6952900886535645
Epoch 0 | Batch: 17 | 0.691745936870575
Epoch 0 | Batch: 18 | 0.6921002268791199
Epoch 0 | Batch: 19 | 0.6924597024917603
Epoch 0 | Batch: 20 | 0.6949219703674316
Epoch 0 | Batch: 21 | 0.6932026147842407
Epoch 0 | Batch: 22 | 0.6924241185188293
Epoch 0 | Batch: 23 | 0.692819356918335
Epoch 0 | Batch: 24 | 0.692450

In [94]:
test_dataset = DataLoader(CreateDataset(test_set), batch_size=BATCHSIZE, shuffle=False)

test_labels = []
predicted_label = []
for (tf, tg), label in test_dataset:
    model.eval()
    with torch.no_grad():
        one_hot_pair = torch.stack([shuffled_go_tensor[tf], shuffled_go_tensor[tg]], dim=1)
        # print("one_hot_pair shape", one_hot_pair.shape)
        # print("regular go shape", go.shape)

        prediction = model(one_hot_pair.to("cuda"))

        test_labels.append(label.numpy())
        predicted_label.extend(prediction.cpu().detach().numpy())

alt_test_lab = np.concatenate(test_labels)
alt_pred_lab = np.concatenate(predicted_label)
score_alt = pd.Series(((alt_pred_lab >= 0.5).astype(int) == alt_test_lab)).value_counts()


In [95]:
score_alt.iloc[0] / score_alt.sum()

0.5

In [96]:
predicted_label

[array([0.50278926], dtype=float32),
 array([0.50285375], dtype=float32),
 array([0.50282806], dtype=float32),
 array([0.5028282], dtype=float32),
 array([0.50283664], dtype=float32),
 array([0.5028256], dtype=float32),
 array([0.50283474], dtype=float32),
 array([0.5027451], dtype=float32),
 array([0.5028274], dtype=float32),
 array([0.50277525], dtype=float32),
 array([0.5028201], dtype=float32),
 array([0.50283426], dtype=float32),
 array([0.50282407], dtype=float32),
 array([0.50281084], dtype=float32),
 array([0.5028189], dtype=float32),
 array([0.50280917], dtype=float32),
 array([0.5027982], dtype=float32),
 array([0.5027752], dtype=float32),
 array([0.50283015], dtype=float32),
 array([0.50280875], dtype=float32),
 array([0.50280046], dtype=float32),
 array([0.5029257], dtype=float32),
 array([0.5028427], dtype=float32),
 array([0.50278956], dtype=float32),
 array([0.50281686], dtype=float32),
 array([0.5028345], dtype=float32),
 array([0.5028201], dtype=float32),
 array([0.502

In [97]:
gp, _ = train_dataset[0]
ts = [shuffled_go_tensor[gp[0]], shuffled_go_tensor[gp[1]]]

TypeError: 'DataLoader' object is not subscriptable

In [313]:
for i in range(len(train_dataset)):
    gp, lab1 = train_dataset[i]
    ts = [shuffled_go_tensor[gp[0]], shuffled_go_tensor[gp[1]]]
    
    v1 = all(train_dataset_1[i][3][0].cpu() == ts[0])
    v2 = all(train_dataset_1[i][3][1].cpu() == ts[1])
    
    print(f"v1 are equal?: {v1}")
    print(f"v2 are equal?: {v2}")    
    

KeyError: 0

In [199]:
all(train_dataset_1[0][3][1].cpu() == ts[1])

True

In [178]:
train_dataset[0]

((57, 23451), 1)

In [46]:
model

GO_test(
  (go_enc): Sequential(
    (0): Linear(in_features=7247, out_features=1280, bias=True)
    (1): ReLU()
    (2): Linear(in_features=1280, out_features=1024, bias=True)
    (3): ReLU()
    (4): Linear(in_features=1024, out_features=100, bias=True)
  )
  (out): Sequential(
    (0): Linear(in_features=200, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=1, bias=True)
  )
  (activation): Sigmoid()
)


# Debug loop 

In [1]:
# modellen scoren allemaal hoger op random data dan zou moeten 

In [2]:
torch.cuda.empty_cache()

NameError: name 'torch' is not defined

In [38]:

import pandas as pd
import numpy as np
import torch
import copy
import torch.nn as nn
import sys
import os
from Model import GO, EXP
from Data import CreateDataset
from sklearn.preprocessing import StandardScaler
from collections import defaultdict
from sklearn.metrics import roc_auc_score


BATCHSIZE = 32
EPOCHS = 10
LABEL_FOLDER = "/home/llan/Desktop/WUR/thesis2/shared_data/binary_labels/TF_split"
LR = 0.001

os.chdir("/home/llan/Desktop/WUR/thesis2/")

exp_data = pd.read_csv("EXP/expression.tsv", sep="\t", header=0, index_col=0)
exp_data = torch.randint(0, 1000, (exp_data.shape[0], exp_data.shape[1]))
exp_data = StandardScaler().fit_transform(exp_data.T).T
exp_data = torch.tensor(exp_data).to("cuda")


g2v_data = "/home/llan/Desktop/WUR/thesis2/gene2vec_data/dataset/g2v_embeddings.tsv"
g2v_data = torch.load("gene2vec_data/dataset/g2v_tensor.pt").to("cuda")
# go_data = "/home/llan/Desktop/WUR/thesis2/GO/GO_data.txt"
# go_data = "/home/llan/Desktop/WUR/thesis2/GO/GO_RANDOM.txt"
go_data = torch.load("GO/GO_tensor.pt").to("cuda")
go_correct = go_data

idx = torch.randperm(go_data.nelement())
go_data = go_data.view(-1)[idx].view(go_data.size())
go_data = torch.randint(0, 2, (go_data.shape[0], go_data.shape[1])).float()
go_data.sum(dim=1)[:20]


  g2v_data = torch.load("gene2vec_data/dataset/g2v_tensor.pt").to("cuda")
  go_data = torch.load("GO/GO_tensor.pt").to("cuda")


tensor([3557., 3539., 3714., 3581., 3647., 3620., 3669., 3717., 3636., 3514.,
        3635., 3569., 3609., 3594., 3557., 3587., 3624., 3547., 3676., 3636.])

In [39]:

# train_data_path = LABEL_FOLDER + "/Train_set.tsv"
train_data_path = "/home/llan/Desktop/WUR/thesis2/shared_data/binary_labels/TF_split_v2/Train.tsv"
val_data_path = LABEL_FOLDER + "/Val_set.tsv"
# test_data_path = LABEL_FOLDER + "/Test_set.tsv"
test_data_path = "/home/llan/Desktop/WUR/thesis2/shared_data/binary_labels/TF_split_v2/Test.tsv"




In [5]:
train_data_og = pd.read_table(train_data_path, index_col=0)

In [6]:
train_data_og.loc[train_data_og["TF"] == 5]

Unnamed: 0,TF,Target,Label
0,5,456,1
1,5,3527,1
2,5,4626,1
3,5,7756,1
4,5,12597,1
5,5,14733,1
6,5,15508,1
7,5,21187,1
8,5,29211,1
9,5,36347,1


In [7]:
train_data_sample = pd.DataFrame(train_data.df, columns=["TF", "Target", "Label"])

NameError: name 'train_data' is not defined

In [46]:
pd.Series(train_data.df[:, 1]).isin(test_set.Target).value_counts(normalize=True)

False    0.903716
True     0.096284
Name: proportion, dtype: float64

In [59]:
test_frac = test_set.shape[0] / test_data.df.shape[0]
train_frac = train_set.shape[0] / train_data.df.shape[0]


In [60]:
test_frac

0.35454545454545455

In [68]:


def shuffle_label_col(df):
    df['Label'] = df['Label'].sample(frac=1).values
    return df

 
# train_data = CreateDataset(pd.read_table(train_data_path,
#                                                            index_col=0).sample(frac=train_frac))
# val_data = CreateDataset(pd.read_table(val_data_path, index_col=0))
# test_data = CreateDataset(pd.read_table(test_data_path,
#                                                           index_col=0).sample(frac=test_frac))

train_data = CreateDataset(train_set)
val_data = CreateDataset(pd.read_table(val_data_path, index_col=0))
test_data = CreateDataset(test_set)

train_loader = torch.utils.data.DataLoader(
    train_data, batch_size=BATCHSIZE, shuffle=True)
val_loader = torch.utils.data.DataLoader(
    val_data, batch_size=BATCHSIZE, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, shuffle=False)

model = GO(100, 7247, 2, 200).to("cuda")
# model = GO_test().to("cuda")
# model = EXP(100, 7247, 2, 200).to("cuda")


crit = nn.BCELoss()


def check_overlap(data1, data2):
    string1 = [str(data1.df[i, 0]) + str(data1[i, 1])
               for i in range(data1.df.shape[0])]
    string2 = [str(data2.df[i, 0]) + str(data2[i, 1])
               for i in range(data2.df.shape[0])]
    return (set(string1).intersection(string2))


print("OVERLAP TRAIN-TEST", check_overlap(train_data, test_data))


def stack_vector(v1, v2):
    stacked = torch.stack((v1, v2), dim=1)
    return stacked


def init_weights(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)



OVERLAP TRAIN-TEST set()


In [408]:
train_data.df.shape
test_data.df.shape

AttributeError: 'CombinedData' object has no attribute 'df'

In [10]:
go_data = go_data.cuda()

In [135]:
import gc
gc.collect()

100344

In [158]:

# train_gene_pairs = defaultdict(lambda: defaultdict(list))
# test_gene_pairs = defaultdict(lambda: defaultdict(list))
# train_go_input = defaultdict(lambda: defaultdict(list))
# test_go_input = defaultdict(lambda: defaultdict(list))

In [70]:

# train_gene_pairs = defaultdict(lambda: defaultdict(list))
# test_gene_pairs = defaultdict(lambda: defaultdict(list))
# train_go_input = defaultdict(lambda: defaultdict(list))
# test_go_input = defaultdict(lambda: defaultdict(list))
for i in range(10):
    print("#" * 25, i, "#"*25)
    model.apply(init_weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    model.train()
    # initial_state = copy.deepcopy(list(model.state_dict().items())[0])
    # if i < 1:
    #    print(initial_state)

    for epoch in range(EPOCHS):
        list_of_losses = []
        
        
        for _, ((tfs, tgs), label) in enumerate(train_loader):
            # train_gene_pairs[i][epoch].extend(list(zip(tfs.tolist(), tgs.tolist())))
            label = label.float().to("cuda")

            g2v = stack_vector(g2v_data[tfs], g2v_data[tgs])
            go = stack_vector(go_data[tfs], go_data[tgs])
            exp = stack_vector(exp_data[tfs], exp_data[tgs])
            # train_go_input[i][epoch].extend(go.tolist())

            prediction = model(None, None, go, None)
            
            optimizer.zero_grad()
            loss = crit(prediction.view(-1, 1), label.view(-1, 1))
            loss.backward()
            optimizer.step()
            # status = f"batch: {i} | loss: {loss.item()}"
            # print(status)
            list_of_losses.append(loss.item())
#        if np.array(list_of_losses).mean() > 40:
#            print(initial_state)
#            sys.exit()
        print(f"EPOCH: {epoch} | mean loss: {np.array(list_of_losses).mean()}")

    test_prediction = []
    model.eval()
    with torch.no_grad():
        for (tfs, tgs), label in test_loader:
            # test_gene_pairs[i][epoch].extend(list(zip(tfs.tolist(), tgs.tolist())))
            g2v = stack_vector(g2v_data[tfs], g2v_data[tgs])
            go = stack_vector(go_data[tfs], go_data[tgs])
            exp = stack_vector(exp_data[tfs], exp_data[tgs])

            # test_go_input[i][epoch].extend(go.tolist())

            prediction = model(None, None, go, None)
            test_prediction.append(prediction.cpu().detach().item())
           

    # print("TEST PREDICTIONS:")
    # print(test_prediction)
    pred_cut = (np.array(test_prediction) >= 0.5).astype(int)

    # print("TEST LABELS")
    # print(np.unique(test_data.df[:, 2], return_counts=True))
    stats = pd.Series(pred_cut == test_data.df[:, 2]).value_counts()
    print(stats)
    score = stats.iloc[0]/stats.sum()
    print(test_prediction[:10])
    print("ACCURACY:", score)
    roc_auc = roc_auc_score(test_data.df[:, 2], test_prediction)
    print("ROC-AUC:", roc_auc)
#print(gene_pairs)


######################### 0 #########################
EPOCH: 0 | mean loss: 1.2109361597009607
EPOCH: 1 | mean loss: 0.7794945707192292
EPOCH: 2 | mean loss: 0.44420616771723775
EPOCH: 3 | mean loss: 0.19382872799965176
EPOCH: 4 | mean loss: 0.09639796287425466
EPOCH: 5 | mean loss: 0.07207387320797991
EPOCH: 6 | mean loss: 0.05496588399684107
EPOCH: 7 | mean loss: 0.01628144398706059
EPOCH: 8 | mean loss: 0.01870295245238152
EPOCH: 9 | mean loss: 0.020581580923454888
False    118
True     116
Name: count, dtype: int64
[0.9528703093528748, 0.006148911081254482, 0.7866168022155762, 0.9928221106529236, 0.2140091210603714, 0.05635440722107887, 0.67962646484375, 0.337398886680603, 0.003935405518859625, 9.955252608051524e-05]
ACCURACY: 0.5042735042735043
ROC-AUC: 0.5231207538899847
######################### 1 #########################
EPOCH: 0 | mean loss: 1.1756640849886715
EPOCH: 1 | mean loss: 0.6388628539201375
EPOCH: 2 | mean loss: 0.3478452257207922
EPOCH: 3 | mean loss: 0.18336476340

In [75]:
# resultaten behaalt met de TF_split data gesampled naar 1184 (train) en 234 (test)  exemplaren 
# TF_split bevat geen overlap van TFs tussen train en test 

# ######################### 0 #########################
# EPOCH: 0 | mean loss: 1.2908007921399296
# EPOCH: 1 | mean loss: 0.6350054664386285
# EPOCH: 2 | mean loss: 0.5653243403177004
# EPOCH: 3 | mean loss: 0.23829586572341016
# EPOCH: 4 | mean loss: 0.24090148906248646
# EPOCH: 5 | mean loss: 0.10325294804155223
# EPOCH: 6 | mean loss: 0.07690077762094301
# EPOCH: 7 | mean loss: 0.13383556069252459
# EPOCH: 8 | mean loss: 0.22592922789359315
# EPOCH: 9 | mean loss: 0.147826824248556
# True     164
# False     70
# Name: count, dtype: int64
# [0.9394792914390564, 0.9990804195404053, 0.0035369046963751316, 0.9997015595436096, 0.33432018756866455, 0.05005986616015434, 0.7579227089881897, 0.9999353885650635, 0.0007423709030263126, 5.2523041631502565e-06]
# ACCURACY: 0.7008547008547008
# ROC-AUC: 0.7676701091335236
# ######################### 1 #########################
# EPOCH: 0 | mean loss: 1.24633182544966
# EPOCH: 1 | mean loss: 0.6629845000602104
# EPOCH: 2 | mean loss: 0.43934707101937887
# EPOCH: 3 | mean loss: 0.19795471168047674
# EPOCH: 4 | mean loss: 0.13613222889300133
# EPOCH: 5 | mean loss: 0.10768534648987288
# EPOCH: 6 | mean loss: 0.053464610922477535
# EPOCH: 7 | mean loss: 0.044176091625657235
# EPOCH: 8 | mean loss: 0.03575679690677773
# EPOCH: 9 | mean loss: 0.012276560994733527
# True     169
# False     65
# Name: count, dtype: int64
# [0.9631931185722351, 0.9996137022972107, 0.00244502373971045, 0.9997251629829407, 0.14541125297546387, 0.09720917791128159, 0.01642964966595173, 0.9999603033065796, 0.00157522177323699, 0.0002764848177321255]
# ACCURACY: 0.7222222222222222
# ROC-AUC: 0.8068556361239289
# ######################### 2 #########################
# EPOCH: 0 | mean loss: 1.2692906751826003
# EPOCH: 1 | mean loss: 0.7438174317011962
# EPOCH: 2 | mean loss: 0.5039294205001883
# EPOCH: 3 | mean loss: 0.17602797688261881
# EPOCH: 4 | mean loss: 0.12668784526554314
# EPOCH: 5 | mean loss: 0.09324906972815862
# EPOCH: 6 | mean loss: 0.06231072721247737
# EPOCH: 7 | mean loss: 0.03297120572785167
# EPOCH: 8 | mean loss: 0.05729529462641458
# EPOCH: 9 | mean loss: 0.037602706025732124
# True     156
# False     78
# Name: count, dtype: int64
# [0.9190929532051086, 0.9997871518135071, 0.0047633107751607895, 0.9996066689491272, 0.9216578602790833, 0.0029121506959199905, 0.021412137895822525, 0.9999912977218628, 0.00881150458008051, 0.004602228756994009]
# ACCURACY: 0.6666666666666666
# ROC-AUC: 0.7523621182157768
# ######################### 3 #########################
# EPOCH: 0 | mean loss: 1.531091039245193
# EPOCH: 1 | mean loss: 0.6850417707417462
# EPOCH: 2 | mean loss: 0.642583000498849
# EPOCH: 3 | mean loss: 0.2826313728819022
# EPOCH: 4 | mean loss: 0.15684385624135266
# EPOCH: 5 | mean loss: 0.06154649002427185
# EPOCH: 6 | mean loss: 0.05047930873508842
# EPOCH: 7 | mean loss: 0.03809002039928895
# EPOCH: 8 | mean loss: 0.02093651532069654
# EPOCH: 9 | mean loss: 0.04159888464680596
# True     161
# False     73
# Name: count, dtype: int64
# [0.9382014274597168, 0.9977228045463562, 0.013858940452337265, 0.9942784309387207, 0.4416259825229645, 0.0014348439872264862, 0.18912199139595032, 0.9996391534805298, 0.015422893688082695, 0.0005259273457340896]
# ACCURACY: 0.688034188034188
# ROC-AUC: 0.7902292536438877
# ######################### 4 #########################
# EPOCH: 0 | mean loss: 1.7090727090835571
# EPOCH: 1 | mean loss: 0.6980272563728126
# EPOCH: 2 | mean loss: 0.5149867703785768
# EPOCH: 3 | mean loss: 0.23978507599315127
# EPOCH: 4 | mean loss: 0.09512259292642812
# EPOCH: 5 | mean loss: 0.09690062424822433
# EPOCH: 6 | mean loss: 0.0658368583189676
# EPOCH: 7 | mean loss: 0.03190262008433205
# EPOCH: 8 | mean loss: 0.05883896465388101
# EPOCH: 9 | mean loss: 0.030820609451070225
# True     156
# False     78
# Name: count, dtype: int64
# [0.9922797679901123, 0.9996652603149414, 0.005394676700234413, 0.9996961355209351, 0.23291900753974915, 0.036912981420755386, 0.6909781098365784, 0.999985933303833, 0.00011004201951436698, 0.0017559996340423822]
# ACCURACY: 0.6666666666666666
# ROC-AUC: 0.7544861935105839
# ######################### 5 #########################
# EPOCH: 0 | mean loss: 1.243936669182133
# EPOCH: 1 | mean loss: 0.7034460708901689
# EPOCH: 2 | mean loss: 0.4721286860672203
# EPOCH: 3 | mean loss: 0.24680726119392626
# EPOCH: 4 | mean loss: 0.1745469638750561
# EPOCH: 5 | mean loss: 0.06394961474438172
# EPOCH: 6 | mean loss: 0.05912919136509929
# EPOCH: 7 | mean loss: 0.04933584799889375
# EPOCH: 8 | mean loss: 0.03519315694181903
# EPOCH: 9 | mean loss: 0.03664025145140791
# True     153
# False     81
# Name: count, dtype: int64
# [0.9844501614570618, 0.9998881816864014, 0.013151480816304684, 0.9998871088027954, 0.29129573702812195, 0.01155152078717947, 0.4819895625114441, 0.9999434947967529, 0.04609158635139465, 0.00175341812428087]
# ACCURACY: 0.6538461538461539
# ROC-AUC: 0.7885446422031788
# ######################### 6 #########################
# EPOCH: 0 | mean loss: 1.4786208513620738
# EPOCH: 1 | mean loss: 0.6860372794641031
# EPOCH: 2 | mean loss: 0.5525172538048512
# EPOCH: 3 | mean loss: 0.28616513070222493
# EPOCH: 4 | mean loss: 0.09863472938210376
# EPOCH: 5 | mean loss: 0.061589109302322205
# EPOCH: 6 | mean loss: 0.0386095934207677
# EPOCH: 7 | mean loss: 0.034685767029185556
# EPOCH: 8 | mean loss: 0.02501185829247776
# EPOCH: 9 | mean loss: 0.019724133739096893
# True     149
# False     85
# Name: count, dtype: int64
# [0.7952884435653687, 0.9998565912246704, 0.11612247675657272, 0.9997285008430481, 0.02850249409675598, 0.02789178118109703, 0.8745959401130676, 0.9999635219573975, 0.011133053340017796, 0.0011400280054658651]
# ACCURACY: 0.6367521367521367
# ROC-AUC: 0.7913279132791328
# ######################### 7 #########################
# EPOCH: 0 | mean loss: 1.4906553400529396
# EPOCH: 1 | mean loss: 0.63396650391656
# EPOCH: 2 | mean loss: 0.43700904983121
# EPOCH: 3 | mean loss: 0.22561096057698532
# EPOCH: 4 | mean loss: 0.10121008945075241
# EPOCH: 5 | mean loss: 0.08070107821274448
# EPOCH: 6 | mean loss: 0.07067356038149572
# EPOCH: 7 | mean loss: 0.06243192310747061
# EPOCH: 8 | mean loss: 0.026465009564433147
# EPOCH: 9 | mean loss: 0.027069691766833735
# True     150
# False     84
# Name: count, dtype: int64
# [0.9583554863929749, 0.9998263716697693, 0.18948590755462646, 0.9991509914398193, 0.6270049810409546, 0.1048772931098938, 0.7594329118728638, 0.9999476671218872, 0.13030895590782166, 0.008553258143365383]
# ACCURACY: 0.6410256410256411
# ROC-AUC: 0.7649600820332527
# ######################### 8 #########################
# EPOCH: 0 | mean loss: 1.8349746642885982
# EPOCH: 1 | mean loss: 0.6972033204259099
# EPOCH: 2 | mean loss: 0.47386297664126836
# EPOCH: 3 | mean loss: 0.3349917041691574
# EPOCH: 4 | mean loss: 0.15873134112287615
# EPOCH: 5 | mean loss: 0.10846027623660662
# EPOCH: 6 | mean loss: 0.06928935080665995
# EPOCH: 7 | mean loss: 0.050549945215115674
# EPOCH: 8 | mean loss: 0.035352279478526746
# EPOCH: 9 | mean loss: 0.014051692143029131
# True     155
# False     79
# Name: count, dtype: int64
# [0.8915040493011475, 0.9997445940971375, 5.75973026570864e-05, 0.9999617338180542, 0.31810060143470764, 3.80532510462217e-05, 0.05569222941994667, 0.9999994039535522, 6.43112653051503e-05, 0.00012076117127435282]
# ACCURACY: 0.6623931623931624
# ROC-AUC: 0.7821724163187578
# ######################### 9 #########################
# EPOCH: 0 | mean loss: 1.4148215570965328
# EPOCH: 1 | mean loss: 0.5812335070726034
# EPOCH: 2 | mean loss: 0.3834337213554898
# EPOCH: 3 | mean loss: 0.1910669899520439
# EPOCH: 4 | mean loss: 0.15032583560693907
# EPOCH: 5 | mean loss: 0.0903506668216102
# EPOCH: 6 | mean loss: 0.04483242887953246
# EPOCH: 7 | mean loss: 0.051485495698462065
# EPOCH: 8 | mean loss: 0.04415184463456122
# EPOCH: 9 | mean loss: 0.0269895800219446
# True     142
# False     92
# Name: count, dtype: int64
# [0.9823852181434631, 0.9997959733009338, 0.07022245228290558, 0.9998014569282532, 0.3058154284954071, 0.5174615383148193, 0.7393720746040344, 0.9999474287033081, 0.00886604841798544, 0.003223792416974902]
# ACCURACY: 0.6068376068376068
# ROC-AUC: 0.7789496813887059


In [76]:
# resultaten behaalt met de TF_TG 1184 (train) en 234 (test)  exemplaren 
# TF_TG bevat geen overlap van TFs en Targets tussen Test en Train datasets 

# ######################### 0 #########################
# EPOCH: 0 | mean loss: 1.2109361597009607
# EPOCH: 1 | mean loss: 0.7794945707192292
# EPOCH: 2 | mean loss: 0.44420616771723775
# EPOCH: 3 | mean loss: 0.19382872799965176
# EPOCH: 4 | mean loss: 0.09639796287425466
# EPOCH: 5 | mean loss: 0.07207387320797991
# EPOCH: 6 | mean loss: 0.05496588399684107
# EPOCH: 7 | mean loss: 0.01628144398706059
# EPOCH: 8 | mean loss: 0.01870295245238152
# EPOCH: 9 | mean loss: 0.020581580923454888
# False    118
# True     116
# Name: count, dtype: int64
# [0.9528703093528748, 0.006148911081254482, 0.7866168022155762, 0.9928221106529236, 0.2140091210603714, 0.05635440722107887, 0.67962646484375, 0.337398886680603, 0.003935405518859625, 9.955252608051524e-05]
# ACCURACY: 0.5042735042735043
# ROC-AUC: 0.5231207538899847
# ######################### 1 #########################
# EPOCH: 0 | mean loss: 1.1756640849886715
# EPOCH: 1 | mean loss: 0.6388628539201375
# EPOCH: 2 | mean loss: 0.3478452257207922
# EPOCH: 3 | mean loss: 0.1833647634028583
# EPOCH: 4 | mean loss: 0.06552233275126766
# EPOCH: 5 | mean loss: 0.08646742404262359
# EPOCH: 6 | mean loss: 0.06325489568297525
# EPOCH: 7 | mean loss: 0.03236326202907524
# EPOCH: 8 | mean loss: 0.08079792897695222
# EPOCH: 9 | mean loss: 0.03985625061810621
# True     118
# False    116
# Name: count, dtype: int64
# [0.9990813732147217, 0.017169706523418427, 0.009132904931902885, 0.9987401366233826, 0.982964038848877, 0.9890333414077759, 0.9543372392654419, 0.01809493824839592, 0.0033102051820605993, 0.0007092221057973802]
# ACCURACY: 0.5042735042735043
# ROC-AUC: 0.5031777339469647
# ######################### 2 #########################
# EPOCH: 0 | mean loss: 1.3541251727052637
# EPOCH: 1 | mean loss: 0.7136319505201804
# EPOCH: 2 | mean loss: 0.5302969386448732
# EPOCH: 3 | mean loss: 0.2749827816276937
# EPOCH: 4 | mean loss: 0.13703857344021467
# EPOCH: 5 | mean loss: 0.026342848987582868
# EPOCH: 6 | mean loss: 0.08190031370424943
# EPOCH: 7 | mean loss: 0.06414108968818107
# EPOCH: 8 | mean loss: 0.01958414739016695
# EPOCH: 9 | mean loss: 0.0278857543716849
# True     122
# False    112
# Name: count, dtype: int64
# [0.9942219853401184, 0.033504385501146317, 0.9768292307853699, 0.9927991628646851, 0.6325933933258057, 0.05109052360057831, 0.8625434041023254, 0.9923384189605713, 0.002369594294577837, 0.0012298315996304154]
# ACCURACY: 0.5213675213675214
# ROC-AUC: 0.5378040762656147
# ######################### 3 #########################
# EPOCH: 0 | mean loss: 1.3051611040089581
# EPOCH: 1 | mean loss: 0.709514231295199
# EPOCH: 2 | mean loss: 0.5486470968336672
# EPOCH: 3 | mean loss: 0.33866160040771637
# EPOCH: 4 | mean loss: 0.18318546730773272
# EPOCH: 5 | mean loss: 0.05804813979193568
# EPOCH: 6 | mean loss: 0.07694865145871567
# EPOCH: 7 | mean loss: 0.021009883413884184
# EPOCH: 8 | mean loss: 0.01911602384171675
# EPOCH: 9 | mean loss: 0.009588006903524385
# False    118
# True     116
# Name: count, dtype: int64
# [0.9998028874397278, 0.0008875943603925407, 0.6515579223632812, 0.9994484782218933, 0.03441942110657692, 0.14081458747386932, 0.996987521648407, 0.09416083991527557, 0.0005284010549075902, 0.0011297495802864432]
# ACCURACY: 0.5042735042735043
# ROC-AUC: 0.4904667981591058
# ######################### 4 #########################
# EPOCH: 0 | mean loss: 48.48966426301647
# EPOCH: 1 | mean loss: 48.733108108108105
# EPOCH: 2 | mean loss: 48.733108108108105
# EPOCH: 3 | mean loss: 50.0
# EPOCH: 4 | mean loss: 50.0
# EPOCH: 5 | mean loss: 48.733108108108105
# EPOCH: 6 | mean loss: 51.266891891891895
# EPOCH: 7 | mean loss: 51.266891891891895
# EPOCH: 8 | mean loss: 51.266891891891895
# EPOCH: 9 | mean loss: 50.0
# True     117
# False    117
# Name: count, dtype: int64
# [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
# ACCURACY: 0.5
# ROC-AUC: 0.5
# ######################### 5 #########################
# EPOCH: 0 | mean loss: 1.4041840256871403
# EPOCH: 1 | mean loss: 0.664264617739497
# EPOCH: 2 | mean loss: 0.44125111159440633
# EPOCH: 3 | mean loss: 0.24447225749089913
# EPOCH: 4 | mean loss: 0.04826253250153181
# EPOCH: 5 | mean loss: 0.026704841334297246
# EPOCH: 6 | mean loss: 0.015419007323027865
# EPOCH: 7 | mean loss: 0.03238676176280589
# EPOCH: 8 | mean loss: 0.019525103871962306
# EPOCH: 9 | mean loss: 0.011646685734819117
# False    119
# True     115
# Name: count, dtype: int64
# [0.8294554948806763, 0.0011413096217438579, 0.0034623390529304743, 0.3127686381340027, 0.0013984538381919265, 0.026014553382992744, 0.06883230805397034, 0.0013600982492789626, 0.000623082451056689, 2.4991933969431557e-05]
# ACCURACY: 0.5085470085470085
# ROC-AUC: 0.4803126597998393
# ######################### 6 #########################
# EPOCH: 0 | mean loss: 1.5392201478416856
# EPOCH: 1 | mean loss: 0.7444435680234754
# EPOCH: 2 | mean loss: 0.6585202619836137
# EPOCH: 3 | mean loss: 0.3785963831721125
# EPOCH: 4 | mean loss: 0.1512887997905145
# EPOCH: 5 | mean loss: 0.11484863124536099
# EPOCH: 6 | mean loss: 0.0713640193933168
# EPOCH: 7 | mean loss: 0.03547692660720566
# EPOCH: 8 | mean loss: 0.023247347568178305
# EPOCH: 9 | mean loss: 0.030228577698958484
# True     119
# False    115
# Name: count, dtype: int64
# [0.46576446294784546, 0.021546002477407455, 0.02445993572473526, 0.901681661605835, 0.009477450512349606, 0.051973093301057816, 0.30650264024734497, 0.011257047764956951, 0.0018863874720409513, 0.0008080578409135342]
# ACCURACY: 0.5085470085470085
# ROC-AUC: 0.5285996055226825
# ######################### 7 #########################
# EPOCH: 0 | mean loss: 50.35789640207548
# EPOCH: 1 | mean loss: 48.733108108108105
# EPOCH: 2 | mean loss: 48.733108108108105
# EPOCH: 3 | mean loss: 51.266891891891895
# EPOCH: 4 | mean loss: 50.0
# EPOCH: 5 | mean loss: 50.0
# EPOCH: 6 | mean loss: 51.266891891891895
# EPOCH: 7 | mean loss: 50.0
# EPOCH: 8 | mean loss: 50.0
# EPOCH: 9 | mean loss: 48.733108108108105
# True     117
# False    117
# Name: count, dtype: int64
# [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
# ACCURACY: 0.5
# ROC-AUC: 0.5
# ######################### 8 #########################
# EPOCH: 0 | mean loss: 1.3306840899828318
# EPOCH: 1 | mean loss: 0.7748424539694915
# EPOCH: 2 | mean loss: 0.42830452484053533
# EPOCH: 3 | mean loss: 0.1747211800837839
# EPOCH: 4 | mean loss: 0.20333418300425685
# EPOCH: 5 | mean loss: 0.1749290537145744
# EPOCH: 6 | mean loss: 0.13704684861570698
# EPOCH: 7 | mean loss: 0.0271886609851798
# EPOCH: 8 | mean loss: 0.02840655899332912
# EPOCH: 9 | mean loss: 0.02608579768943978
# True     120
# False    114
# Name: count, dtype: int64
# [0.4406079053878784, 0.0023255343548953533, 0.4547938108444214, 0.7747179269790649, 0.04528110846877098, 0.05120786651968956, 0.2697964608669281, 1.5297471691155806e-05, 6.931503594387323e-05, 0.0001442210777895525]
# ACCURACY: 0.5128205128205128
# ROC-AUC: 0.531886916502301
# ######################### 9 #########################
# EPOCH: 0 | mean loss: 1.8354344347844254
# EPOCH: 1 | mean loss: 0.8024360215341723
# EPOCH: 2 | mean loss: 0.512218652947529
# EPOCH: 3 | mean loss: 0.19418683712295182
# EPOCH: 4 | mean loss: 0.15315136519839634
# EPOCH: 5 | mean loss: 0.19343554696722612
# EPOCH: 6 | mean loss: 0.13862389131965167
# EPOCH: 7 | mean loss: 0.12448498254289499
# EPOCH: 8 | mean loss: 0.05169623460922692
# EPOCH: 9 | mean loss: 0.050215281898409875
# True     125
# False    109
# Name: count, dtype: int64
# [0.8752012848854065, 0.001824546023271978, 0.10242830961942673, 0.9807192087173462, 0.0006746407598257065, 0.0005155044491402805, 0.0016608854057267308, 4.007817915407941e-05, 1.2145465916546527e-05, 3.190320512658218e-06]
# ACCURACY: 0.5341880341880342
# ROC-AUC: 0.5401417196288991


In [43]:
for key in range(10):
    df = pd.DataFrame(train_gene_pairs[0][key])
    print(df.head())
    break

       0      1
0  36415   4404
1   3852  36151
2  21701  30753
3  15512  26700
4  27494  31645


In [50]:
train_df = pd.DataFrame(train_gene_pairs[0][9])

In [62]:
print(train_df[0].isin(train_data.df[:,0]).value_counts())
print(train_df[1].isin(train_data.df[:,1]).value_counts())
print(train_df[0].isin(test_data.df[:,0]).value_counts())
print(train_df[1].isin(test_data.df[:,1]).value_counts())

0
True    1754
Name: count, dtype: int64
1
True    1754
Name: count, dtype: int64
0
False    1754
Name: count, dtype: int64
1
False    1312
True      442
Name: count, dtype: int64


In [75]:
test_df = pd.DataFrame(test_gene_pairs[0][9])

In [82]:
test_df[0].isin(train_df[0]).isin(train_df[1]).value_counts()

0
False    660
Name: count, dtype: int64

In [69]:
422/ (1312 + 442)

0.24059293044469784

In [66]:
test_df = pd.DataFrame(test_data.df)
test_df.loc[test_df[1] ==5]

Unnamed: 0,0,1,2


In [47]:
len(test_gene_pairs[0][9])

660

In [26]:
train_gene_pairs.keys()
for key in train_gene_pairs.keys():
    
    

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54])

In [25]:
train_go_input.keys()

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54])

In [17]:
t_go_in = np.matrix(train_go_input)

In [14]:
t_go_in = np.array(train_go_input)

In [15]:
t_go_in.shape

()

# TG_split 

In [409]:
from DataProcessor import DatasetProcessor

In [410]:
def dict_to_df(dictionary):
    d = dictionary
    row_list = []
    for key in d.keys():
        for val in d[key]:
            row_list.append([key, val])
    df = pd.DataFrame(row_list, columns=["TF", "Target"])
    return df 

In [411]:
dp = DatasetProcessor()
pos_dict = dp._label_data_to_dict(label_data)
neg_dict = dp._neg_dict_from_pos_dict(pos_dict, tg_list)
tfs_pos = dict_to_df(pos_dict)
tfs_neg = dict_to_df(neg_dict)

In [412]:
total_pos_ex = tfs_pos.shape[0]
train_max = total_pos_ex * 0.67
val_max = train_max * 0.1
train, val, test = [], [], []

pos_ex = list(tfs_pos.groupby("Target").count().TF.sort_values(ascending=False).to_dict().items())
np.random.shuffle(pos_ex)

train, val, test = [],[],[]
n_train, n_val, n_test = 0, 0, 0 
n_total = 0
for tg, unq_tf in pos_ex:
    if n_train < train_max:
        train.append(tg)
        n_train += unq_tf
    elif n_val < val_max:
        val.append(tg)
        n_val += unq_tf
    else:
        test.append(tg)
        n_test += unq_tf
    n_total +=unq_tf

# def hns(pos_df, neg_df, mask):
#     hnss = []
#     sel = neg_df[~neg_df.Target.isin(mask)]
#     for (i, val) in pos_df.iterrows():
#         tf, _, _ = val
#         data = sel.loc[sel.TF == tf].sample(n=1).to_numpy()[0]
#         hnss.append(data)
#     hnss = pd.DataFrame(hnss, columns=["TF", "Target"])
#     return hnss

    
df_train = tfs_pos.loc[tfs_pos.Target.isin(train)].copy()
df_val = tfs_pos.loc[tfs_pos.Target.isin(val)].copy()
df_test = tfs_pos.loc[tfs_pos.Target.isin(test)].copy()
df_train["Label"] = df_train.shape[0] * [1]
df_val["Label"] = df_val.shape[0] * [1]
df_test["Label"] = df_test.shape[0] * [1]

print(df_train.shape)
print(df_val.shape)
print(df_test.shape)


(809, 3)
(81, 3)
(317, 3)


In [413]:
from itertools import permutations

class NegativeSamples:
    def __init__(self, train, val, test, neg_pool):
        self.data = [train, val, test]
        self.neg = neg_pool
        self.used_targets = set()
        self.train = None 
        self.val = None
        self.test = None

    def distribute(self):
        permute = list(permutations(self.data, 3))
    

        datasets = [[], [], []]
        i = 0
        for x in range(6):
            if x % 2 !=0:
                # print(permute[x][0].shape, permute[x][1].shape, permute[x][2].shape)
                pos = permute[x][0]
                mask = set(permute[x][1].Target.to_list()).union(permute[x][2].Target.to_list()).union(self.used_targets)
                # print(len(mask))
                
                for (name, num) in pos.groupby("TF").count().Target.items():
                    neg_samples = list(set(self.neg[name]).difference(mask))
                    np.random.shuffle(neg_samples)
                    self.used_targets.update(neg_samples)
                    neg_pairs = [[name, x, 0] for x in neg_samples[:num]]
                    datasets[i].extend(neg_pairs)
                    # print(datasets)
                i += 1
        
        train, val, test = datasets
        self.train= pd.DataFrame(train, columns=["TF", "Target", "Label"])
        self.val= pd.DataFrame(val, columns=["TF", "Target", "Label"])
        self.test = pd.DataFrame(test, columns=["TF", "Target", "Label"])     

    
neg = NegativeSamples(df_train, df_val, df_test, neg_dict)

In [414]:
neg.distribute()

In [415]:
train_set = pd.concat((df_train, neg.train))
val_set = pd.concat((df_val, neg.val))
test_set = pd.concat((df_test, neg.test))

In [439]:
ath.columns

Index(['Unnamed: 0', 'TF ID', 'Target ID', 'TF alias', 'Target alias',
       'Activate/Repress', 'Reference', 'Note', 'TF index', 'Target index'],
      dtype='object')

In [451]:
df = test_set

count = 0

for i, (tf, tg, l) in df.iterrows():
    tmp_ath = ath.loc[(ath["TF index"] == tf) & (ath["Target index"] == tg)]
    if tmp_ath.shape[0] > 0:
        count +=1
    
count

317

In [433]:
train_set.loc[train_set["Target"].isin()]

Unnamed: 0,TF,Target,Label
0,5,456,1
1,5,3527,1
2,5,4626,1
4,5,12597,1
5,5,14733,1
...,...,...,...
804,36454,9483,0
805,36454,22552,0
806,36747,37236,0
807,36966,23503,0


In [416]:
train_set.Target.isin(test_set.Target).value_counts()

Target
False    1618
Name: count, dtype: int64

In [417]:
train_set.Target.isin(val_set.Target).value_counts()

Target
False    1618
Name: count, dtype: int64

In [418]:
test_set.Target.isin(val_set.Target).value_counts()

Target
False    634
Name: count, dtype: int64

In [422]:
from pathlib import Path

In [424]:
cwd = Path(os.getcwd())

In [425]:
train_set.to_csv(cwd.joinpath("shared_data/binary_labels/TG_split/Train_set.tsv"), sep="\t")
val_set.to_csv(cwd.joinpath("shared_data/binary_labels/TG_split/Val_set.tsv"), sep="\t")
test_set.to_csv(cwd.joinpath("shared_data/binary_labels/TG_split/Test_set.tsv"), sep="\t")

In [429]:
train_set.sample(n=1012).to_csv(cwd.joinpath("shared_data/binary_labels/BaseLine/TG_split/Train_set.tsv"), sep="\t")

In [70]:
tg_train, tg_temp = train_test_split(tfs_pos.Target.unique(), test_size=0.3)
tg_test, tg_val = train_test_split(tg_temp, test_size=0.2)

In [223]:
tg_train, tg_temp = train_test_split(tg_list, test_size=0.3)
tg_test, tg_val = train_test_split(tg_temp, test_size=0.2)

In [224]:
train_pos = tfs_pos.loc[tfs_pos["Target"].isin(tg_train)]
test_pos = tfs_pos.loc[tfs_pos["Target"].isin(tg_test)]
val_pos = tfs_pos.loc[tfs_pos["Target"].isin(tg_val)]


In [None]:
return_df = []
pos_df = train_pos
neg_df = tfs_neg
for name, frame in pos_df.groupby("TF").nunique().iterrows():
    tf, n_targets = name, frame.item()
    tmp_targets = neg_df.loc[(neg_df["TF"] == tf)]
    tmp_targets = tmp_targets.loc[tmp_targets.Target.isin(blacklist)]

NameError: name 'black_list' is not defined

In [234]:
def neg_samples(pos_df, neg_df, black_list):
    return_df = []
    for name, frame in pos_df.groupby("TF").nunique().iterrows():
        tf, n_targets = name, frame.item()
        tmp_targets = neg_df.loc[(neg_df["TF"] == tf)]
        tmp_targets = tmp_targets.drop(tmp_targets.Target.isin(black_list) == True, inplace=True)
        tmp_targets = tmp_targets.sample(n=n_targets)
        return_df.append(tmp_targets)
    stackd = pd.concat(return_df)
    return stackd

        


In [244]:
train_pos.where(train_pos["Target"] >10000).dropna()

Unnamed: 0,TF,Target
5,5.0,14733.0
6,5.0,15508.0
8,5.0,29211.0
9,5.0,36347.0
11,57.0,26018.0
...,...,...
1190,36400.0,35237.0
1192,36415.0,25352.0
1194,36415.0,28543.0
1195,36415.0,32212.0


# GO datasets

In [1]:
wo_iea = "/home/llan/Desktop/WUR/thesis2/GO/go_wo_iea_n.txt"
wo_comp = "/home/llan/Desktop/WUR/thesis2/GO/go_wo_comp_n.txt"

In [6]:
base_line = "/home/llan/Desktop/WUR/thesis2/shared_data/binary_labels/BaseLine"

from glob import glob
import numpy as np 

In [505]:
ms_idxs = pd.read_table("missing_idxs.tsv", index_col=0).to_numpy().squeeze().tolist()

In [556]:
paden = glob(base_line + "//*/*set.tsv")

for pad in paden:
    pad = Path(pad)
    df = pd.read_table(pad, index_col=0)
    print(df.shape)
    subset = df.loc[~((df.TF.isin(ms_idxs)) | (df.Target.isin(ms_idxs)))].copy()
    counts = subset.Label.value_counts(normalize=True)
    print(counts)


    parts = list(pad.parts)

    parts = [part if part != "BaseLine" else "BaseLine_mod_GO" for part in parts]
    pad = Path(*parts)
    pad.parents[0].mkdir(exist_ok=True)
    subset.to_csv(pad, sep="\t")

(1012, 3)
Label
1    0.520439
0    0.479561
Name: proportion, dtype: float64
(824, 3)
Label
0    0.537332
1    0.462668
Name: proportion, dtype: float64
(166, 3)
Label
1    0.5
0    0.5
Name: proportion, dtype: float64
(1012, 3)
Label
1    0.537657
0    0.462343
Name: proportion, dtype: float64
(634, 3)
Label
1    0.504
0    0.496
Name: proportion, dtype: float64
(162, 3)
Label
1    0.5
0    0.5
Name: proportion, dtype: float64
(1012, 3)
Label
1    0.503992
0    0.496008
Name: proportion, dtype: float64
(238, 3)
Label
1    0.504274
0    0.495726
Name: proportion, dtype: float64
(238, 3)
Label
1    0.504274
0    0.495726
Name: proportion, dtype: float64
(1012, 3)
Label
1    0.537155
0    0.462845
Name: proportion, dtype: float64
(620, 3)
Label
1    0.529514
0    0.470486
Name: proportion, dtype: float64
(174, 3)
Label
1    0.5375
0    0.4625
Name: proportion, dtype: float64


In [14]:
def to_tensor(txt):
    data = dict()
    n_col = 0
    n_row = 37336
    min_val = 100
    with open(txt, "r") as f:
        for line in f:
            idx, val = line.strip().split("\t")
            val = val.split(" ")
            idx = int(idx)
            val = np.array(val).astype(int)
            n_col = val.max() if val.max() > n_col else n_col
            min_val = val.min() if val.min() < min_val else min_val
            data[idx] = val

    matrix = torch.zeros((n_row, n_col+1))
    for key, val in data.items():
        matrix[key] = torch.nn.functional.one_hot(torch.tensor(val), num_classes=n_col+1).sum(axis=0)
    return matrix


In [15]:
wo_comp_t = to_tensor(wo_comp)

In [17]:
wo_iea_t = to_tensor(wo_iea)

In [None]:
wo_comp

In [18]:
torch.save(wo_comp_t, "/home/llan/Desktop/WUR/thesis2/GO/go_wo_comp.pt")
torch.save(wo_iea_t, "/home/llan/Desktop/WUR/thesis2/GO/go_wo_iea.pt")