In [4]:
import pandas as pd 
import torch 
import os 
from collections import defaultdict


In [5]:
os.chdir("/home/llan/Desktop/WUR/thesis2")


In [6]:
col_names = [
    "locus_name",
    "TAIR_accession",
    "object_name",
    "relationship_type",
    "GO_term",
    "GO_ID",
    "TAIR_Keyword_ID",
    "Aspect",
    "GOslim_term",
    "Evidence_code",
    "Evidence_description",
    "Evidence_with",
    "Reference",
    "Annotator",
    "Date_annotated",
]

goslim = pd.read_csv("GO/ATH_GO_GOSLIM.txt", sep="\t", skiprows=5, names=col_names)
genes = pd.read_csv("EXP/expression.tsv", sep="\t", header=0, usecols=[0])["Sample"].tolist()


In [7]:
import pandas as pd

# Data for the DataFrame
data = {
    "Code": ["ISS", "IPI", "ISM", "IEA", "IMP", "IBA", "ND", "IDA", "IGI", "TAS", "HDA", "IEP", "IC", "HEP", "NAS", "RCA"],
    "Category": [
        "Computational", "Experimental", "Computational", "Automatically-Generated", "Experimental",
        "Computational", "Curatorial", "Experimental", "Experimental", "Author Statement",
        "High-Throughput", "Experimental", "Curatorial", "High-Throughput", "Author Statement", "Computational"
    ],
    "Description": [
        "Inferred from sequence or structural similarity.",
        "Inferred from physical interaction (e.g., protein-protein interactions).",
        "Inferred from sequence model (e.g., HMM profiles).",
        "Inferred from electronic annotation (automatically generated, not human-reviewed).",
        "Inferred from mutant phenotype.",
        "Inferred from biological aspect of ancestor.",
        "No biological data available.",
        "Inferred from direct assay (e.g., enzyme assays, microscopy).",
        "Inferred from genetic interaction.",
        "Traceable author statement (from a published paper with a traceable source).",
        "Inferred from high-throughput direct assay.",
        "Inferred from expression pattern.",
        "Inferred by a curator.",
        "Inferred from high-throughput expression pattern.",
        "Non-traceable author statement (from a paper without a traceable source).",
        "Inferred from reviewed computational analysis."
    ]
}

# Create the DataFrame
evidence = pd.DataFrame(data)



In [11]:
print(evidence.set_index("Code").to_latex(float_format="{:0.3f}".format))

\begin{tabular}{lll}
\toprule
 & Category & Description \\
Code &  &  \\
\midrule
ISS & Computational & Inferred from sequence or structural similarity. \\
IPI & Experimental & Inferred from physical interaction (e.g., protein-protein interactions). \\
ISM & Computational & Inferred from sequence model (e.g., HMM profiles). \\
IEA & Automatically-Generated & Inferred from electronic annotation (automatically generated, not human-reviewed). \\
IMP & Experimental & Inferred from mutant phenotype. \\
IBA & Computational & Inferred from biological aspect of ancestor. \\
ND & Curatorial & No biological data available. \\
IDA & Experimental & Inferred from direct assay (e.g., enzyme assays, microscopy). \\
IGI & Experimental & Inferred from genetic interaction. \\
TAS & Author Statement & Traceable author statement (from a published paper with a traceable source). \\
HDA & High-Throughput & Inferred from high-throughput direct assay. \\
IEP & Experimental & Inferred from expression pattern. 

In [13]:
print("unique GO terms", goslim["GO_term"].unique().shape[0])
print("unique GO ids", goslim["GO_ID"].unique().shape[0])


unique GO terms 7265
unique GO ids 7265


In [14]:
goslim.shape

(337090, 15)

In [15]:
goslim = goslim.loc[goslim["locus_name"].isin(genes)]
print("shape", goslim.shape)


shape (328727, 15)


In [16]:
evidence = evidence.set_index("Code").join(goslim.Evidence_code.value_counts())

In [19]:
evidence

Unnamed: 0_level_0,Category,Description,count
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ISS,Computational,Inferred from sequence or structural similarity.,10887
IPI,Experimental,"Inferred from physical interaction (e.g., prot...",27339
ISM,Computational,"Inferred from sequence model (e.g., HMM profil...",37754
IEA,Automatically-Generated,Inferred from electronic annotation (automatic...,38876
IMP,Experimental,Inferred from mutant phenotype.,40065
IBA,Computational,Inferred from biological aspect of ancestor.,60897
ND,Curatorial,No biological data available.,20874
IDA,Experimental,"Inferred from direct assay (e.g., enzyme assay...",32390
IGI,Experimental,Inferred from genetic interaction.,8999
TAS,Author Statement,Traceable author statement (from a published p...,12744


In [21]:
print(evidence.to_latex(float_format="{:0.3f}".format))

\begin{tabular}{lllr}
\toprule
 & Category & Description & count \\
Code &  &  &  \\
\midrule
ISS & Computational & Inferred from sequence or structural similarity. & 10887 \\
IPI & Experimental & Inferred from physical interaction (e.g., protein-protein interactions). & 27339 \\
ISM & Computational & Inferred from sequence model (e.g., HMM profiles). & 37754 \\
IEA & Automatically-Generated & Inferred from electronic annotation (automatically generated, not human-reviewed). & 38876 \\
IMP & Experimental & Inferred from mutant phenotype. & 40065 \\
IBA & Computational & Inferred from biological aspect of ancestor. & 60897 \\
ND & Curatorial & No biological data available. & 20874 \\
IDA & Experimental & Inferred from direct assay (e.g., enzyme assays, microscopy). & 32390 \\
IGI & Experimental & Inferred from genetic interaction. & 8999 \\
TAS & Author Statement & Traceable author statement (from a published paper with a traceable source). & 12744 \\
HDA & High-Throughput & Inferred fr

In [None]:
evidence.groupby("Category")["count"].sum().sort_values(ascending=False).to

Category
Experimental               117971
Computational              109697
Automatically-Generated     38876
High-Throughput             26897
Curatorial                  21120
Author Statement            14166
Name: count, dtype: int64

In [16]:
goterms = goslim["GO_ID"].unique().tolist()
gotoi = {x:y for x, y in zip(goterms, range(len(goterms)))}


In [17]:
gtoi = {x:y for x, y in zip(genes, range(len(genes)))}

In [13]:
# goslime grouped by genes for extraction of GO terms per gene 
def go_dict(goslim):
    gene_go = {} 
    grouped_go = goslim.groupby(["locus_name"]).GO_ID
    print(len(grouped_go))
    
    for (gene,) , go in grouped_go:
        gene_go[gene] = go.unique().tolist()
    return gene_go


In [1]:
dfs[0]

NameError: name 'dfs' is not defined

In [97]:
goslim_i = pd.DataFrame()

In [103]:
goslim_i["locus_name"] = goslim["locus_name"].apply(lambda x: gtoi[x])
goslim_i["GO_ID"] = goslim["GO_ID"].apply(lambda x: gotoi[x])
goslim_i["Evidence_code"] = goslim["Evidence_code"]

In [114]:
computational_annotation = evidence.loc[evidence["Category"] == "Computational"].index.values

In [129]:
wo_iea = goslim_i.loc[goslim_i["Evidence_code"] != "IEA"]
wo_comp = wo_iea.loc[~wo_iea["Evidence_code"].isin(computational_annotation)]

In [136]:
wo_iea_dict = go_dict(wo_iea)

28787


In [137]:
wo_comp_dict = go_dict(wo_comp)

25386


In [138]:
all_goslim = go_dict(goslim_i)

28802


In [148]:
def to_txt(go_dict, path):
    path = "GO/" + path + ".txt"
    
    with open(path, "w") as f:
        for gene, go in go_dict.items():
            line = str(gene) + "\t" + " ".join(list(map(str, go))) + "\n"
            f.write(line)


In [149]:
to_txt(wo_iea_dict, "go_wo_iea")

In [150]:
to_txt(wo_comp_dict, "go_wo_comp")

In [145]:
goslim = pd.read_table("GO/TairID_GO.txt", sep="\t", header=None, index_col=0)


In [11]:
go_dict = {}

for idx, (gene, go) in goslim.iterrows():
    go_dict[gene] = go.split(" ")


In [12]:
for key, val in go_dict.items():
    go_dict[key] = [gotoi[x] for x in val]


In [13]:
test = torch.nn.functional.one_hot(
   torch.Tensor(go_dict["AT1G01010"]).long(),
   num_classes=len(gotoi.keys())
)


In [55]:
onehots = []

for key in go_dict.keys():
    idx = torch.Tensor(go_dict[key]).long()
    classes = len(gotoi.keys())
    one_hot_vec = torch.nn.functional.one_hot(idx, classes).sum(dim=0)
    onehots.append(one_hot_vec)

In [14]:
# ik heb one hots
# dus ik kan voorspellen 
# probleem er zijn ~37k genen en maar voor 28k genen met GO terms 
# dus de index voor ~37k genen behoud ik, maar ik heb wel nieuwe train, val, test split nodig zodat ik niet een gen zonder embedding 

In [33]:
test = goslim.iloc[:,0].str.split().to_dict()

In [49]:
import itertools
len(itertools.chain.from_iterable(test.values()))

TypeError: object of type 'itertools.chain' has no len()

In [51]:
gp = [1, 0]

In [None]:
gene1, gene2 = gp 
dene1

[1, 0]

In [58]:
set(itertools.chain.from_iterable(test.values()))

{'GO:0008446',
 'GO:0047341',
 'GO:0030915',
 'GO:0006423',
 'GO:0071025',
 'GO:0032147',
 'GO:0070876',
 'GO:0018061',
 'GO:0004514',
 'GO:1904680',
 'GO:0006003',
 'GO:0043325',
 'GO:0035102',
 'GO:0007584',
 'GO:0098609',
 'GO:0015149',
 'GO:0045430',
 'GO:0009875',
 'GO:0004356',
 'GO:0052656',
 'GO:1902265',
 'GO:0033743',
 'GO:0010249',
 'GO:0072722',
 'GO:0010045',
 'GO:0010087',
 'GO:0019210',
 'GO:0080184',
 'GO:0004029',
 'GO:0000400',
 'GO:0034219',
 'GO:0071558',
 'GO:0103045',
 'GO:0006625',
 'GO:0048527',
 'GO:0009715',
 'GO:0000455',
 'GO:1902369',
 'GO:0007140',
 'GO:0019217',
 'GO:0080011',
 'GO:0003746',
 'GO:0051724',
 'GO:0005667',
 'GO:0042565',
 'GO:0070971',
 'GO:0010168',
 'GO:0003925',
 'GO:0070330',
 'GO:0106049',
 'GO:0051640',
 'GO:0003879',
 'GO:0017178',
 'GO:0030692',
 'GO:0070012',
 'GO:0033862',
 'GO:0019900',
 'GO:0000476',
 'GO:0007130',
 'GO:0004034',
 'GO:0033947',
 'GO:0005739',
 'GO:0048040',
 'GO:0046244',
 'GO:0019222',
 'GO:0050792',
 'GO:00430

In [63]:
pd.Series(list(set(itertools.chain.from_iterable(test.values())))).to_csv("GO_list.txt", sep="\t", index=False, header=False)

In [69]:
pd.read_table("GO/GO_list.txt", header=None).iloc[:, 0].tolist()

['GO:0008446',
 'GO:0047341',
 'GO:0030915',
 'GO:0006423',
 'GO:0071025',
 'GO:0032147',
 'GO:0070876',
 'GO:0018061',
 'GO:0004514',
 'GO:1904680',
 'GO:0006003',
 'GO:0043325',
 'GO:0035102',
 'GO:0007584',
 'GO:0098609',
 'GO:0015149',
 'GO:0045430',
 'GO:0009875',
 'GO:0004356',
 'GO:0052656',
 'GO:1902265',
 'GO:0033743',
 'GO:0010249',
 'GO:0072722',
 'GO:0010045',
 'GO:0010087',
 'GO:0019210',
 'GO:0080184',
 'GO:0004029',
 'GO:0000400',
 'GO:0034219',
 'GO:0071558',
 'GO:0103045',
 'GO:0006625',
 'GO:0048527',
 'GO:0009715',
 'GO:0000455',
 'GO:1902369',
 'GO:0007140',
 'GO:0019217',
 'GO:0080011',
 'GO:0003746',
 'GO:0051724',
 'GO:0005667',
 'GO:0042565',
 'GO:0070971',
 'GO:0010168',
 'GO:0003925',
 'GO:0070330',
 'GO:0106049',
 'GO:0051640',
 'GO:0003879',
 'GO:0017178',
 'GO:0030692',
 'GO:0070012',
 'GO:0033862',
 'GO:0019900',
 'GO:0000476',
 'GO:0007130',
 'GO:0004034',
 'GO:0033947',
 'GO:0005739',
 'GO:0048040',
 'GO:0046244',
 'GO:0019222',
 'GO:0050792',
 'GO:00430

In [74]:
go_dict2 = {gtoi[x]:go_dict[x] for x in go_dict.keys()}

In [82]:
with open("GO/GO_data.txt", "w") as f:
    for key, val in go_dict2.items():
        line = str(key) + "\t" + " ".join([str(x) for x in val]) + "\n"
        f.write(line)

In [86]:
os.chdir("Code")

In [1]:
from Data import GoDataSet

In [3]:
go_data = "../GO/GO_data.txt"
test_data = "/home/llan/Desktop/WUR/thesis2/GO/dataset/PN_1_1/Test_set.tsv"

data = GoDataSet(test_data, go_data)

In [4]:
import pandas as pd 
test = pd.read_table(go_data, names=["idx", "go"], dtype={0:int}, index_col=0)

In [5]:
data.dataset[0]

array([   5, 3527])

In [6]:
data.go_data[0]

[0, 1, 2, 3]

In [13]:
data[0][1].shape

torch.Size([14492])

In [8]:
from torch.utils.data import DataLoader

In [9]:
load =  DataLoader(data)

In [10]:
for i, x in enumerate(load):
    x

In [4]:
import os 
os.getcwd()

'/home/llan/Desktop/WUR/thesis2'

In [5]:
tf = pd.read_table("/home/llan/Desktop/WUR/thesis2/LABELS/TF_list.tsv", header=0)
tg = pd.read_table("/home/llan/Desktop/WUR/thesis2/LABELS/TG_list.tsv", header=0)

In [6]:
genes_go = goslim.locus_name.unique().tolist()

In [7]:
tf.ID.isin(genes_go).value_counts()

ID
True    1717
Name: count, dtype: int64

In [8]:
tg.ID.isin(genes_go).value_counts()

ID
True     27085
False     8534
Name: count, dtype: int64

In [10]:
tg[tg["ID"].isin(genes_go)].to_csv("GO/dataset/TG_list.tsv", sep="\t", index=False)