* Символы генов **done**
* Степени генов в интерактоме **done**
* Длины генов **done**
* Generic GO slim теги **done**
* Экспрессия в биполярном нейроне
* Экспрессия в B-клетке

In [1]:
from collections import Counter

import numpy as np
import pandas as pd

In [2]:
# Load the tab-separated base table; later cells merge extra columns onto it
# by its "Symbol" column.
lol_df = pd.read_csv("data/uniprot.tab", sep="\t")

## Get gene symbols

In [186]:
from mygene import MyGeneInfo

In [211]:
def get_symbols(ensembl_ids, scopes):
    """Look up gene symbols for a Series of (possibly versioned) Ensembl IDs.

    Args:
        ensembl_ids: pandas Series of ID strings. Everything from the first
            "." onwards (the version suffix) is stripped before querying.
        scopes: Passed unchanged to ``MyGeneInfo.querymany`` as ``scopes``.

    Returns:
        pandas Series of symbols indexed by the index labels of
        ``ensembl_ids``. There is one entry per returned hit and input row;
        hits that carry no "symbol" field yield NaN. A label can occur more
        than once when the service returns several hits for one query.
        An empty response yields an empty Series.
    """
    mg = MyGeneInfo()
    stripped_ids = ensembl_ids.apply(lambda ens_id: ens_id.split(".")[0])

    # Map each stripped ID to every input label that produced it, so that
    # two versions of the same gene do not break the reverse lookup.
    labels_by_id = {}
    for label, stripped in stripped_ids.items():
        labels_by_id.setdefault(stripped, []).append(label)

    # Query each distinct ID once, in first-seen order, as a plain list.
    resp = mg.querymany(list(labels_by_id), scopes=scopes)

    symbols = []
    index = []
    for hit in resp:
        symbol = hit.get("symbol", np.nan)
        for label in labels_by_id[hit["query"]]:
            symbols.append(symbol)
            index.append(label)

    if not symbols:
        # Nothing came back: return an explicitly typed empty Series.
        return pd.Series([], index=[], dtype=object)
    return pd.Series(data=symbols, index=index)

## Get degrees

In [3]:
import igraph as ig

In [4]:
# Load the interactome graph from a pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load a file
# that comes from a trusted source.
iome = ig.Graph.Read_Pickle("data/graph.pkl")

In [5]:
def get_symb_closure(symbs):
    """Return a predicate telling whether a vertex's symbol is in ``symbs``.

    The returned callable takes one vertex-like object, reads its "symbol"
    item and returns the result of the membership test against ``symbs``.
    """
    def has_listed_symbol(vertex):
        symbol = vertex["symbol"]
        return symbol in symbs

    return has_listed_symbol
# Predicate that accepts vertices whose "symbol" occurs in lol_df["Symbol"];
# the symbols are put in a set so each membership test is a hash lookup.
get_by_symbs = get_symb_closure(set(lol_df["Symbol"].values))

In [6]:
# Keep only the graph vertices accepted by the symbol predicate.
genes = iome.vs.select(get_by_symbs)

In [8]:
def get_degree_df(genes):
    """Tabulate symbol and degree for a vertex sequence.

    ``genes`` must support ``genes["symbol"]`` and ``genes.degree()``.
    Returns a DataFrame with the columns "Symbol" and "Degree", in that
    order, one row per vertex.
    """
    columns = (
        ("Symbol", genes["symbol"]),
        ("Degree", genes.degree()),
    )
    table = pd.DataFrame()
    for label, values in columns:
        table[label] = values
    return table

In [9]:
# Symbol/degree table for the selected interactome vertices.
degree_df = get_degree_df(genes)

In [14]:
# Attach the "Degree" column to the main table, matching rows on "Symbol".
# The merge uses pandas' default inner join, so rows whose symbol is absent
# from degree_df are dropped.
lol_df = lol_df.merge(degree_df, on="Symbol")

## Get GO SLIMs

In [25]:
import re

In [21]:
# Load the header-less, tab-separated GO annotation table and name its two
# columns. Later cells split "IDs" on ";" to get the individual GO IDs.
go_df = pd.read_csv("data/total_go.tsv", sep="\t", 
                    header=None)
go_df.columns = ["Symbol", "IDs"]

In [69]:
# Look-behind patterns used to parse the OBO file: each one captures the
# value that follows the corresponding "tag: " prefix on a line.
# id_re only accepts values of the form "GO:<digits>".
id_re = r"(?<=id: )GO:[0-9]+"
name_re = r"(?<=name: ).+"
namespace_re = r"(?<=namespace: ).+"

In [74]:
# Parse the GO slim OBO file into three parallel lists, one entry per term.
# A term is recorded only when a "[Term]" header is followed, on consecutive
# lines, by its "id:", "name:" and "namespace:" tags. The ID and name are held
# back until the namespace is seen, so the three lists always have the same
# length even if a stanza is incomplete.
IDs = list()
names = list()
namespaces = list()
prev = None  # what the previous line was: None, "Term", "ID" or "name"
pending_ID = None
pending_name = None
# OBO files are UTF-8; state it instead of relying on the locale default.
with open("data/goslim_generic.obo.txt", "r", encoding="utf-8") as goslim_file:
    for line in goslim_file:
        if line == "[Term]\n":
            prev = "Term"
            continue
        if prev == "Term":
            ID_match = re.search(id_re, line)
            if ID_match:
                pending_ID = ID_match.group()
                prev = "ID"
                continue
        elif prev == "ID":
            name_match = re.search(name_re, line)
            if name_match:
                pending_name = name_match.group()
                prev = "name"
                continue
        elif prev == "name":
            namespace_match = re.search(namespace_re, line)
            if namespace_match:
                # The stanza is complete: commit all three fields together.
                IDs.append(pending_ID)
                names.append(pending_name)
                namespaces.append(namespace_match.group())
        # Any other line (or a completed term) ends the current sequence.
        prev = None

In [163]:
# Collect the parsed terms into a table, then keep only the rows whose
# namespace is "molecular_function".
# NOTE(review): the terms selected further down ("cell adhesion", ...) are
# shown with biological-process style names - confirm this filter is the
# intended one.
go_slim_df = pd.DataFrame().assign(ID=IDs, Name=names, Namespace=namespaces)
go_slim_df = go_slim_df.loc[go_slim_df["Namespace"].eq("molecular_function")]

In [164]:
# Display how many terms remain per namespace after the filter.
Counter(go_slim_df["Namespace"])

Counter({'molecular_function': 43})

In [127]:
# Map every parsed GO slim ID to its term name.
# Built from the raw IDs/names lists rather than from go_slim_df: that frame
# has been narrowed to "molecular_function", so it no longer holds the terms
# picked for chosen_mapping, and the selection would silently come out empty.
id_name_mapping = pd.Series(data=names, index=IDs)

In [157]:
# Restrict the ID -> name mapping to the three terms of interest.
chosen_mapping = id_name_mapping[
    id_name_mapping.isin([
        "mitochondrion organization",
        "cell adhesion",
        "signal transduction",
    ])
]

In [158]:
# Display the selected terms to check that all three were found.
chosen_mapping

GO:0007005    mitochondrion organization
GO:0007155                 cell adhesion
GO:0007165           signal transduction
dtype: object

In [159]:
def get_slim_closure(id_name_mapping):
    """Return a function that encodes a ";"-separated ID string as 0/1 flags.

    The returned callable splits its argument on ";" and returns a list with
    one int per entry of ``id_name_mapping.index`` (in index order): 1 if
    that ID occurs among the split pieces, otherwise 0.
    """
    def get_slim(id_list_str):
        annotated = set(id_list_str.split(";"))
        return [1 if slim_id in annotated else 0
                for slim_id in id_name_mapping.index]

    return get_slim
# Encoder producing one 0/1 flag per term in chosen_mapping.
get_slim = get_slim_closure(chosen_mapping)

In [160]:
# One list of 0/1 flags per row of go_df, derived from its "IDs" string.
slims = go_df["IDs"].apply(get_slim)

In [161]:
# Stack the per-row flag lists into a 2-D array (rows: go_df rows,
# columns: terms of chosen_mapping in index order).
sparse_bioproc = np.array(slims.tolist())

In [170]:
# Display how many rows carry 0, 1, 2, ... of the selected terms.
Counter(sparse_bioproc.sum(axis=1))

Counter({0: 18527, 1: 1633, 2: 66})

In [162]:
# Display the number of flagged rows for each selected term (column sums).
sparse_bioproc.sum(axis=0)

array([  89,  465, 1211])

In [167]:
# Store each flag column under its term name. The column positions follow
# the order of chosen_mapping's index.
go_df["mitochondrion organization"] = sparse_bioproc[:, 0]
go_df["cell adhesion"] = sparse_bioproc[:, 1]
go_df["signal transduction"] = sparse_bioproc[:, 2]

In [176]:
# Attach the three GO slim flag columns to the main table, matching rows on
# "Symbol" (default inner join: unmatched symbols are dropped).
lol_df = lol_df.merge(
    go_df[["Symbol", "mitochondrion organization", "cell adhesion", "signal transduction"]],
    on="Symbol",
)

## Get expression

In [235]:
# Load the tab-separated expression table for the bipolar neuron sample.
# Later cells read its "gene_id" and "FPKM" columns.
neuro_expr = pd.read_csv("data/bipolar_neuron_expr.tsv", sep="\t")

In [182]:
# Load the tab-separated expression table for the B-cell sample.
# Later cells read its "FPKM" column.
b_expr = pd.read_csv("data/B_cell_expr.tsv", sep="\t")

In [218]:
# Resolve gene symbols for the neuron table's gene IDs; the result is indexed
# by neuro_expr's row labels.
neuro_symb = get_symbols(neuro_expr["gene_id"], scopes=["ensembl.gene"])

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-20000...done.
querying 20001-21000...done.
querying 21001-22000...done.
querying 22001-23000...done.
querying 23001-24000...done.
querying 24001-25000...done.
querying 25001-26000...done.
querying 26001-27000...done.
querying 27001-28000...done.
querying 28001-29000...done.
querying 29001-30000...done.
querying 30001-31000...done.
querying 31001-32000...done.
querying 32001-33000...done.
querying 33001-34000...done.
querying 34001-35000...done.
queryin

In [231]:
# Keep only the first symbol for every row label that occurs more than once,
# so the Series can be aligned to a table by index.
neuro_symb = neuro_symb[~neuro_symb.index.duplicated(keep="first")]

In [238]:
# Add the symbols as a column; pandas aligns them by index label, so rows
# without an entry in neuro_symb get NaN.
neuro_expr["Symbol"] = neuro_symb

In [239]:
# Add the symbols as a column, aligned by index label.
# NOTE(review): these symbols were resolved from neuro_expr["gene_id"], not
# from b_expr. This is only correct if both tables list the same genes in the
# same row order - confirm, or resolve symbols for b_expr separately.
b_expr["Symbol"] = neuro_symb

In [241]:
# Drop, in place, every row that has a missing value in any column
# (this includes rows for which no symbol was assigned).
b_expr.dropna(inplace=True)

In [243]:
# Drop, in place, every row that has a missing value in any column
# (this includes rows for which no symbol was assigned).
neuro_expr.dropna(inplace=True)

In [248]:
# Bring in the B-cell FPKM values, matching rows on "Symbol" (inner join),
# and copy them into a descriptively named column. The generic "FPKM"
# column is left in place by this cell.
lol_df = lol_df.merge(b_expr[["Symbol", "FPKM"]], on="Symbol")
lol_df["B cell FPKM"] = lol_df["FPKM"]

In [256]:
# Bring in the neuron FPKM values, matching rows on "Symbol" (inner join).
# A generic "FPKM" column left over from an earlier merge would make pandas
# emit "FPKM_x"/"FPKM_y" and break the lookup below, so remove it first
# (a no-op when the column is not there).
lol_df = lol_df.drop(columns="FPKM", errors="ignore")
lol_df = pd.merge(lol_df, neuro_expr[["Symbol", "FPKM"]], on="Symbol")
lol_df["Neuron FPKM"] = lol_df["FPKM"]
# The generic column has served its purpose; keep only the named copy.
del lol_df["FPKM"]

In [254]:
# Remove the generic "FPKM" column if it is still present. Tolerating its
# absence lets the notebook run top to bottom regardless of which cell
# already removed it.
lol_df = lol_df.drop(columns="FPKM", errors="ignore")

In [260]:
# Write the assembled table as a tab-separated file without the row index.
lol_df.to_csv("data/primary.tsv", sep="\t", index=False)