# Building 3 types of disease features


In [33]:
import numpy as np
import pandas as pd
import re
import torch
import math
from tqdm import tqdm
from sklearn import preprocessing
import multiprocessing as mp

In [18]:
# Calculate Gaussian similarity
def kernel_(data):
    """Return the Gaussian kernel similarity between the columns of ``data``.

    For two columns ``a_i`` and ``a_j`` the entry is
    ``exp(-gamma * ||a_i - a_j||^2)`` with
    ``gamma = 1 / mean_j(||a_j||^2)`` (i.e. r_m' is set to 1).

    Args:
        data: numeric ``pandas.DataFrame``; each column is one item and each
            row one feature.

    Returns:
        A square float64 ``pandas.DataFrame`` whose index and columns are
        both ``data.columns``.  When every column is all-zero (or there are
        no rows) all columns are identical, so the result is all ones rather
        than NaN.  With no columns an empty frame is returned.
    """
    labels = data.columns
    nd = len(labels)
    if nd == 0:
        return pd.DataFrame(np.zeros((0, 0)), index=labels, columns=labels)

    # Convert once through a contiguous float32 array; building a tensor from
    # a list of row arrays is very slow for large matrices.
    A = torch.from_numpy(np.ascontiguousarray(data.values, dtype=np.float32))

    # Squared Euclidean norm of every column, and their mean (1 / gamma).
    sq_norms = (A * A).sum(dim=0)
    mean_sq_norm = sq_norms.mean().item()
    if mean_sq_norm == 0:
        # All columns are zero vectors: every pairwise distance is 0.
        return pd.DataFrame(np.ones((nd, nd)), index=labels, columns=labels)
    gamma = 1.0 / mean_sq_norm

    # ||a_i - a_j||^2 = ||a_i||^2 + ||a_j||^2 - 2 * a_i . a_j, for all pairs
    # from a single matrix product.
    gram = torch.mm(A.t(), A)
    sq_dist = sq_norms[:, None] + sq_norms[None, :] - 2.0 * gram
    # Rounding can leave tiny negative values; a column is at distance 0
    # from itself exactly.
    sq_dist.clamp_(min=0)
    sq_dist.fill_diagonal_(0)

    KD = torch.exp(-gamma * sq_dist).numpy().astype(np.float64)
    result = pd.DataFrame(KD, index=labels, columns=labels)
    return result
# delete all unuseful words in disease name
def dele_common(dele, disease):
    """Strip every filler word in ``dele`` from the disease name ``disease``.

    Args:
        dele: iterable of regular-expression patterns, applied in order.
            Entries are expected to carry their own leading space
            (e.g. ``' of'``), which anchors the start of the word.
        disease: the disease name to clean.

    Returns:
        ``disease`` with every whole-word occurrence of each pattern removed.
    """
    for word in dele:
        # Only remove the word when it is not followed by another word
        # character; otherwise ' or' would eat the start of ' oral' and
        # ' to' the start of ' tomato'.
        disease = re.sub(r'(?:%s)(?!\w)' % word, '', disease)
    return disease

# Filler words (each with its leading space) to strip from disease names.
# Plural forms are listed before the singular forms they contain.
delete_word = [
    ' diseases',
    ' disease',
    ' to',
    ' syndrome',
    ' disorders',
    ' disorder',
    ' and',
    ' of',
    ' or',
    ' with',
]

## disease feature based on disease-gene

In [7]:
# SNAP and DisGeNET
# Load the disease-gene table and keep it as (disease_name, Symbol) pairs.
disease_gene2 = (
    pd.read_csv('../data/disease_gene_final_final.csv')
    .loc[:, ['disease_similarity', 'geneSymbol']]
    .rename(columns={'disease_similarity': 'disease_name', 'geneSymbol': 'Symbol'})
)

In [24]:
# paper
# Disease-gene pairs keyed by ontology ID, plus the lookup tables needed to
# translate IDs into disease names and gene IDs into gene symbols.
disease_gene3 = pd.read_csv('../data/DOIO_gene.csv')
disease_DOID = pd.read_csv('../data/disease_DOID.csv')

# Attach the disease name to every (ontology ID, Entrez gene) row.
disease_gene3 = disease_DOID.merge(
    disease_gene3,
    left_on='DOIO',
    right_on='DOID',
    how='inner',
)[['disease_name', 'Entrez_Gene_ID']]

# Swap the Entrez gene ID for its symbol.
gene_3_match = pd.read_csv('../data/gene_match_on_paper.csv')
disease_gene3_1 = gene_3_match.merge(
    disease_gene3,
    left_on='GeneID',
    right_on='Entrez_Gene_ID',
    how='inner',
)
disease_gene3_1 = disease_gene3_1.loc[:, ['disease_name', 'Symbol']]

In [44]:
# all disease feature
# Stack both disease-gene sources, drop repeated pairs and save the result.
disease_gene = (
    pd.concat([disease_gene3_1, disease_gene2], axis=0)
    .drop_duplicates()
    .rename(columns={'Symbol': 'gene'})
)
disease_gene.to_csv('../data/disease_gene_final_end.csv')

# Remove the [' diseases', ' disease', ' to', ' syndrome', ' disorders', ' disorder', ' and', ' of', ' or', ' with']
# disease_gene['clean_name'] = disease_gene['disease_name'].apply(lambda x: dele_common(delete_word, str(x).lower()))
# disease_gene[['disease_name', 'clean_name']].drop_duplicates().to_csv("diseasename_match_in_gene.csv")

# Mark every known pair with 1, then spread the long table into a
# gene x disease indicator matrix; pairs that never occur become 0.
disease_gene = (
    disease_gene.assign(value=1)[['gene', 'disease_name', 'value']]
    .drop_duplicates()
)
disease_gene_matrix = disease_gene.pivot(
    index='gene',
    columns='disease_name',
    values='value',
).fillna(0)

## Gaussian similarity
# disease_simiparity_gene_direct = kernel_(disease_gene_matrix)
# print(disease_simiparity_gene_direct.shape)
# disease_simiparity_gene_direct.head()
# disease_simiparity_gene_direct.to_csv("disease_gene_sim.csv")

## disease feature based on disease-mirna

In [45]:
# HMDD v3.2 and miRNA-cancer
# Load the miRNA-disease pairs; the file's two columns are relabelled by position.
disease_mirna = pd.read_csv('../data/mirna_disease.csv')
disease_mirna.columns = ['mir', 'disease_name']

#  Remove the [' diseases', ' disease', ' to', ' syndrome', ' disorders', ' disorder', ' and', ' of', ' or', ' with']
# disease_mirna['clean_name'] = disease_mirna['disease_name'].apply(lambda x: dele_common(delete_word, str(x).lower()))

## Remove hsa-prefix (HSA removed or not?)
disease_mirna['mir'] = disease_mirna['mir'].map(lambda name: name.replace('hsa-', ''))
# disease_mirna[['disease_name', 'clean_name']].drop_duplicates().to_csv("/home/mw/project/match_part/diseasename_match_in_mirna.csv")
## De-duplication converts long tables into wide tables, with missing ones filled with zeros
disease_mirna = (
    disease_mirna.loc[:, ['mir', 'disease_name']]
    .drop_duplicates()
    .assign(value=1)
)
disease_mirna_matrix = disease_mirna.pivot(
    index='mir',
    columns='disease_name',
    values='value',
).fillna(0)


# disease_simiparity_mirna_direct = kernel_(disease_mirna_matrix)
# print(disease_simiparity_mirna_direct.shape)
# disease_simiparity_mirna_direct.to_csv("disease_mirna_sim.csv")

## disease feature based on disease-GO

In [41]:
# Load the MeSH table.  Column 'd03' is used below as a dotted tree code
# (one leading letter followed by '.'-separated segments) and 'd02' as the
# disease name -- NOTE(review): column meanings inferred from usage; confirm
# against the data file.
mesh = pd.read_csv('../data/all_mesh.CSV')
# disease_in_food = pd.read_csv('/home/mw/input/combine_disease9321/disease_food_infer_4.csv')

# 'd05' holds the first character of the tree code; it is written onto
# `mesh` itself.
mesh['d05'] = mesh['d03'].apply(lambda x: x[0])

# filter disease go
# Keep only the rows whose code starts with 'C'.  The explicit copy makes `k`
# an independent frame, so the column assignments below are not chained
# assignments on a slice of `mesh`.
k = mesh[mesh['d05'] == 'C'].copy()

# count the len of GO
# 'd06' is the code split into its segments (leading letter dropped) and
# 'd04' is the number of segments, i.e. the depth of the entry.
k['d04'] = k['d03'].apply(lambda x: len(x[1:].split('.')))
k['d06'] = k['d03'].apply(lambda x: x[1:].split('.'))
disease_all_name = list(k['d02'].drop_duplicates())
# diseasename_in_food = pd.DataFrame(list(disease_in_food.columns[1:]),columns=['disease2'])
# diseasename_in_food['disease2'] = diseasename_in_food['disease2'].apply(lambda x: str(x).lower())

# Lower-case the names so later comparisons are case-insensitive.
k['d02'] = k['d02'].apply(lambda x: str(x).lower())
# merge_name = pd.merge(k,diseasename_in_food,left_on='d02',right_on='disease2',how='inner')
merge_name = k
mesh_merge_final = merge_name[['d02','d06','d04']]

# count score about sub_tree
# One row per possible depth 1..max depth; 'score' is the running total
# 0.5 + 0.25 + ... + 0.5 ** depth.
score = pd.DataFrame({'number': list(range(1, k['d04'].max() + 1))})
score2 = [pow(0.5, depth) for depth in list(score['number'])]
score3 = [np.array(score2[:end]).sum() for end in range(1, len(score2) + 1)]
score['score'] = score3

# Give every tree entry the score that belongs to its own depth.
mesh_merge_final = mesh_merge_final.merge(
    score,
    left_on='d04',
    right_on='number',
    how='inner',
)[['d02', 'd06', 'score']]

# Zero-filled disease x disease frame that the similarity loop fills in.
disease_all_name = mesh_merge_final['d02'].drop_duplicates().tolist()
disease_sim = pd.DataFrame(
    np.zeros((len(disease_all_name),) * 2),
    index=disease_all_name,
    columns=disease_all_name,
)
def similarity(o1,o2):
    """Return the 0-based index of the deepest level two tree paths share.

    The paths are compared position by position from the root, so only a
    genuine common prefix counts; a segment that merely occurs somewhere in
    the other path, or is repeated within one path, does not extend the
    match.

    Args:
        o1: first tree path, a sequence of segment strings.
        o2: second tree path, a sequence of segment strings.

    Returns:
        The index of the last position at which both paths still agree, or
        ``-1`` when they differ at the very first segment (or one is empty).
    """
    defi_location = -1
    for level, (seg1, seg2) in enumerate(zip(o1, o2)):
        if seg1 != seg2:
            break
        defi_location = level
    return defi_location

# Round both score columns to 6 decimals.
score['score'] = score['score'].round(6)
mesh_merge_final['score'] = mesh_merge_final['score'].round(6)

# start building similarity matrix
# Score of a shared prefix, looked up by the 0-based index that
# similarity() returns (row i of `score` is depth i + 1).
depth_scores = score['score'].tolist()

# Collect each disease's (tree path, own score) entries once, instead of
# re-filtering mesh_merge_final for every pair of diseases.
paths_by_name = {
    name: list(zip(rows['d06'], rows['score']))
    for name, rows in mesh_merge_final.groupby('d02', sort=False)
}

# sim_values[r, c] is the similarity stored at row disease r, column disease c.
sim_values = np.zeros((len(disease_all_name), len(disease_all_name)))
for col_pos, col_name in enumerate(disease_all_name):
    col_paths = paths_by_name[col_name]
    for row_pos, row_name in enumerate(disease_all_name):
        row_paths = paths_by_name[row_name]
        # Best score over every pairing of the two diseases' tree paths;
        # pairs with no shared prefix contribute 0.
        best = 0
        for col_path, col_score in col_paths:
            for row_path, row_score in row_paths:
                shared = similarity(col_path, row_path)
                if shared == -1:
                    continue
                pair_score = (2 * depth_scores[shared]) / (col_score + row_score)
                if pair_score > best:
                    best = pair_score
        sim_values[row_pos, col_pos] = best

# Write the whole matrix at once; assigning through disease_sim[col][row]
# is chained assignment and is not guaranteed to update the frame.
disease_sim = pd.DataFrame(sim_values, index=disease_all_name, columns=disease_all_name)
disease_similarity_go_direct = disease_sim
# disease_similarity_go_direct.to_csv("/home/mw/project/final_data/disease_go_sim.csv")