In [1]:
import pandas as pd
import pickle
from tqdm import tqdm

In [2]:
# Load the space-delimited embedding file; it carries no header row.
df = pd.read_csv('../data/combined_more_new.emb', delimiter=' ', header=None)

# Every column except the first one is treated as an embedding dimension.
hidden_dim = df.shape[1] - 1

# Name the first column 'protein' and the rest dim1 .. dim<hidden_dim>.
columns = ['protein'] + ['dim' + str(n) for n in range(1, hidden_dim + 1)]
df.columns = columns

# Distinct protein ids present in the embedding table.
proteins = set(df['protein'])
len(proteins)

12306

In [3]:
# Convert the string into a proper set, since each locations entry is stored as text of the form {'location_name'}
def get_set(x):
    """Parse the textual form of a set of location names into a real set.

    The input is expected to look like ``{'Vesicles', 'Cytosol'}``: one
    enclosing character on each side, comma-separated names, each optionally
    wrapped in single or double quotes.

    Parameters
    ----------
    x : str
        String representation of a set of location names.

    Returns
    -------
    set of str
        The location names with quotes and surrounding whitespace removed.
        Empty entries are dropped, so ``'{}'`` and ``'set()'`` both give an
        empty set.
    """
    # Python renders an empty set as 'set()', which has no braces to strip.
    if x == 'set()':
        return set()

    # Drop the enclosing braces.
    inner = x[1:-1]
    # Remove both quote styles: a name is rendered with double quotes when it
    # is written that way in the source text, and must not keep them.
    inner = inner.replace("'", "").replace('"', "")
    # Split on commas and discard blank tokens (e.g. from an empty payload).
    return {name.strip() for name in inner.split(',') if name.strip()}

In [4]:
# Locations that are never counted for any protein.
omit_locs = ['Rods & Rings', 'Aggresome', 'Microtubule ends', 'Cleavage furrow']

def get_loc_values(df, info_retain=1.0):
    """Map every protein to the locations that make up most of its observations.

    Each row of ``df`` contributes its locations (parsed with ``get_set``,
    minus anything listed in ``omit_locs``) once to ``protein1`` and once to
    ``protein2``. For every protein the locations are then ranked by their
    share of that protein's total count and kept, most frequent first, until
    the running share exceeds ``info_retain``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'protein1', 'protein2' and 'locations'.
    info_retain : float, default 1.0
        Cumulative share threshold. The threshold is tested *before* a
        location is added, so the location that pushes the running share past
        the threshold is still kept.

    Returns
    -------
    dict
        ``{protein_id: set_of_location_names}``. A protein whose locations
        were all omitted maps to an empty set.
    """
    # Pass 1: build {protein_id: {location: count}, ...}.
    prot2loc_c = dict()
    # Walk the three columns positionally instead of doing a label lookup per
    # cell; zip has no len(), so the progress bar is given the total.
    rows = zip(df['protein1'], df['protein2'], df['locations'])
    for p1, p2, raw_locs in tqdm(rows, total=len(df)):
        # Register both proteins even if every location of the row is omitted.
        counts_p1 = prot2loc_c.setdefault(p1, dict())
        counts_p2 = prot2loc_c.setdefault(p2, dict())

        for loc in get_set(raw_locs):
            if loc in omit_locs:
                continue
            counts_p1[loc] = counts_p1.get(loc, 0) + 1
            counts_p2[loc] = counts_p2.get(loc, 0) + 1

    # Pass 2: replace each count dict by the set of retained locations.
    # Only values of existing keys are reassigned, so iterating the keys
    # while updating is safe.
    for prot in tqdm(prot2loc_c.keys()):
        loc_counts = prot2loc_c[prot]
        total_c = sum(loc_counts.values())
        shares = [(loc, c / total_c) for loc, c in loc_counts.items()]
        # Stable sort: locations with equal share keep their insertion order.
        shares.sort(key=lambda item: item[1], reverse=True)

        cumsum = 0
        kept = set()
        for loc, share in shares:
            if cumsum > info_retain:
                break
            cumsum += share
            kept.add(loc)

        prot2loc_c[prot] = kept

    return prot2loc_c

In [5]:
# Helper table; get_loc_values reads its protein1 / protein2 / locations columns.
data = pd.read_csv('../data/combined_more_new_helper.csv')

# Disabled filter: keep only rows where both reliability columns are non-null.
# data = data[data.reliability1.notnull()]
# data = data[data.reliability2.notnull()]

# Per protein, keep the most frequent locations up to a 0.7 cumulative share.
prot2loc = get_loc_values(data, info_retain=0.7)

# Disabled earlier approach: give each protein the union of all locations
# seen in its rows, and collect every location name in `locs`.
# prot2loc = dict()
# locs = set()
# drop_l = list()
# for i in data.index:

#     l = get_set(data['locations'][i])
#     for x in l:
#         locs.add(x)

#     p1 = data['protein1'][i]
#     try:
#         prot2loc[p1] = prot2loc[p1].union(l)
#     except:
#         prot2loc[p1] = l

#     p2 = data['protein2'][i]
#     try:
#         prot2loc[p2] = prot2loc[p2].union(l)
#     except:
#         prot2loc[p2] = l

# Sanity check: show the first protein together with its retained locations.
p, p_locs = next(iter(prot2loc.items()))
print(p, p_locs)
# list(prot2loc.values())[0]

100%|██████████| 150000/150000 [00:09<00:00, 15211.20it/s]
100%|██████████| 12306/12306 [00:00<00:00, 174067.21it/s]ENSP00000326119 {'Vesicles'}



In [6]:
# locs = list(locs)
# loc2id = {k: v for v, k in enumerate(locs)}

# with open('../data/loc2id_string_new.pkl','wb') as f:
#         pickle.dump(loc2id, f)

# for key,value in loc2id.items():
#         print(key, value)

In [7]:
# Preview the first five rows of the embedding table.
df.head(n=5)

Unnamed: 0,protein,dim1,dim2,dim3,dim4,dim5,dim6,dim7,dim8,dim9,...,dim119,dim120,dim121,dim122,dim123,dim124,dim125,dim126,dim127,dim128
0,ENSP00000388107,0.630998,0.353874,-0.623053,1.560223,0.01736,-1.903826,-2.548556,0.126147,-1.032244,...,0.006003,2.094278,-6.036074,1.40241,-0.918161,0.452964,-4.30454,-0.994951,-2.446774,-1.721137
1,ENSP00000407586,4.630718,2.08115,0.336501,-2.344453,1.361587,-1.731791,0.110057,-1.188406,0.119458,...,-3.062323,-1.555196,1.50739,-0.327374,1.434506,4.572812,1.239097,1.209302,-0.997174,-2.745158
2,ENSP00000306330,3.333804,0.591292,-0.438372,0.039246,-0.634735,-2.153432,-2.813201,-0.211307,-1.182109,...,-0.167855,-1.812999,0.48334,0.245042,-1.349984,-0.268223,-0.834833,1.103712,-1.817778,-1.262102
3,ENSP00000451828,3.362763,-1.196866,1.493734,-1.760889,-1.583714,0.378306,1.572643,1.202605,-0.85891,...,-1.681248,0.872071,2.700834,-0.570942,0.221468,2.828533,-1.633486,1.301244,-4.481641,-2.152392
4,ENSP00000361021,-0.228882,0.782611,-1.00598,-0.750401,2.079447,1.332219,-1.387472,-1.130411,-1.077732,...,-1.545805,-2.552706,0.691746,2.384368,1.061209,-0.449063,-2.12228,0.095804,-0.430881,-0.638577


In [8]:
# Look up the location set of every embedding row's protein. dict.get gives
# None for a protein that is absent from prot2loc, without hiding any other
# error (a bare except here would turn e.g. an undefined prot2loc into a
# column full of None that dropna then silently removes).
df_locs = [prot2loc.get(protein) for protein in df['protein']]

# Number of proteins that have an embedding but no location entry.
count = sum(loc is None for loc in df_locs)

df['locations'] = df_locs

# Drops every row with a missing value in any column, which includes the
# proteins that received None above.
df.dropna(inplace=True)

print(df.shape, count)

(12306, 130) 0


In [9]:
# Column order for the exported frames: labels first, then the embedding
# dimensions; the protein id column is left out.
cols = df.columns.tolist()
for dropped in ('locations', 'protein'):
    cols.remove(dropped)
cols.insert(0, 'locations')

# Shuffle all rows reproducibly (fixed seed) before splitting.
df = df[cols].sample(frac=1, random_state=42)

# The first `alpha` fraction of the shuffled rows is the training split.
n_prots = len(df)
alpha = 0.85
n_train = int(alpha * n_prots)
df_train = df.iloc[:n_train]
df_test = df.iloc[n_train:]

print(df_train.shape, df_test.shape)

(10460, 129) (1846, 129)


In [10]:
# Write both splits to disk without the index column.
outputs = {
    '../data/train_combined_more_new_0.7.csv': df_train,
    '../data/test_combined_more_new_0.7.csv': df_test,
}
for path, frame in outputs.items():
    frame.to_csv(path, index=None)