In [8]:
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [9]:
## FUNCTIONS

# load embeddings
def load_ProteinEmbeddings(filepath):
    infile = open(filepath,'rb')
    new_dict = pickle.load(infile)
    return new_dict

# load protein links
def load_Pairs(filepath_positive, filepath_negative):
    # load protein pairs from  protein links file
    df_p_pairs = pd.read_csv(filepath_positive, sep='\s', engine='python')
    print('Number of Unique Proteins: ' + str(len(pd.unique(df_p_pairs[['protein1', 'protein2']].values.ravel('K')))))
    
    # positive pairs: all scores above 700
    df_positive_pairs = df_p_pairs[df_p_pairs['combined_score'] > 700]
    # text preprocessing
    df_positive_pairs.loc[:,'protein1'] = df_positive_pairs.loc[:,'protein1'].str.replace('.','_')
    df_positive_pairs.loc[:,'protein2'] = df_positive_pairs.loc[:,'protein2'].str.replace('.','_')
    df_positive_pairs.reset_index(inplace=True)
 
    # negative pairs
    df_negative_pairs = pd.read_csv(filepath_negative, names=['protein1', 'protein2'], sep='\s', engine='python')
    # text preprocessing
    df_negative_pairs.loc[:,'protein1'] = df_negative_pairs.loc[:,'protein1'].str.replace('.','_')
    df_negative_pairs.loc[:,'protein2'] = df_negative_pairs.loc[:,'protein2'].str.replace('.','_')
    df_negative_pairs.reset_index(inplace=True)
    
    # make same size
    #df_positive_pairs = df_positive_pairs.sample(n = len(df_negative_pairs))

    return df_positive_pairs, df_negative_pairs

def get_sample(fraction, dataframes):
    dfs = []
    for df in dataframes:
        df.sample(frac = fraction)
        dfs.append(df)
    return dfs

The below code retrieves protein embeddings, generates positive and negative pairs, and ultimately wrangles the pairs into training and test set indices. These indices are consequently used for all ML models.

In [None]:
# load protein embeddings from pickle file
p_embeddings = load_ProteinEmbeddings('/Users/teissherman/Desktop/GitHub/IndividualProject/Project/Output/Embeddings/1_R_2_no_100_yes_R+.pickle')    

# load positive and negative pairs
df_positive_pairs, df_negative_pairs = load_Pairs('/Users/teissherman/Desktop/GitHub/IndividualProject/Project/data/STRINGDB/9606.protein.links.v11.5.txt',
'/Users/teissherman/Desktop/GitHub/IndividualProject/Project/data/STRINGDB/9606.negative_interactions.txt')

In [None]:
#inspect protein embeddings
p_embeddings

Map proteins to coressponding vectors

In [11]:
# new dictionary with protein pairs keys
pair_emb = {}
# positive keys
for index, row in df_positive_pairs.iterrows():
    pair_emb[(row['protein1'],row['protein2'],row['combined_score'])] = 0 
# negative keys
for index, row in df_negative_pairs.iterrows():
    pair_emb[(row['protein1'],row['protein2'], 0)] = 0
# look up proteins keys, get array and concatenate pair
for key in pair_emb.keys():
    p1 = 'http://purl.obolibrary.org/obo/' + str(key[0])
    p2 = 'http://purl.obolibrary.org/obo/' + str(key[1])
    if p1 in p_embeddings and p2 in p_embeddings:
        p1_array = np.array(p_embeddings[p1]) 
        p2_array = np.array(p_embeddings[p2])
        pair_emb[key] = np.concatenate((p1_array, p2_array))
# remove empty values in dictionary if there is no protein embedding
remove = [k for k in pair_emb.keys() if type(pair_emb[k]) == int]
print(len(remove))
for k in remove: del pair_emb[k]

54752


Save outputs to pickle files

In [7]:
# save pair embeddings dictionary as pickle file
with open('/Users/teissherman/Desktop/GitHub/IndividualProject/Project/data/ProteinEmbeddingsLinks/ProteinEmbeddings.pickle', 'wb') as handle:
    pickle.dump(pair_emb, handle)

In [12]:
# save pair embeddings as dataframe
df_all_pairs = pd.DataFrame.from_dict(pair_emb)

# pickle dataframe
#with open('/Users/teissherman/Desktop/GitHub/IndividualProject/Project/data/ProteinEmbeddingsLinks/dataframe.pickle', 'wb') as handle:
#    pickle.dump(df_all_pairs, handle, protocol=pickle.HIGHEST_PROTOCOL)

wrangle protein embeddings for ml models

In [7]:
#transpose
p_emb_t = df_all_pairs.transpose()
p_emb_t.index.names = ['Protein1', 'Protein2', 'STRING_Score']
# reset index
p_emb_t.reset_index(level=['STRING_Score'], inplace=True)

# depending on experiment uncomment or comment below

# add new column for binary classifier scores where anyscore above 700 is a positive interaction
#p_emb_t['Binary_Score'] = p_emb_t.STRING_Score.apply(lambda x: 1 if x >= 700 else 0)
# add new column for binary classifier scores where anyscore above 0 is a positive interaction
p_emb_t['Binary_Score'] = p_emb_t.STRING_Score.apply(lambda x: 1 if x > 0 else 0)

In [8]:
# sample desired fraction of dataset as dataset is large
p_emb_t = p_emb_t.sample(frac=0.03)
# get index to standardize training and test data
data_index = p_emb_t.index
# inspect class imbalance
print(len(p_emb_t[p_emb_t['Binary_Score'] == 0]))
print(len(p_emb_t[p_emb_t['Binary_Score'] == 1]))

Generate training and test sets/indices for later re-use

In [12]:
train_data, test_data = np.split(p_emb_t.sample(frac=1, random_state=42), 
                                    [int(.7*len(p_emb_t))])

print('Number of training interactions:', len(train_data))
print('Number of testing interactions:', len(test_data))

# set indices
training_index = train_data.index
testing_index = test_data.index

Number of training interactions: 11205
Number of testing interactions: 4803


In [14]:
# inspect
p_emb_t.loc[training_index]

Unnamed: 0_level_0,Unnamed: 1_level_0,STRING_Score,0,1,2,3,4,5,6,7,8,...,191,192,193,194,195,196,197,198,199,Binary_Score
Protein1,Protein2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
9606_ENSP00000362900,9606_ENSP00000307863,956,-0.210142,-0.040956,-0.358236,-0.129174,-0.091641,0.227200,-0.116446,0.219358,0.017651,...,0.655033,-0.048629,-0.422594,0.066375,-0.098913,0.033856,-0.036974,0.080282,-0.167347,1
9606_ENSP00000362690,9606_ENSP00000371790,819,-0.078088,-0.022167,-0.530770,-0.131095,-0.217903,0.218156,-0.097075,0.156174,-0.008234,...,0.775171,0.101787,-0.609932,0.004475,0.069982,-0.039079,-0.098072,-0.005511,-0.227527,1
9606_ENSP00000263038,9606_ENSP00000368022,0,-0.092356,-0.021525,-0.474106,0.006022,-0.049059,0.231246,-0.088004,0.062018,0.111206,...,0.821429,0.073385,-0.579195,0.031790,0.024510,0.025921,-0.117517,-0.008589,-0.241911,0
9606_ENSP00000382863,9606_ENSP00000254950,0,-0.188665,0.007657,-0.471886,-0.222833,-0.233361,0.160075,-0.048778,0.301818,-0.037165,...,0.974961,0.207193,-0.549380,0.132849,0.114350,0.151294,-0.242248,0.008656,-0.414174,0
9606_ENSP00000306999,9606_ENSP00000322180,880,-0.127989,-0.077218,-0.411876,-0.101879,-0.125565,0.215434,-0.044607,0.166199,0.010552,...,0.622621,-0.070800,-0.372930,0.096169,-0.060287,-0.023093,0.012031,0.054574,-0.126477,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9606_ENSP00000471191,9606_ENSP00000343023,934,-0.006994,0.048754,-0.745527,-0.038535,-0.136707,0.192337,-0.076408,0.117858,0.106819,...,0.870575,0.077310,-0.517224,0.105895,0.048449,0.062462,-0.158299,0.058169,-0.339111,1
9606_ENSP00000274289,9606_ENSP00000288207,822,-0.084267,0.062476,-0.601263,-0.121226,-0.142421,0.155562,-0.103029,0.175577,0.080021,...,0.545399,-0.016626,-0.417165,0.050136,-0.084660,-0.057749,-0.023635,0.076294,-0.138367,1
9606_ENSP00000336524,9606_ENSP00000371297,0,-0.142721,-0.002763,-0.377428,-0.090899,-0.052154,0.187728,-0.134534,0.152076,0.072501,...,0.541120,0.020570,-0.379513,0.037188,-0.055246,-0.029827,0.025901,0.050228,-0.129185,0
9606_ENSP00000244227,9606_ENSP00000310448,925,-0.178305,-0.049901,-0.268163,-0.120309,-0.043244,0.164797,-0.092089,0.173299,0.042128,...,0.525322,-0.048043,-0.382552,0.098036,-0.112177,-0.020228,0.019885,0.047952,-0.166155,1


In [17]:
# pickle data 
# save pair embeddings dictionary as pickle files
folders = ['train_owl2vec', 'test_owl2vec']
dataset_name = ['train_data_all', 'test_data_all']
dataset_index = ['train_index_all', 'test_index_all']
dataset = [train_data, test_data]
dataset_2 = [training_index, testing_index]

for (folder,dataname,dataset) in zip(folders,dataset_name,dataset):
    with open('/Users/teissherman/Desktop/GitHub/IndividualProject/Project/data/MLData/' + folder + '/' + dataname + '.pickle', 'wb') as handle:
        pickle.dump(dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)

for (folder,dataname,dataset) in zip(folders,dataset_index,dataset_2):
    with open('/Users/teissherman/Desktop/GitHub/IndividualProject/Project/data/MLData/' + folder + '/' + dataname + '.pickle', 'wb') as handle:
        pickle.dump(dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [18]:
# inspect training and testing index works
with open('/Users/teissherman/Desktop/GitHub/IndividualProject/Project/data/MLData/train_owl2vec/train_index.pickle', 'rb') as handle:
        training_index = pickle.load(handle)

In [19]:
training_index

MultiIndex([('9606_ENSP00000215071', '9606_ENSP00000322419'),
            ('9606_ENSP00000339145', '9606_ENSP00000301459'),
            ('9606_ENSP00000329715', '9606_ENSP00000285814'),
            ('9606_ENSP00000362716', '9606_ENSP00000458770'),
            ('9606_ENSP00000391249', '9606_ENSP00000295400'),
            ('9606_ENSP00000480132', '9606_ENSP00000359077'),
            ('9606_ENSP00000261636', '9606_ENSP00000307939'),
            ('9606_ENSP00000348538', '9606_ENSP00000357218'),
            ('9606_ENSP00000302665', '9606_ENSP00000358165'),
            ('9606_ENSP00000409581', '9606_ENSP00000321326'),
            ...
            ('9606_ENSP00000416583', '9606_ENSP00000366927'),
            ('9606_ENSP00000341940', '9606_ENSP00000467141'),
            ('9606_ENSP00000351155', '9606_ENSP00000480893'),
            ('9606_ENSP00000221498', '9606_ENSP00000323714'),
            ('9606_ENSP00000342222', '9606_ENSP00000344782'),
            ('9606_ENSP00000356951', '9606_ENSP0000038