I want to analyze avidins that are not present in the training set, to get a feel for how the model behaves.

Hoefavidin, afifavidin and wilavidin (both forms of each: the short forms are dimers, the long forms are octamers/hexamers, respectively).

I will start by getting the sequences and embedding them, and then run inference on them with the model.

The best outcome is that the model differentiates between all the groups.
The second best is that it assigns the dimers (and their oligomers) as dimers.
The third is that it assigns them all as tetramers, but with less confidence.


In [None]:
# Function definitions 
def code_mod(code_string):
    """Return the part of *code_string* that precedes the first underscore.

    If the string contains no underscore, the whole string is returned.
    The result is always passed through ``str``.
    """
    prefix = code_string.split("_", 1)[0]
    return str(prefix)

In [None]:
# install Biopython in case it doesn't exist already
!pip install Bio

# imports 
import os 
import os.path
import pandas as pd 
import Bio
from Bio import SeqIO
import pickle 
from google.colab import drive
drive.mount("/content/drive")

# Project locations on the mounted drive.
# Each path is derived from the previous one so the base directory is
# spelled out only once; the resulting strings are unchanged.
Dir = "drive/MyDrive/OrlyPred/"
Data = f"{Dir}Data"
qsbio_file = f"{Data}/QSbio_PiQSi_annotations_V6_2020.csv"

In [None]:
# Load the QSbio annotation table and filter it down to confidently
# annotated homomers. Every intermediate frame keeps its own name so that
# each filtering stage can still be inspected afterwards.

# read qsbio data into a df (the first 21 lines of the file are skipped)
# NOTE: `error_bad_lines=False` was deprecated in pandas 1.3 and removed in
# pandas 2.0; `on_bad_lines="skip"` is its replacement (requires pandas >= 1.3)
# and likewise drops malformed rows instead of raising.
qsbio_df = pd.read_csv(qsbio_file, on_bad_lines="skip", low_memory=False, skiprows=21)
qsbio_df = qsbio_df.drop(["Unnamed: 18", "Unnamed: 19", "Unnamed: 20", "Unnamed: 21"], axis=1)

# qsbio_df.head()

# keep only pdbID, subunits columns and homomer indication
# (.copy() makes this an independent frame, so the edits below never act on
# a view of qsbio_df and never trigger SettingWithCopyWarning)
qsbio_filt = qsbio_df[['code', 'nsub', 'corrected_nsub', 'QSBIO_err_prob', 'best_BU', 'homo']].copy()

# remove rows with NaN values of nsub
# (removed 12210 rows when this was originally run)
qsbio_filt = qsbio_filt.dropna(subset=["nsub"])

# fill empty corrected_nsub with the val from nsub
# (assigned back explicitly: an in-place fillna on a selected column is
# chained assignment and does not update the frame under Copy-on-Write)
qsbio_filt['corrected_nsub'] = qsbio_filt['corrected_nsub'].fillna(qsbio_filt['nsub'])

# create a new column with just pdb id (text before the first "_" of `code`)
qsbio_filt['pdb_code'] = qsbio_filt['code'].apply(code_mod)

# summarize how many instances are unique for each pdb code
# print(qsbio_filt.groupby('pdb_code').nunique())

# for each pdb code, keep one row per distinct nsub value (the last occurrence)
qsbio_uniq = qsbio_filt.drop_duplicates(subset=['pdb_code', 'nsub'], keep="last").copy()

# store relevant cols as ints; values that cannot be parsed become NaN
for col in ["homo", "corrected_nsub", "nsub", "QSBIO_err_prob", "best_BU"]:
    qsbio_uniq[col] = pd.to_numeric(qsbio_uniq[col], errors='coerce', downcast="integer")


# retain only homomers, i.e. rows where homo is 1 (from the documentation: '1' indicates homomers, otherwise heteromers)
qsbio_homomer = qsbio_uniq[qsbio_uniq["homo"] == 1]

# the next rows are meant for data filtration
# filter by QSbio error estimator as recommended by Emmanuel Levy
qsbio_df_conf = qsbio_homomer[qsbio_homomer["QSBIO_err_prob"] < 15]
# An additional confidence value - the best Biological Unit of the structure
qsbio_df_conf_plus = qsbio_df_conf[qsbio_df_conf["best_BU"] == 1]
# take only rows where the nsub and corrected_nsub are the same (removes less than 2000 structures)
# and then drop the now-redundant corrected_nsub column; drop() returns a new
# frame, so qsbio_df_conf_plus is left untouched
qsbio_homomer_final = qsbio_df_conf_plus[
    qsbio_df_conf_plus['corrected_nsub'] == qsbio_df_conf_plus['nsub']
].drop(["corrected_nsub"], axis=1)


print(qsbio_homomer_final.head(20))