In [4]:
import pandas as pd
import numpy as np
import pypdb as pdb
import os

In [14]:

def get_file_path(filename):
    """Return the path of a data file under ``<parent of cwd>/pdb-search/data``.

    The path is anchored at the parent of the current working directory, so
    the result depends on the directory the kernel is running in.

    Parameters
    ----------
    filename : str
        Name of the file inside the data directory.

    Returns
    -------
    str
        The joined path, using the path separator of the running platform.
    """
    # Build with os.path.join rather than literal "\\" separators so the
    # resulting path is also valid outside Windows.
    return os.path.join(os.path.dirname(os.getcwd()), "pdb-search", "data", filename)


# load the protein-type spreadsheets from the pdb-search data directory
def load_data():
    """Load the three protein-type spreadsheets into one master DataFrame.

    Reads "Sheet1" of the transmembrane, monotopic and peptides workbooks
    (data extracted from http://opm.phar.umich.edu/types.php), locating each
    file through get_file_path, and stacks them row-wise in that order.

    Returns
    -------
    pandas.DataFrame
        All rows of the three sheets under a fresh 0..n-1 integer index.
        The originating workbook is not recorded in the result.
    """
    workbook_names = (
        "1_Transmembrane_type.xlsx",
        "2_Monotopic_type.xlsx",
        "3_Peptides_type.xlsx",
    )
    frames = [pd.read_excel(get_file_path(name), "Sheet1") for name in workbook_names]

    # ignore_index=True renumbers the rows and discards any `keys=` labels,
    # so no keys are passed: they would have no effect on the result.
    return pd.concat(frames, ignore_index=True)


# load the OPM table from the excel file in the pdb-search data directory
def load_opm_data():
    """Read "Sheet1" of OPM_data_from_MySQL.xlsx into a DataFrame.

    The workbook is located through get_file_path.  Per the original note,
    its contents were extracted from the OPM MySQL dump file
    (http://opm.phar.umich.edu/types.php).
    """
    opm_workbook = get_file_path("OPM_data_from_MySQL.xlsx")
    opm_table = pd.read_excel(opm_workbook, "Sheet1")
    return opm_table

# shared implementation behind find_matches / find_opm_matches
def _filter_by_pdb_search(query, df, id_column):
    """Return the rows of `df` whose `id_column` is among the PDB hits for `query`.

    The query is built with pdb.make_query and executed with pdb.do_search.
    Both the returned IDs and the values of `id_column` are compared as
    stripped, lower-cased strings, so the match is case-insensitive.
    """
    # `or []` keeps a search that yields nothing (None / empty) from raising.
    hits = pdb.do_search(pdb.make_query(query)) or []
    wanted_ids = {str(hit).strip().lower() for hit in hits}

    # Normalise the frame's IDs the same way as the search results.
    frame_ids = df[id_column].astype(str).str.strip().str.lower()
    return df[frame_ids.isin(wanted_ids)]


# search the PDB database for matches to a given query
# and return the subset of a given dataframe
# whose 'PDB_ID' values are among the search results
def find_matches(query, df):
    """Filter `df` to the rows whose 'PDB_ID' is returned by a PDB search.

    Parameters
    ----------
    query : str
        Search text passed to pdb.make_query.
    df : pandas.DataFrame
        Table with a 'PDB_ID' column.

    Returns
    -------
    pandas.DataFrame
        The matching rows of `df` (empty when the search returns nothing).

    Raises
    ------
    KeyError
        If `df` has no 'PDB_ID' column.
    """
    return _filter_by_pdb_search(query, df, 'PDB_ID')


# search the PDB database for matches to a given query
# and return the subset of a given dataframe
# whose 'pdbid' values are among the search results
def find_opm_matches(query, df):
    """Filter `df` to the rows whose 'pdbid' is returned by a PDB search.

    Same as find_matches, but for tables that store the identifier in a
    'pdbid' column.

    Parameters
    ----------
    query : str
        Search text passed to pdb.make_query.
    df : pandas.DataFrame
        Table with a 'pdbid' column.

    Returns
    -------
    pandas.DataFrame
        The matching rows of `df` (empty when the search returns nothing).

    Raises
    ------
    KeyError
        If `df` has no 'pdbid' column.
    """
    return _filter_by_pdb_search(query, df, 'pdbid')

In [16]:
# Load the combined table from the three type spreadsheets, keep the rows
# whose 'PDB_ID' is returned by the PDB search for this query, and display
# the (rows, columns) shape of the result.
df = load_data()
matches = find_matches('transmembrane homosapiens',df)
matches.shape

(62, 10)

In [17]:
# Run the same query against the table returned by load_opm_data(), matching
# on its 'pdbid' column, and display the (rows, columns) shape of the result.
opm_df = load_opm_data()

#opm_df.head()
opm_matches = find_opm_matches('transmembrane homosapiens',opm_df)
opm_matches.shape

(58, 19)

In [21]:
# Rows of `matches` whose 'PDB_ID' does not occur in opm_matches['pdbid'],
# i.e. hits found via the spreadsheet table but not via the OPM table.
matches[~matches['PDB_ID'].isin(opm_matches['pdbid'])]

Unnamed: 0,Family,Protein_Name,PDB_ID,Species,Localization,Num_Subunits,Num_TM_Sec_Structs,Hydrophobic_Thickness or_Depth,Tilt _Angle,Gtransfer
404,1.1.10.08.,Cystic fibrosis transmembrane conductance regu...,5uar,Danio rerio,Endosome,1,12,30.6,0.0,-81.7
426,1.1.107.01.,Scavenger receptor B-1,5ktf,Mus musculus,Eukaryo. plasma,1,1,30.0,41.0,-25.1
695,1.1.16.01.,"Glutamate receptor 2, with TARP channel, struc...",5vou,Rattus norvegicus,Eukaryo. plasma,8,20,32.2,0.0,-232.0
1148,1.2.01.01.,"High affinity nerve growth factor receptor, NTRK1",2n90,Homo sapiens,Eukaryo. plasma,2,2,28.8,11.0,-29.8


In [5]:
# Display the `matches` frame.
# NOTE(review): this cell's execution count (In [5]) is lower than that of the
# cells above it, which suggests it was run out of order — confirm the
# notebook still works under Restart Kernel -> Run All.
matches

Unnamed: 0,Family,Protein_Name,PDB_ID,Species,Localization,Num_Subunits,Num_TM_Sec_Structs,Hydrophobic_Thickness or_Depth,Tilt _Angle,Gtransfer
3,1.1.01.01.,Chimaera of channelrhodopsins 1 and 2,3ug9,Chlamydomonas reinhardtii,Eukaryo. plasma,2,14,31.0,0.0,-100.3
4,1.1.01.01.,"Halorhodopsin, different conformation",3vvk,Natronomonas pharaonis,Archaebac.,3,21,33.6,0.0,-140.5
8,1.1.01.01.,"Bacteriorhodopsin, trimer, complex with annula...",2zzl,Halobacterium salinarum,Archaebac.,3,21,29.8,0.0,-119.0
10,1.1.01.01.,"Sensory rhodopsin, dimer",4tl3,Nostoc sp.,Bact. Gram-neg inner,2,17,30.6,0.0,-116.0
21,1.1.01.01.,Proteorhodopsin,4hyj,Exiguobacterium sibiricum,Bact. Gram-pos plas.,1,7,30.0,15.0,-56.8
25,1.1.01.01.,Cruxrhodopsin-3,4jr8,Haloarcula vallismortis,Archaebac.,3,21,31.8,0.0,-130.4
26,1.1.01.01.,Bacteriorhodopsin,1m0l,Halobacterium salinarum,Archaebac.,3,21,31.8,0.0,-121.1
27,1.1.01.01.,Bacteriorhodopsin-I,4pxk,Haloarcula marismortui,Archaebac.,1,7,31.8,11.0,-72.7
30,1.1.01.01.,"Sensory rhodopsin II, tetramer, structure 1",1h2s,Natronomonas pharaonis,Archaebac.,4,18,30.5,0.0,-123.4
36,1.1.01.01.,Deltarhodopsin,4fbz,Haloterrigena thermotolerans,Archaebac.,3,21,31.6,0.0,-110.6
