Load all of the SQL tables that were exported as CSV files and combine them into a single, useful DataFrame (exporting the result to a new CSV file if necessary).

In [2]:
import pandas as pd
import numpy as np
import pypdb as pdb
import os

In [6]:
def get_path(filename):
    """Return the path to ``filename`` inside the SQL export directory.

    The path is built relative to the parent of the current working
    directory: ``<parent of cwd>/pdb-search/data/sql_export/<filename>``.
    Nothing is checked on disk; the result is only a joined path string.
    """
    parent_dir = os.path.dirname(os.getcwd())
    path_parts = (parent_dir, 'pdb-search', 'data', 'sql_export', filename)
    return os.path.join(*path_parts)


In [9]:
def _read_table(csv_name):
    """Read one semicolon-separated export file from the SQL export directory."""
    return pd.read_csv(get_path(csv_name), sep=';')


# One DataFrame per exported SQL table.
types = _read_table('type.csv')
classes = _read_table('class.csv')
superfamilies = _read_table('superfamily.csv')
families = _read_table('family.csv')
species = _read_table('species.csv')
membranes = _read_table('membrane.csv')
proteins = _read_table('protein.csv')

classifications = _read_table('classification.csv')

# TODO: build the combined frame from these tables (left unfinished here).

In [51]:
# Preview the first rows of the class table.
classes.head()
#superfamilies.sort_values('number').head()

Unnamed: 0,id,number,name,description
0,1,1.1.,Alpha-helical polytopic,
1,2,1.3.,Beta-barrel transmembrane,
2,6,2.2.,All beta monotopic/peripheral,
3,5,2.1.,All alpha monotopic/peripheral,
4,7,2.3.,Alpha/Beta monotopic/peripheral,


In [57]:
# Each value in families['number'] is a dotted hierarchy code that ends with a
# period, of the form type.class.superfamily.family. (ex: '1.1.01.01.').
# The class table stores its own 'number' as the leading "type.class." part,
# trailing period included (ex: '1.1.'), so the class of a family is the first
# two dot-terminated fields of the family number.
#
# A regular expression is used instead of a fixed-width slice so the result
# stays correct if a type or class number ever has more than one digit.
# Values that do not start with two dot-terminated numeric fields become NaN.
#
# (If separate type/class/superfamily/family columns are ever needed, strip the
# trailing period first, then split on '.'; otherwise the split produces an
# extra empty field.)
families['class_number'] = families['number'].str.extract(r'^(\d+\.\d+\.)',
                                                          expand=False)

In [43]:
# Show the first rows of the family table, ordered by the 'number' column.
families.sort_values('number').head()

Unnamed: 0,id,number,name,tcdb,pfam
12,14,1.1.01.01.,Microbial and algal rhodopsins,3.E.1,PF01036
13,15,1.1.01.02.,"G-protein coupled receptors, family A",9.A.14,PF00001
241,265,1.1.01.03.,GPCR Secretin (B) family,9.A.14,PF00002
665,734,1.1.01.04.,Frizzled/Smoothened family,9.A.14,PF01534
689,758,1.1.01.05.,GPCR Metabotropic glutamate receptor (C) family,9.A.14,PF00003


In [59]:
# Build lookup tables used to annotate each protein:
#   family id    --> family name
#   family id    --> class number (the leading "type.class." part of the family number)
#   class number --> class name
family_names = families.set_index('id')['name'].to_dict()
family_class = families.set_index('id')['class_number'].to_dict()
class_map = classes.set_index('number')['name'].to_dict()

# Series.map is a plain key lookup: a family_id (or class number) that is
# missing from the lookup table becomes NaN. Series.replace would instead
# leave the raw id in place, silently mixing integer ids into the name columns.
proteins['family_name'] = proteins['family_id'].map(family_names)
proteins['class_name'] = proteins['family_id'].map(family_class).map(class_map)

proteins.head()

Unnamed: 0,id,family_id,species_id,membrane_id,pdbid,name,resolution,topology,thickness,thicknesserror,...,tilterror,gibbs,tau,numsubunits,numstrands,verification,comments,date_added,family_name,class_name
0,1,35,9,3,1qjp,"Outer membrane protein A (OMPA), disordered loops",1.65,A in,25.4,1.5,...,1,-29.5,N.D.,1,8,Four interfacial Trp residues of OmpA are loca...,OmpA is required for the action of colicins K ...,2005-09-04,OmpA family,Beta-barrel transmembrane
1,2,390,9,3,1qj8,Outer membrane protein X (OMPX),1.9,A in,23.6,2.8,...,5,-30.7,N.D.,1,8,Locations of the hydrophobic boundaries are co...,OmpX from Escherichia coli promotes adhesio...,2005-09-04,Enterobacterial Ail/Lom protein,Beta-barrel transmembrane
2,3,363,24,3,1p4t,Outer membrane protein NspA,2.55,A in,24.9,2.4,...,3,-42.9,28,1,8,,Pathogenic Neisseria spp. possess a repertoire...,2005-09-04,Opacity porins,Beta-barrel transmembrane
3,1006,286,52,15,3lbw,"M2 proton channel of Influenza A, closed state",1.65,A out,31.1,1.7,...,10,-50.5,,4,4,,,0000-00-00,Influenza virus matrix protein 2,Bitopic proteins
4,1007,37,310,3,2x55,Plasminogen activator PLA (coagulase/fibrinoly...,1.85,A in,25.6,1.9,...,5,-43.9,,1,10,,,0000-00-00,"OM protease omptin, OMPT",Beta-barrel transmembrane


In [50]:
# Preview the first rows of the protein table.
# NOTE(review): the saved output under this cell has a family_name column but
# no class_name column, and its execution count (In [50]) is lower than that of
# the cell that adds class_name (In [59]) — it looks stale; re-run to refresh.
proteins.head()

Unnamed: 0,id,family_id,species_id,membrane_id,pdbid,name,resolution,topology,thickness,thicknesserror,tilt,tilterror,gibbs,tau,numsubunits,numstrands,verification,comments,date_added,family_name
0,1,35,9,3,1qjp,"Outer membrane protein A (OMPA), disordered loops",1.65,A in,25.4,1.5,11,1,-29.5,N.D.,1,8,Four interfacial Trp residues of OmpA are loca...,OmpA is required for the action of colicins K ...,2005-09-04,OmpA family
1,2,390,9,3,1qj8,Outer membrane protein X (OMPX),1.9,A in,23.6,2.8,12,5,-30.7,N.D.,1,8,Locations of the hydrophobic boundaries are co...,OmpX from Escherichia coli promotes adhesio...,2005-09-04,Enterobacterial Ail/Lom protein
2,3,363,24,3,1p4t,Outer membrane protein NspA,2.55,A in,24.9,2.4,22,3,-42.9,28,1,8,,Pathogenic Neisseria spp. possess a repertoire...,2005-09-04,Opacity porins
3,1006,286,52,15,3lbw,"M2 proton channel of Influenza A, closed state",1.65,A out,31.1,1.7,7,10,-50.5,,4,4,,,0000-00-00,Influenza virus matrix protein 2
4,1007,37,310,3,2x55,Plasminogen activator PLA (coagulase/fibrinoly...,1.85,A in,25.6,1.9,4,5,-43.9,,1,10,,,0000-00-00,"OM protease omptin, OMPT"


In [45]:
# Preview the classification table (id, type_id, class_id, superfamily_id,
# family_id) instead of rendering the whole frame; ten rows is enough to see
# how the id columns link the hierarchy levels together.
classifications.head(10)

Unnamed: 0,id,type_id,class_id,superfamily_id,family_id
0,1,1,1,2,1
1,2,1,1,2,2
2,3,1,1,2,3
3,4,1,1,2,4
4,5,1,1,2,5
5,6,1,1,345,6
6,7,1,1,3,7
7,8,1,1,3,8
8,9,1,1,3,9
9,10,1,1,3,10
