Load all of the SQL data from csv files and combine into one useful dataframe (and export into new csv if necessary)

In [1]:
import pandas as pd
import numpy as np
import pypdb as pdb
import os

def get_path(filename):
    """
    Build the path to one of the exported SQL data files.

    The path is formed from the parent of the current working directory
    followed by "pdb-search/data/sql_export/<filename>".  The notebook is
    therefore expected to be run from a directory that sits next to
    "pdb-search" (for example "~/example_notebooks/").  The file itself is
    not checked for existence.

    os.path.join is used so the separators are right on every platform.

    Parameters:
    ----------
    filename : string
        Name of the data file, e.g. 'protein.csv'

    Returns:
    -------
    path : string
        The full path to the file
    """
    parent_dir = os.path.dirname(os.getcwd())
    data_dirs = ('pdb-search', 'data', 'sql_export')
    return os.path.join(parent_dir, *data_dirs, filename)

def _read_table(filename):
    """Read one ';'-separated csv export from the data directory."""
    return pd.read_csv(get_path(filename), sep=';')


def _id_lookup(table, key_column, value_column):
    """
    Build a plain dict translating one column of a table into another.
    If a key occurs in several rows, the last row wins.
    """
    return dict(zip(table[key_column], table[value_column]))


def _chain_lookup(first, second):
    """
    Compose two lookups into one: key --> first[key] --> second[first[key]].
    Keys whose intermediate value has no entry in `second` map to NaN.
    """
    missing = float('nan')
    return {key: second.get(middle, missing) for key, middle in first.items()}


def make_dataframe():
    """
    Read all of the OPM database tables from csv files and combine them
    into a single pandas dataframe.

    'protein.csv' contains the main table of protein information
    and the rest of the tables contain specific information about
    the various categories of proteins.  These are converted to dicts,
    and used to add the appropriate columns to the proteins dataframe.

    The columns added are: species, membrane, membrane_abbr, family,
    family_pfam, family_tcdbs, superfamily, superfamily_tcdb,
    superfamily_pfam, class and type.

    Any id that has no entry in the corresponding lookup table yields
    NaN in the new column.  (A dict-based Series.replace would instead
    leave the raw id in place, and in the two-step lookups that leftover
    family id would then be misread as a superfamily/class/type id.)

    Returns:
    -------
    df : pandas.DataFrame
        The data from the OPM database, including protein types,
        classes, superfamilies, families, species, and localization
    """

    # First we load all of the csv files into memory as pandas dataframes
    # proteins is the main table we are interested in
    proteins = _read_table('protein.csv')

    # types, classes, families, and membranes will all become
    # dicts to interpret their various id's and turn them into names
    type_table = _read_table('type.csv')
    class_table = _read_table('class.csv')
    family_table = _read_table('family.csv')
    membrane_table = _read_table('membrane.csv')

    # classifications links each family id to its superfamily, class
    # and type ids; superfamilies and species are further name tables
    classifications = _read_table('classification.csv')
    superfamily_table = _read_table('superfamily.csv')
    species_table = _read_table('species.csv')

    # Next we make a series of dictionaries to translate:
    # family id --> family name
    # family id --> family tcdb code
    # family id --> family pfam code
    family_names = _id_lookup(family_table, 'id', 'name')
    family_tcdbs = _id_lookup(family_table, 'id', 'tcdb')
    family_pfams = _id_lookup(family_table, 'id', 'pfam')

    # family id --> superfamily id
    # family id --> class id
    # family id --> type id
    family_to_superfam = _id_lookup(classifications, 'family_id', 'superfamily_id')
    family_to_class = _id_lookup(classifications, 'family_id', 'class_id')
    family_to_type = _id_lookup(classifications, 'family_id', 'type_id')

    # superfamily id --> superfamily name
    # superfamily id --> superfamily tcdb code
    # superfamily id --> superfamily pfam code
    superfamily_names = _id_lookup(superfamily_table, 'id', 'name')
    superfamily_tcdbs = _id_lookup(superfamily_table, 'id', 'tcdb')
    superfamily_pfams = _id_lookup(superfamily_table, 'id', 'pfam')

    # class id --> class name
    # type id --> type name
    # species id --> species name
    # membrane id --> membrane name
    # membrane id --> membrane abbreviation
    class_names = _id_lookup(class_table, 'id', 'name')
    type_names = _id_lookup(type_table, 'id', 'name')
    species_names = _id_lookup(species_table, 'id', 'name')
    membrane_names = _id_lookup(membrane_table, 'id', 'name')
    membrane_abbr = _id_lookup(membrane_table, 'id', 'abbreviation')

    # now we use the dicts from above to create new columns in the
    # proteins data frame containing the actual names of the
    # species, membrane, family, superfamily, etc.
    # Series.map is used (rather than Series.replace) so that an id
    # without a translation becomes NaN instead of staying a number.
    species_id = proteins['species_id']
    membrane_id = proteins['membrane_id']
    family_id = proteins['family_id']

    proteins['species'] = species_id.map(species_names)
    proteins['membrane'] = membrane_id.map(membrane_names)
    proteins['membrane_abbr'] = membrane_id.map(membrane_abbr)
    proteins['family'] = family_id.map(family_names)
    proteins['family_pfam'] = family_id.map(family_pfams)
    proteins['family_tcdbs'] = family_id.map(family_tcdbs)

    # two-step lookups (family id --> intermediate id --> value) are
    # composed into a single dict first, then applied in one pass
    proteins['superfamily'] = family_id.map(
        _chain_lookup(family_to_superfam, superfamily_names))
    proteins['superfamily_tcdb'] = family_id.map(
        _chain_lookup(family_to_superfam, superfamily_tcdbs))
    proteins['superfamily_pfam'] = family_id.map(
        _chain_lookup(family_to_superfam, superfamily_pfams))
    proteins['class'] = family_id.map(
        _chain_lookup(family_to_class, class_names))
    proteins['type'] = family_id.map(
        _chain_lookup(family_to_type, type_names))

    return proteins

In [2]:
# Build the combined proteins dataframe and keep it in the notebook's
# global scope; the last expression previews its first rows.
df = make_dataframe()
df.head()

Unnamed: 0,id,family_id,species_id,membrane_id,pdbid,name,resolution,topology,thickness,thicknesserror,...,membrane,membrane_abbr,family,family_pfam,family_tcdbs,superfamily,superfamily_tcdb,superfamily_pfam,class,type
0,1,35,9,3,1qjp,"Outer membrane protein A (OMPA), disordered loops",1.65,A in,25.4,1.5,...,Bacterial Gram-negative outer membrane,Bact. Gram-neg outer,OmpA family,PF01389,,"OmpA-OmpF porin family (n=8,S=10)",1.B.6,,Beta-barrel transmembrane,Transmembrane
1,2,390,9,3,1qj8,Outer membrane protein X (OMPX),1.9,A in,23.6,2.8,...,Bacterial Gram-negative outer membrane,Bact. Gram-neg outer,Enterobacterial Ail/Lom protein,PF06316,,"OmpA-OmpF porin family (n=8,S=10)",1.B.6,,Beta-barrel transmembrane,Transmembrane
2,3,363,24,3,1p4t,Outer membrane protein NspA,2.55,A in,24.9,2.4,...,Bacterial Gram-negative outer membrane,Bact. Gram-neg outer,Opacity porins,PF02462,,"Opacity porins (n=8,S=10)",,PF02462,Beta-barrel transmembrane,Transmembrane
3,1006,286,52,15,3lbw,"M2 proton channel of Influenza A, closed state",1.65,A out,31.1,1.7,...,Viral membrane,Viral,Influenza virus matrix protein 2,PF00599,1.A.19,Viral channel proteins,1.A.19,PF00599,Bitopic proteins,Transmembrane
4,1007,37,310,3,2x55,Plasminogen activator PLA (coagulase/fibrinoly...,1.85,A in,25.6,1.9,...,Bacterial Gram-negative outer membrane,Bact. Gram-neg outer,"OM protease omptin, OMPT",PF01278,,"Omptin (n=10,S=12)",1.B.1,PF01278,Beta-barrel transmembrane,Transmembrane


In [None]:
# Preview the first rows of the classification table.
# NOTE(review): `classifications` is only assigned at notebook (global)
# scope in later cells; inside make_dataframe it is a local variable.
# Confirm the intended cell execution order.
classifications.head()

#superfamilies.sort_values('number').head()

In [None]:
#superfamilies.sort_values('pfam').head()
#classes.head()

# Scratch check of the family --> superfamily --> name lookup:
# reload the three tables involved at notebook (global) scope.
proteins = pd.read_csv(get_path('protein.csv'), sep=';')
classifications = pd.read_csv(get_path('classification.csv'), sep=';')
superfamilies = pd.read_csv(get_path('superfamily.csv'), sep=';')
# family id --> superfamily id
family_to_superfam = dict(classifications[['family_id','superfamily_id']].values)
# superfamily id --> superfamily name
superfamily_names = dict(superfamilies[['id','name']].values)
#proteins['superfamily'] = proteins.family_id.replace(family_to_superfam).replace(superfamily_names)
#proteins.head()
# last expression of the cell, so the notebook displays the dict
superfamily_names
#superfamilies

In [None]:
# Preview the first rows of the species table.
# NOTE(review): `species` is only assigned at notebook (global) scope in
# the testing cell further down; confirm the intended cell order.
species.head()

In [None]:
# Preview the families table ordered by its 'pfam' column.
# NOTE(review): `families` is only assigned at notebook (global) scope in
# the testing cell further down; confirm the intended cell order.
families.sort_values('pfam').head()

In [None]:
# This code is here for testing and debugging purposes.
# It reloads every table at notebook (global) scope and derives the
# class and type of each protein by parsing the family number string.

types = pd.read_csv(get_path('type.csv'), sep=';')
classes = pd.read_csv(get_path('class.csv'), sep=';')
superfamilies = pd.read_csv(get_path('superfamily.csv'), sep=';')
families = pd.read_csv(get_path('family.csv'), sep=';')
species = pd.read_csv(get_path('species.csv'), sep=';')
membranes = pd.read_csv(get_path('membrane.csv'), sep=';')
proteins = pd.read_csv(get_path('protein.csv'), sep=';')

classifications = pd.read_csv(get_path('classification.csv'), sep=';')

# The family number is a string of period-separated codes,
# type.class.superfamily.family (ex: '1.4.28.03').  Its first four
# characters ('x.y.') are taken as the class number, and its first
# character as the type id; both are stored in new columns.
families['class_number']=families.number.str[:4]
families['type_id']=pd.to_numeric(families.number.str[:1])

# first we make dictionaries to translate:
# family id --> family name
# family id --> class number
# class number --> class name
# family id --> type id
# type id --> type name
family_names = dict(families[['id','name']].values)
family_class = dict(families[['id','class_number']].values)
class_names = dict(classes[['number','name']].values)
family_types = dict(families[['id','type_id']].values)
type_names = dict(types[['id','name']].values)

# Add the family, class and type names as new columns.  The family name
# goes into 'family' only: writing it to 'name' as well would overwrite
# the name column that the proteins table already has.
proteins['family'] = proteins.family_id.replace(family_names)
proteins['class'] = proteins.family_id.replace(family_class).replace(class_names)
proteins['type'] = proteins.family_id.replace(family_types).replace(type_names)

#alternately, use this to accomplish the same thing in one line:
#proteins['family_name'] = proteins.family_id.replace(families.set_index('id')['name'])
#proteins['class_name'] = proteins.family_id.replace(families.set_index('id')['class_number']).replace(classes.set_index('number')['name'])
# proteins.head()

In [None]:
# This code was part of some early testing to parse the family_id
# from the proteins dataframe, but it turned out that the classification of 
# type.class.superfamily.family was not correct.

# each row in families['number'] contains a string of numbers separated by periods
# however, there is an extra '.' at the end, which confuses str.split('.')
# so first we must delete the last character by slicing: str[:-1]
# then we split the string in each row into four separate columns, corresponding to:
# type.class.superfamily.family (ex: '1.4.28.03')
# we use these to construct a new dataframe, and finally append the id numbers also
# this allows us to determine the type, class, and superfamily of any protein
# from its family number.

#family_ids = pd.DataFrame([ x[:-1].split('.') for x in families['number'] ],
#                           columns=['type','class','superfamily','family'])
#family_ids = pd.concat([families['id'],family_ids],axis=1)

#family_ids.head()

#class_ids = pd.DataFrame([ x[:4] for x in families['number'] ],
#                           columns=['type','class','superfamily','family'])

#families.head()

In [None]:
    # The family_id is a four digit code, the first of which represents
    # the type_id (there are only 3 types in the database)
    #families['type_id']=pd.to_numeric(families.number.str[:1])

    # The first two digits of the family_id represent the class_id.
    # Note that the class_id is a two-digit code containing both the
    # type_id and a second number, and both of them are needed.
    # In this case, we can parse it from the family_id by taking the
    # first four characters: 'x.y.'
    #families['class_id']=families.number.str[:4]
    
    #family_classes = dict(families[['id','class_id']].values)
    #family_types = dict(families[['id','type_id']].values)
    
    #class_names = dict(classes[['number','name']].values)
    #type_names = dict(types[['id','name']].values)
    
    # The code above was not used, because I realized that I could
    # accomplish the same thing in two steps using the 
    # classifications table:
    # family_id --> class_id --> class_name
    # family_id --> type_id --> type_name